author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-11 10:55:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-11 10:55:49 -0700 |
commit | 237f83dfbe668443b5e31c3c7576125871cca674 (patch) | |
tree | 11848a8d0aa414a1d3ce2024e181071b1d9dea08 /net | |
parent | 8f6ccf6159aed1f04c6d179f61f6fb2691261e84 (diff) | |
parent | 1ff2f0fa450ea4e4f87793d9ed513098ec6e12be (diff) | |
download | linux-237f83dfbe668443b5e31c3c7576125871cca674.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Some highlights from this development cycle:
1) Big refactoring of ipv6 route and neigh handling to support
nexthop objects configurable as units from userspace. From David
Ahern.
2) Convert explored_states in BPF verifier into a hash table,
significantly decreasing the state held for programs with bpf2bpf
calls, from Alexei Starovoitov.
3) Implement bpf_send_signal() helper, from Yonghong Song.
4) Various classifier enhancements to mvpp2 driver, from Maxime
Chevallier.
5) Add aRFS support to hns3 driver, from Jian Shen.
6) Fix use after free in inet frags by allocating fqdirs dynamically
and reworking how rhashtable dismantle occurs, from Eric Dumazet.
7) Add act_ctinfo packet classifier action, from Kevin
Darbyshire-Bryant.
8) Add TFO key backup infrastructure, from Jason Baron.
9) Remove several old and unused ISDN drivers, from Arnd Bergmann.
10) Add devlink notifications for flash update status to mlxsw driver,
from Jiri Pirko.
11) Lots of kTLS offload infrastructure fixes, from Jakub Kicinski.
12) Add support for mv88e6250 DSA chips, from Rasmus Villemoes.
13) Various enhancements to ipv6 flow label handling, from Eric
Dumazet and Willem de Bruijn.
14) Support TLS offload in nfp driver, from Jakub Kicinski, Dirk van
der Merwe, and others.
15) Various improvements to axienet driver including converting it to
phylink, from Robert Hancock.
16) Add PTP support to sja1105 DSA driver, from Vladimir Oltean.
17) Add mqprio qdisc offload support to dpaa2-eth, from Ioana
Radulescu.
18) Add devlink health reporting to mlx5, from Moshe Shemesh.
19) Convert stmmac over to phylink, from Jose Abreu.
20) Add PTP PHC (Physical Hardware Clock) support to mlxsw, from
Shalom Toledo.
21) Add nftables SYNPROXY support, from Fernando Fernandez Mancera.
22) Convert tcp_fastopen over to use SipHash, from Ard Biesheuvel.
23) Track spill/fill of constants in BPF verifier, from Alexei
Starovoitov.
24) Support bounded loops in BPF, from Alexei Starovoitov.
25) Various page_pool API fixes and improvements, from Jesper Dangaard
Brouer.
26) Just like ipv4, support ref-countless ipv6 route handling. From
Wei Wang.
27) Support VLAN offloading in aquantia driver, from Igor Russkikh.
28) Add AF_XDP zero-copy support to mlx5, from Maxim Mikityanskiy.
29) Add flower GRE encap/decap support to nfp driver, from Pieter
Jansen van Vuuren.
30) Protect against stack overflow when using act_mirred, from John
Hurley.
31) Allow devmap map lookups from eBPF, from Toke Høiland-Jørgensen.
32) Use page_pool API in netsec driver, from Ilias Apalodimas.
33) Add Google gve network driver, from Catherine Sullivan.
34) More indirect call avoidance, from Paolo Abeni.
35) Add kTLS TX HW offload support to mlx5, from Tariq Toukan.
36) Add XDP_REDIRECT support to bnxt_en, from Andy Gospodarek.
37) Add MPLS manipulation actions to TC, from John Hurley.
38) Add sending a packet to connection tracking from TC actions, and
then allow flower classifier matching on conntrack state. From
Paul Blakey.
39) Netfilter hw offload support, from Pablo Neira Ayuso"
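One of the smaller highlights above, the bpf_send_signal() helper (item 3), lets a tracing BPF program deliver a signal to the current task. A minimal, hypothetical sketch of how such a program might look, assuming libbpf's bpf_helpers.h declares the helper; the section name and signal number are illustrative and not taken from this merge:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_execve")
int notify_on_execve(void *ctx)
{
        /* 10 == SIGUSR1 on x86-64; delivered to the task hitting the tracepoint */
        bpf_send_signal(10);
        return 0;
}

char _license[] SEC("license") = "GPL";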
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2080 commits)
net/mlx5e: Return in default case statement in tx_post_resync_params
mlx5: Return -EINVAL when WARN_ON_ONCE triggers in mlx5e_tls_resync().
net: dsa: add support for BRIDGE_MROUTER attribute
pkt_sched: Include const.h
net: netsec: remove static declaration for netsec_set_tx_de()
net: netsec: remove superfluous if statement
netfilter: nf_tables: add hardware offload support
net: flow_offload: rename tc_cls_flower_offload to flow_cls_offload
net: flow_offload: add flow_block_cb_is_busy() and use it
net: sched: remove tcf block API
drivers: net: use flow block API
net: sched: use flow block API
net: flow_offload: add flow_block_cb_{priv, incref, decref}()
net: flow_offload: add list handling functions
net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free()
net: flow_offload: rename TCF_BLOCK_BINDER_TYPE_* to FLOW_BLOCK_BINDER_TYPE_*
net: flow_offload: rename TC_BLOCK_{UN}BIND to FLOW_BLOCK_{UN}BIND
net: flow_offload: add flow_block_cb_setup_simple()
net: hisilicon: Add an tx_desc to adapt HI13X1_GMAC
net: hisilicon: Add an rx_desc to adapt HI13X1_GMAC
...
Diffstat (limited to 'net')
345 files changed, 16460 insertions, 5206 deletions
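Much of the 6lowpan and batman-adv churn in the diff below follows the tree-wide debugfs cleanup: the return values of debugfs_create_*() are no longer checked, so the init helpers can drop their error paths and return void. A minimal sketch of the resulting pattern, using a hypothetical foo_dev structure rather than code from this diff:

#include <linux/debugfs.h>

struct foo_dev {
        struct dentry *dbg_dir;
        u32 counter;
};

/* debugfs_create_*() handle and report errors internally, so callers simply
 * ignore the returned dentries and the init helper can be void.
 */
static void foo_debugfs_init(struct foo_dev *foo)
{
        foo->dbg_dir = debugfs_create_dir("foo", NULL);
        debugfs_create_u32("counter", 0444, foo->dbg_dir, &foo->counter);
}

static void foo_debugfs_exit(struct foo_dev *foo)
{
        debugfs_remove_recursive(foo->dbg_dir);
}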
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 53cf446ce2e3..01853cec0209 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -18,24 +18,16 @@ extern const struct ndisc_ops lowpan_ndisc_ops; int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); #ifdef CONFIG_6LOWPAN_DEBUGFS -int lowpan_dev_debugfs_init(struct net_device *dev); +void lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); -int __init lowpan_debugfs_init(void); +void __init lowpan_debugfs_init(void); void lowpan_debugfs_exit(void); #else -static inline int lowpan_dev_debugfs_init(struct net_device *dev) -{ - return 0; -} - +static inline void lowpan_dev_debugfs_init(struct net_device *dev) { } static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { } -static inline int __init lowpan_debugfs_init(void) -{ - return 0; -} - +static inline void __init lowpan_debugfs_init(void) { } static inline void lowpan_debugfs_exit(void) { } #endif /* CONFIG_6LOWPAN_DEBUGFS */ diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 2d68351f1ac4..a068757eabaf 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -42,9 +42,7 @@ int lowpan_register_netdevice(struct net_device *dev, if (ret < 0) return ret; - ret = lowpan_dev_debugfs_init(dev); - if (ret < 0) - unregister_netdevice(dev); + lowpan_dev_debugfs_init(dev); return ret; } @@ -152,9 +150,7 @@ static int __init lowpan_module_init(void) { int ret; - ret = lowpan_debugfs_init(); - if (ret < 0) - return ret; + lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index f5a8eec9d7a3..1c140af06d52 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -163,11 +163,11 @@ static const struct file_operations lowpan_ctx_pfx_fops = { .release = single_release, }; -static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, - struct dentry *ctx, u8 id) +static void lowpan_dev_debugfs_ctx_init(struct net_device *dev, + struct dentry *ctx, u8 id) { struct lowpan_dev *ldev = lowpan_dev(dev); - struct dentry *dentry, *root; + struct dentry *root; char buf[32]; WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); @@ -175,34 +175,18 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, sprintf(buf, "%d", id); root = debugfs_create_dir(buf, ctx); - if (!root) - return -EINVAL; - dentry = debugfs_create_file_unsafe("active", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_flag_active_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("active", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_flag_active_fops); - dentry = debugfs_create_file_unsafe("compression", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_flag_c_fops); - if (!dentry) - return -EINVAL; - - dentry = debugfs_create_file("prefix", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_pfx_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("compression", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_flag_c_fops); - dentry = debugfs_create_file_unsafe("prefix_len", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_plen_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("prefix", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_pfx_fops); - return 0; + debugfs_create_file("prefix_len", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_plen_fops); } static int lowpan_context_show(struct seq_file *file, void *offset) @@ -242,64 +226,39 @@ static int lowpan_short_addr_get(void 
*data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, NULL, "0x%04llx\n"); -static int lowpan_dev_debugfs_802154_init(const struct net_device *dev, +static void lowpan_dev_debugfs_802154_init(const struct net_device *dev, struct lowpan_dev *ldev) { - struct dentry *dentry, *root; + struct dentry *root; if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) - return 0; + return; root = debugfs_create_dir("ieee802154", ldev->iface_debugfs); - if (!root) - return -EINVAL; - - dentry = debugfs_create_file_unsafe("short_addr", 0444, root, - lowpan_802154_dev(dev)->wdev->ieee802154_ptr, - &lowpan_short_addr_fops); - if (!dentry) - return -EINVAL; - return 0; + debugfs_create_file("short_addr", 0444, root, + lowpan_802154_dev(dev)->wdev->ieee802154_ptr, + &lowpan_short_addr_fops); } -int lowpan_dev_debugfs_init(struct net_device *dev) +void lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_dev *ldev = lowpan_dev(dev); - struct dentry *contexts, *dentry; - int ret, i; + struct dentry *contexts; + int i; /* creating the root */ ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); - if (!ldev->iface_debugfs) - goto fail; contexts = debugfs_create_dir("contexts", ldev->iface_debugfs); - if (!contexts) - goto remove_root; - - dentry = debugfs_create_file("show", 0644, contexts, - &lowpan_dev(dev)->ctx, - &lowpan_context_fops); - if (!dentry) - goto remove_root; - - for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { - ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i); - if (ret < 0) - goto remove_root; - } - ret = lowpan_dev_debugfs_802154_init(dev, ldev); - if (ret < 0) - goto remove_root; + debugfs_create_file("show", 0644, contexts, &lowpan_dev(dev)->ctx, + &lowpan_context_fops); - return 0; + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + lowpan_dev_debugfs_ctx_init(dev, contexts, i); -remove_root: - lowpan_dev_debugfs_exit(dev); -fail: - return -EINVAL; + lowpan_dev_debugfs_802154_init(dev, ldev); } void lowpan_dev_debugfs_exit(struct net_device *dev) @@ -307,13 +266,9 @@ void lowpan_dev_debugfs_exit(struct net_device *dev) debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs); } -int __init lowpan_debugfs_init(void) +void __init lowpan_debugfs_init(void) { lowpan_debugfs = debugfs_create_dir("6lowpan", NULL); - if (!lowpan_debugfs) - return -EINVAL; - - return 0; } void lowpan_debugfs_exit(void) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a0b2d8b9def7..93eadf179123 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -580,6 +580,7 @@ static int vlan_dev_init(struct net_device *dev) dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; dev->hw_enc_features = vlan_tnl_features(real_dev); + dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; diff --git a/net/Kconfig b/net/Kconfig index d122f53c6fa2..57f51a279ad6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -67,8 +67,6 @@ source "net/xdp/Kconfig" config INET bool "TCP/IP networking" - select CRYPTO - select CRYPTO_AES ---help--- These are the protocols used on the Internet and on most local Ethernets. 
It is highly recommended to say Y here (this will enlarge diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index cb7d57d16c9d..37898da8ad48 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct seq_file; -struct sk_buff; - extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index bd4138ddf7e0..240ed70912d6 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2337,7 +2337,7 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, return ret; } -static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); @@ -2683,8 +2683,8 @@ unlock: static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { - .activate = batadv_iv_iface_activate, .enable = batadv_iv_ogm_iface_enable, + .enabled = batadv_iv_iface_enabled, .disable = batadv_iv_ogm_iface_disable, .update_mac = batadv_iv_ogm_iface_update_mac, .primary_set = batadv_iv_ogm_primary_iface_set, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 231b4aab4d8d..22672cb3e25d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -21,6 +21,7 @@ #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> @@ -41,8 +42,6 @@ #include "netlink.h" #include "originator.h" -struct sk_buff; - static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index bb3d40f73bfe..1a29505f4f66 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -9,8 +9,8 @@ #include "main.h" -struct sk_buff; -struct work_struct; +#include <linux/skbuff.h> +#include <linux/workqueue.h> int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 616bf2ea8755..2a50df7fc2bf 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - int batadv_v_ogm_init(struct batadv_priv *bat_priv); void batadv_v_ogm_free(struct batadv_priv *bat_priv); int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 012d72c8d064..02b24a861a85 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -10,14 +10,13 @@ #include "main.h" #include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> -struct net_device; -struct netlink_callback; -struct seq_file; -struct sk_buff; - /** * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop * detect frame sent by 
bridge loop avoidance diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index d38d70ccdd5a..38c4d8e51155 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -10,7 +10,6 @@ #include <asm/current.h> #include <linux/dcache.h> #include <linux/debugfs.h> -#include <linux/err.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fs.h> @@ -293,31 +292,13 @@ static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { void batadv_debugfs_init(void) { struct batadv_debuginfo **bat_debug; - struct dentry *file; batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL); - if (batadv_debugfs == ERR_PTR(-ENODEV)) - batadv_debugfs = NULL; - - if (!batadv_debugfs) - goto err; - - for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - batadv_debugfs, NULL, - &(*bat_debug)->fops); - if (!file) { - pr_err("Can't add general debugfs file: %s\n", - ((*bat_debug)->attr).name); - goto err; - } - } - return; -err: - debugfs_remove_recursive(batadv_debugfs); - batadv_debugfs = NULL; + for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + batadv_debugfs, NULL, &(*bat_debug)->fops); } /** @@ -333,42 +314,23 @@ void batadv_debugfs_destroy(void) * batadv_debugfs_add_hardif() - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. - * - * Return: 0 on success or negative error number in case of failure */ -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { struct net *net = dev_net(hard_iface->net_dev); struct batadv_debuginfo **bat_debug; - struct dentry *file; - - if (!batadv_debugfs) - goto out; if (net != &init_net) - return 0; + return; hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, batadv_debugfs); - if (!hard_iface->debug_dir) - goto out; - - for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - hard_iface->debug_dir, - hard_iface->net_dev, - &(*bat_debug)->fops); - if (!file) - goto rem_attr; - } - return 0; -rem_attr: - debugfs_remove_recursive(hard_iface->debug_dir); - hard_iface->debug_dir = NULL; -out: - return -ENOMEM; + for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + hard_iface->debug_dir, hard_iface->net_dev, + &(*bat_debug)->fops); } /** @@ -379,15 +341,12 @@ void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) { const char *name = hard_iface->net_dev->name; struct dentry *dir; - struct dentry *d; dir = hard_iface->debug_dir; if (!dir) return; - d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); - if (!d) - pr_err("Can't rename debugfs dir to %s\n", name); + debugfs_rename(dir->d_parent, dir, dir->d_parent, name); } /** @@ -419,44 +378,29 @@ int batadv_debugfs_add_meshif(struct net_device *dev) struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_debuginfo **bat_debug; struct net *net = dev_net(dev); - struct dentry *file; - - if (!batadv_debugfs) - goto out; if (net != &init_net) return 0; bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); - if (!bat_priv->debug_dir) - goto out; - if 
(batadv_socket_setup(bat_priv) < 0) - goto rem_attr; + batadv_socket_setup(bat_priv); if (batadv_debug_log_setup(bat_priv) < 0) goto rem_attr; - for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - bat_priv->debug_dir, - dev, &(*bat_debug)->fops); - if (!file) { - batadv_err(dev, "Can't add debugfs file: %s/%s\n", - dev->name, ((*bat_debug)->attr).name); - goto rem_attr; - } - } + for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + bat_priv->debug_dir, dev, + &(*bat_debug)->fops); - if (batadv_nc_init_debugfs(bat_priv) < 0) - goto rem_attr; + batadv_nc_init_debugfs(bat_priv); return 0; rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); bat_priv->debug_dir = NULL; -out: return -ENOMEM; } @@ -469,15 +413,12 @@ void batadv_debugfs_rename_meshif(struct net_device *dev) struct batadv_priv *bat_priv = netdev_priv(dev); const char *name = dev->name; struct dentry *dir; - struct dentry *d; dir = bat_priv->debug_dir; if (!dir) return; - d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); - if (!d) - pr_err("Can't rename debugfs dir to %s\n", name); + debugfs_rename(dir->d_parent, dir, dir->d_parent, name); } /** diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 7fac680cf740..1c5afd301ce9 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -9,8 +9,8 @@ #include "main.h" -struct file; -struct net_device; +#include <linux/fs.h> +#include <linux/netdevice.h> #define BATADV_DEBUGFS_SUBDIR "batman_adv" @@ -22,7 +22,7 @@ void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); void batadv_debugfs_rename_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); @@ -54,9 +54,8 @@ static inline void batadv_debugfs_del_meshif(struct net_device *dev) } static inline -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { - return 0; } static inline diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 110c27447d70..67c7729add55 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -11,15 +11,14 @@ #include <linux/compiler.h> #include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "originator.h" -struct netlink_callback; -struct seq_file; -struct sk_buff; - #ifdef CONFIG_BATMAN_ADV_DAT /* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index d6074ba2ada7..abfe8c6556de 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -11,11 +11,10 @@ #include <linux/compiler.h> #include <linux/list.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> -struct sk_buff; - void batadv_frag_purge_orig(struct batadv_orig_node *orig, bool (*check_cb)(struct 
batadv_frag_table_entry *)); bool batadv_frag_skb_fwd(struct sk_buff *skb, diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 0e14026feebd..0be8e7178ec7 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> - -struct batadv_tvlv_gateway_data; -struct netlink_callback; -struct seq_file; -struct sk_buff; +#include <uapi/linux/batadv_packet.h> void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index dac097f9be03..fc55750542e4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -11,6 +11,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/limits.h> #include <linux/math64.h> #include <linux/netdevice.h> #include <linux/stddef.h> diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 5cf50736c635..211b14b37db8 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/netdevice.h> #include <linux/types.h> -struct net_device; - /** * enum batadv_bandwidth_units - bandwidth unit types */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 79d1731b8306..c90e47342bb0 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -16,6 +16,7 @@ #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -795,6 +796,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(soft_iface); + if (bat_priv->algo_ops->iface.enabled) + bat_priv->algo_ops->iface.enabled(hard_iface); + out: return 0; @@ -920,9 +924,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; - ret = batadv_debugfs_add_hardif(hard_iface); - if (ret) - goto free_sysfs; + batadv_debugfs_add_hardif(hard_iface); INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); @@ -944,8 +946,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) return hard_iface; -free_sysfs: - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); free_if: kfree(hard_iface); release_dev: diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index c8ef6aa0e865..bbb8a6f18d6b 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -11,13 +11,12 @@ #include <linux/compiler.h> #include <linux/kref.h> +#include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> - -struct net_device; -struct net; +#include <net/net_namespace.h> /** * enum batadv_hard_if_state - State of a hard interface diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index ceef171f7f98..57877f0b78e0 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -12,13 +12,12 @@ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include 
<linux/types.h> -struct lock_class_key; - /* callback to a compare function. should compare 2 element datas for their * keys * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0a91c8661357..0a70b66e8770 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -314,25 +314,11 @@ static const struct file_operations batadv_fops = { /** * batadv_socket_setup() - Create debugfs "socket" file * @bat_priv: the bat priv with all the soft interface information - * - * Return: 0 on success or negative error number in case of failure */ -int batadv_socket_setup(struct batadv_priv *bat_priv) +void batadv_socket_setup(struct batadv_priv *bat_priv) { - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - - d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir, - bat_priv, &batadv_fops); - if (!d) - goto err; - - return 0; - -err: - return -ENOMEM; + debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir, + bat_priv, &batadv_fops); } /** diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 35eecbfd2e65..27fafff586df 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -10,12 +10,11 @@ #include "main.h" #include <linux/types.h> - -struct batadv_icmp_header; +#include <uapi/linux/batadv_packet.h> #define BATADV_ICMP_SOCKET "socket" -int batadv_socket_setup(struct batadv_priv *bat_priv); +void batadv_socket_setup(struct batadv_priv *bat_priv); #ifdef CONFIG_BATMAN_ADV_DEBUGFS diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index f79ebd5b46e9..11941cf1adcc 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -190,27 +190,16 @@ static const struct file_operations batadv_log_fops = { */ int batadv_debug_log_setup(struct batadv_priv *bat_priv) { - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); if (!bat_priv->debug_log) - goto err; + return -ENOMEM; spin_lock_init(&bat_priv->debug_log->lock); init_waitqueue_head(&bat_priv->debug_log->queue_wait); - d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv, - &batadv_log_fops); - if (!d) - goto err; - + debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv, + &batadv_log_fops); return 0; - -err: - return -ENOMEM; } /** diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 5504637e63d8..741cfa3719ff 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/printk.h> diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c59afcba31e0..3d4c04d87ff3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.2" +#define BATADV_SOURCE_VERSION "2019.3" #endif /* B.A.T.M.A.N. 
parameters */ @@ -205,20 +205,20 @@ enum batadv_uev_type { /* Kernel headers */ +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> +#include <linux/netdevice.h> #include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "types.h" - -struct net_device; -struct packet_type; -struct seq_file; -struct sk_buff; +#include "main.h" /** * batadv_print_vid() - return printable version of vid information diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index ec54e236e345..67d7f83009ae 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -20,6 +20,7 @@ #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> @@ -98,69 +99,307 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) } /** - * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4 - * @addr: the MAC address to check + * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from + * node for IPv4 + * @dev: the interface to check * - * Return: True, if MAC address is one reserved for IPv4 multicast, false - * otherwise. + * Checks the presence of an IPv4 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ -static bool batadv_mcast_addr_is_ipv4(const u8 *addr) +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) { - static const u8 prefix[] = {0x01, 0x00, 0x5E}; + struct in_device *in_dev = __in_dev_get_rcu(dev); - return memcmp(prefix, addr, sizeof(prefix)) == 0; + if (in_dev && IN_DEV_MFORWARD(in_dev)) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4; } /** - * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6 - * @addr: the MAC address to check + * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from + * node for IPv6 + * @dev: the interface to check * - * Return: True, if MAC address is one reserved for IPv6 multicast, false - * otherwise. + * Checks the presence of an IPv6 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. */ -static bool batadv_mcast_addr_is_ipv6(const u8 *addr) +#if IS_ENABLED(CONFIG_IPV6_MROUTE) +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { - static const u8 prefix[] = {0x33, 0x33}; + struct inet6_dev *in6_dev = __in6_dev_get(dev); - return memcmp(prefix, addr, sizeof(prefix)) == 0; + if (in6_dev && in6_dev->cnf.mc_forwarding) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR6; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +{ + return BATADV_MCAST_WANT_NO_RTR6; } +#endif /** - * batadv_mcast_mla_softif_get() - get softif multicast listeners + * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers on this + * node. 
+ * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; + u8 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); + + rcu_read_unlock(); + + return flags; +} + +/** + * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +#if IS_ENABLED(CONFIG_IPV6) +static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + struct net_device *dev = bat_priv->soft_iface; + struct br_ip_list *br_ip_entry, *tmp; + u8 flags = BATADV_MCAST_WANT_NO_RTR6; + int ret; + + if (!bridge) + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + /* TODO: ask the bridge if a multicast router is present (the bridge + * is capable of performing proper RFC4286 multicast multicast router + * discovery) instead of searching for a ff02::2 listener here + */ + ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); + if (ret < 0) + return BATADV_NO_FLAGS; + + list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { + /* the bridge snooping does not maintain IPv4 link-local + * addresses - therefore we won't find any IPv4 multicast router + * address here, only IPv6 ones + */ + if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && + ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6)) + flags &= ~BATADV_MCAST_WANT_NO_RTR6; + + list_del(&br_ip_entry->list); + kfree(br_ip_entry); + } + + return flags; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + if (bridge) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; +} +#endif + +/** + * batadv_mcast_mla_rtr_flags_get() - get multicast router flags + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers on this + * node or behind its bridge. 
+ * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); + flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); + + return flags; +} + +/** + * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information + * + * Return: A set of flags for the current/next TVLV, querier and + * bridge state. + */ +static struct batadv_mcast_mla_flags +batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) +{ + struct net_device *dev = bat_priv->soft_iface; + struct batadv_mcast_querier_state *qr4, *qr6; + struct batadv_mcast_mla_flags mla_flags; + struct net_device *bridge; + + bridge = batadv_mcast_get_bridge(dev); + + memset(&mla_flags, 0, sizeof(mla_flags)); + mla_flags.enabled = 1; + mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, + bridge); + + if (!bridge) + return mla_flags; + + dev_put(bridge); + + mla_flags.bridged = 1; + qr4 = &mla_flags.querier_ipv4; + qr6 = &mla_flags.querier_ipv6; + + if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) + pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); + + qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); + qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); + + qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); + qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); + + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; + + /* 1) If no querier exists at all, then multicast listeners on + * our local TT clients behind the bridge will keep silent. + * 2) If the selected querier is on one of our local TT clients, + * behind the bridge, then this querier might shadow multicast + * listeners on our local TT clients, behind this bridge. + * + * In both cases, we will signalize other batman nodes that + * we need all multicast traffic of the according protocol. + */ + if (!qr4->exists || qr4->shadowing) { + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; + } + + if (!qr6->exists || qr6->shadowing) { + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; + } + + return mla_flags; +} + +/** + * batadv_mcast_mla_is_duplicate() - check whether an address is in a list + * @mcast_addr: the multicast address to check + * @mcast_list: the list with multicast addresses to search in + * + * Return: true if the given address is already in the given list. + * Otherwise returns false. 
+ */ +static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, + struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + + hlist_for_each_entry(mcast_entry, mcast_list, list) + if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) + return true; + + return false; +} + +/** + * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * - * Collects multicast addresses of multicast listeners residing + * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one - * instead. Just like with IP addresses and routes, multicast listeners - * will(/should) register to the bridge interface instead of an - * enslaved bat0. - * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, - struct net_device *dev, - struct hlist_head *mcast_list) +static int +batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { - bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; - bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; - struct net_device *bridge = batadv_mcast_get_bridge(dev); - struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; + struct in_device *in_dev; + u8 mcast_addr[ETH_ALEN]; + struct ip_mc_list *pmc; int ret = 0; - netif_addr_lock_bh(bridge ? bridge : dev); - netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { - if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr)) + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) + return 0; + + rcu_read_lock(); + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); + return 0; + } + + for (pmc = rcu_dereference(in_dev->mc_list); pmc; + pmc = rcu_dereference(pmc->next_rcu)) { + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv4_is_local_multicast(pmc->multiaddr)) + continue; + + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(pmc->multiaddr)) continue; - if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr)) + ip_eth_mc_map(pmc->multiaddr, mcast_addr); + + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); @@ -169,36 +408,142 @@ static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, break; } - ether_addr_copy(new->addr, mc_list_entry->addr); + ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } - netif_addr_unlock_bh(bridge ? bridge : dev); + rcu_read_unlock(); - if (bridge) - dev_put(bridge); + return ret; +} + +/** + * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners + * @dev: the device to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state + * + * Collects multicast addresses of IPv6 multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. 
In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. + */ +#if IS_ENABLED(CONFIG_IPV6) +static int +batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) +{ + struct batadv_hw_addr *new; + struct inet6_dev *in6_dev; + u8 mcast_addr[ETH_ALEN]; + struct ifmcaddr6 *pmc6; + int ret = 0; + + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) + return 0; + + rcu_read_lock(); + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) { + rcu_read_unlock(); + return 0; + } + + read_lock_bh(&in6_dev->lock); + for (pmc6 = in6_dev->mc_list; pmc6; pmc6 = pmc6->next) { + if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) + continue; + + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + + ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); + + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) + continue; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + ret = -ENOMEM; + break; + } + + ether_addr_copy(new->addr, mcast_addr); + hlist_add_head(&new->list, mcast_list); + ret++; + } + read_unlock_bh(&in6_dev->lock); + rcu_read_unlock(); return ret; } +#else +static inline int +batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) +{ + return 0; +} +#endif /** - * batadv_mcast_mla_is_duplicate() - check whether an address is in a list - * @mcast_addr: the multicast address to check - * @mcast_list: the list with multicast addresses to search in + * batadv_mcast_mla_softif_get() - get softif multicast listeners + * @dev: the device to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * - * Return: true if the given address is already in the given list. - * Otherwise returns false. + * Collects multicast addresses of multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * If there is a bridge interface on top of dev, collects from that one + * instead. Just like with IP addresses and routes, multicast listeners + * will(/should) register to the bridge interface instead of an + * enslaved bat0. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. 
*/ -static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, - struct hlist_head *mcast_list) +static int +batadv_mcast_mla_softif_get(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { - struct batadv_hw_addr *mcast_entry; + struct net_device *bridge = batadv_mcast_get_bridge(dev); + int ret4, ret6 = 0; - hlist_for_each_entry(mcast_entry, mcast_list, list) - if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) - return true; + if (bridge) + dev = bridge; - return false; + ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags); + if (ret4 < 0) + goto out; + + ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags); + if (ret6 < 0) { + ret4 = 0; + goto out; + } + +out: + if (bridge) + dev_put(bridge); + + return ret4 + ret6; } /** @@ -227,9 +572,9 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners - * @bat_priv: the bat priv with all the soft interface information * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via @@ -239,14 +584,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, - struct net_device *dev, - struct hlist_head *mcast_list) +static int batadv_mcast_mla_bridge_get(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); - bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; - bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct br_ip_list *br_ip_entry, *tmp; + u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; @@ -259,11 +603,34 @@ static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { - if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP)) - continue; + if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { + if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) + continue; - if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6)) - continue; + if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + continue; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { + if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) + continue; + + if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6)) + continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + } +#endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) @@ -370,27 +737,6 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, } /** - * batadv_mcast_has_bridge() - check whether the soft-iface is bridged 
- * @bat_priv: the bat priv with all the soft interface information - * - * Checks whether there is a bridge on top of our soft interface. - * - * Return: true if there is a bridge, false otherwise. - */ -static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) -{ - struct net_device *upper = bat_priv->soft_iface; - - rcu_read_lock(); - do { - upper = netdev_master_upper_dev_get_rcu(upper); - } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); - rcu_read_unlock(); - - return upper; -} - -/** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the soft interface information @@ -424,7 +770,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, batadv_info(bat_priv->soft_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); - else if (!bat_priv->mcast.bridged && !new_state->exists) + else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->soft_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); @@ -446,9 +792,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the soft interface information - * @bridged: a flag about whether the soft interface is currently bridged or not - * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier - * @querier_ipv6: (maybe) new status of a potential, selected MLD querier + * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * @@ -461,126 +805,86 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, * multicast flags this node is going to set. */ static void -batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, - struct batadv_mcast_querier_state *querier_ipv4, - struct batadv_mcast_querier_state *querier_ipv6) +batadv_mcast_bridge_log(struct batadv_priv *bat_priv, + struct batadv_mcast_mla_flags *new_flags) { - if (!bat_priv->mcast.bridged && bridged) + struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; + + if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); - else if (bat_priv->mcast.bridged && !bridged) + else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); - if (bridged) { + if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", - &bat_priv->mcast.querier_ipv4, - querier_ipv4); + &old_flags->querier_ipv4, + &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", - &bat_priv->mcast.querier_ipv6, - querier_ipv6); + &old_flags->querier_ipv6, + &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_logs() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information - * @flags: flags indicating the new multicast state + * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast flags this nodes announces changes (@mcast_flags vs. - * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this nodes announces change this notifies + * userspace via the 'mcast' log level. 
*/ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { - u8 old_flags = bat_priv->mcast.flags; - char str_old_flags[] = "[...]"; + bool old_enabled = bat_priv->mcast.mla_flags.enabled; + u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; + char str_old_flags[] = "[.... . ]"; - sprintf(str_old_flags, "[%c%c%c]", + sprintf(str_old_flags, "[%c%c%c%s%s]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); batadv_dbg(BATADV_DBG_MCAST, bat_priv, - "Changing multicast flags from '%s' to '[%c%c%c]'\n", - bat_priv->mcast.enabled ? str_old_flags : "<undefined>", + "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n", + old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); } /** - * batadv_mcast_mla_tvlv_update() - update multicast tvlv + * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the soft interface information + * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. - * - * Return: false if we want all IPv4 && IPv6 multicast traffic and true - * otherwise. */ -static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) +static void +batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, + struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; - struct batadv_mcast_querier_state querier4 = {false, false}; - struct batadv_mcast_querier_state querier6 = {false, false}; - struct net_device *dev = bat_priv->soft_iface; - bool bridged; - - mcast_data.flags = BATADV_NO_FLAGS; - memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - - bridged = batadv_mcast_has_bridge(bat_priv); - if (!bridged) - goto update; - - if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) - pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); - - querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); - querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); - querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); - querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); - - mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; - - /* 1) If no querier exists at all, then multicast listeners on - * our local TT clients behind the bridge will keep silent. - * 2) If the selected querier is on one of our local TT clients, - * behind the bridge, then this querier might shadow multicast - * listeners on our local TT clients, behind this bridge. - * - * In both cases, we will signalize other batman nodes that - * we need all multicast traffic of the according protocol. 
- */ - if (!querier4.exists || querier4.shadowing) - mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4; - - if (!querier6.exists || querier6.shadowing) - mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; - -update: - batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6); - - bat_priv->mcast.querier_ipv4.exists = querier4.exists; - bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing; + if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) + return; - bat_priv->mcast.querier_ipv6.exists = querier6.exists; - bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing; + batadv_mcast_bridge_log(bat_priv, flags); + batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); - bat_priv->mcast.bridged = bridged; + mcast_data.flags = flags->tvlv_flags; + memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - if (!bat_priv->mcast.enabled || - mcast_data.flags != bat_priv->mcast.flags) { - batadv_mcast_flags_log(bat_priv, mcast_data.flags); - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, - &mcast_data, sizeof(mcast_data)); - bat_priv->mcast.flags = mcast_data.flags; - bat_priv->mcast.enabled = true; - } + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, + &mcast_data, sizeof(mcast_data)); - return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 && - mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV6); + bat_priv->mcast.mla_flags = *flags; } /** @@ -599,22 +903,24 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *soft_iface = bat_priv->soft_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; + struct batadv_mcast_mla_flags flags; int ret; - if (!batadv_mcast_mla_tvlv_update(bat_priv)) - goto update; + flags = batadv_mcast_mla_flags_get(bat_priv); - ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list); + ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; - ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list); + ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; -update: + spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); + batadv_mcast_mla_flags_update(bat_priv, &flags); + spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); @@ -639,10 +945,7 @@ static void batadv_mcast_mla_update(struct work_struct *work) priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); - spin_lock(&bat_priv->mcast.mla_lock); __batadv_mcast_mla_update(bat_priv); - spin_unlock(&bat_priv->mcast.mla_lock); - batadv_mcast_start_timer(bat_priv); } @@ -677,6 +980,7 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. 
@@ -686,7 +990,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct iphdr *iphdr; @@ -699,16 +1004,13 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, iphdr = ip_hdr(skb); - /* TODO: Implement Multicast Router Discovery (RFC4286), - * then allow scope > link local, too - */ - if (!ipv4_is_local_multicast(iphdr->daddr)) - return -EINVAL; - /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ - *is_unsnoopable = true; + if (ipv4_is_local_multicast(iphdr->daddr)) + *is_unsnoopable = true; + else + *is_routable = ETH_P_IP; return 0; } @@ -743,6 +1045,7 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. @@ -751,7 +1054,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct ipv6hdr *ip6hdr; @@ -764,10 +1068,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, ip6hdr = ipv6_hdr(skb); - /* TODO: Implement Multicast Router Discovery (RFC4286), - * then allow scope > link local, too - */ - if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL) + if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are @@ -775,6 +1076,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; + else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) + *is_routable = ETH_P_IPV6; return 0; } @@ -784,6 +1087,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. 
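A small userspace sketch of the IPv4 classification introduced above, assuming the usual 224.0.0.0/24 definition of link-local multicast: link-local destinations are still marked unsnoopable, while any other multicast destination is now only flagged as routable instead of being rejected.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* same test the kernel's ipv4_is_local_multicast() performs: 224.0.0.0/24 */
static bool ipv4_is_local_mcast(uint32_t daddr_be)
{
	return (daddr_be & htonl(0xffffff00)) == htonl(0xe0000000);
}

int main(void)
{
	const char *samples[] = { "224.0.0.251", "239.1.2.3" };

	for (int i = 0; i < 2; i++) {
		struct in_addr a;

		inet_pton(AF_INET, samples[i], &a);
		printf("%-12s -> %s\n", samples[i],
		       ipv4_is_local_mcast(a.s_addr) ?
		       "unsnoopable (link-local)" : "routable");
	}
	return 0;
}

For IPv6 the same split is done on the multicast scope, as the next hunks show: scopes below link-local are still rejected, ff02::1 stays unsnoopable, and any scope above link-local marks the frame as routable.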
@@ -792,7 +1096,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); @@ -802,13 +1107,15 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, - is_unsnoopable); + is_unsnoopable, + is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, - is_unsnoopable); + is_unsnoopable, + is_routable); default: return -EINVAL; } @@ -839,6 +1146,29 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, } /** + * batadv_mcast_forw_rtr_count() - count nodes with a multicast router + * @bat_priv: the bat priv with all the soft interface information + * @protocol: the ethernet protocol type to count multicast routers for + * + * Return: the number of nodes which want all routable IPv4 multicast traffic + * if the protocol is ETH_P_IP or the number of nodes which want all routable + * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. + */ + +static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, + int protocol) +{ + switch (protocol) { + case ETH_P_IP: + return atomic_read(&bat_priv->mcast.num_want_all_rtr4); + case ETH_P_IPV6: + return atomic_read(&bat_priv->mcast.num_want_all_rtr6); + default: + return 0; + } +} + +/** * batadv_mcast_forw_tt_node_get() - get a multicast tt node * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination @@ -960,6 +1290,84 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) } /** + * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag + * @bat_priv: the bat priv with all the soft interface information + * + * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and + * increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_rtr4_list, + mcast_want_all_rtr4_node) { + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag + * @bat_priv: the bat priv with all the soft interface information + * + * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset + * and increases its refcount. 
+ */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_rtr6_list, + mcast_want_all_rtr6_node) { + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: an ethernet header to determine the protocol family from + * + * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or + * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and + * increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr) +{ + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_rtr4_node_get(bat_priv); + case ETH_P_IPV6: + return batadv_mcast_forw_rtr6_node_get(bat_priv); + default: + /* we shouldn't be here... */ + return NULL; + } +} + +/** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to check @@ -977,8 +1385,11 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, bool is_unsnoopable = false; unsigned int mcast_fanout; struct ethhdr *ethhdr; + int is_routable = 0; + int rtr_count = 0; - ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); + ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, + &is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) @@ -991,8 +1402,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); + rtr_count = batadv_mcast_forw_rtr_count(bat_priv, is_routable); - total_count = tt_count + ip_count + unsnoop_count; + total_count = tt_count + ip_count + unsnoop_count + rtr_count; switch (total_count) { case 1: @@ -1002,6 +1414,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); else if (unsnoop_count) *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); + else if (rtr_count) + *orig = batadv_mcast_forw_rtr_node_get(bat_priv, + ethhdr); if (*orig) return BATADV_FORW_SINGLE; @@ -1173,6 +1588,111 @@ batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, } /** + * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. 
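To show how the new router count feeds into the forwarding decision, here is a simplified userspace sketch of the counting logic in batadv_mcast_forw_mode(). The fanout handling for the multi-destination case is not visible in this hunk and is an assumption of this sketch.

#include <stdio.h>

enum forw_mode { FORW_NONE, FORW_SINGLE, FORW_SOME, FORW_ALL };

/* tt/ip/unsnoop are the pre-existing counters, rtr is the new count of
 * nodes that still want routable multicast of this frame's family */
static enum forw_mode forw_mode(int tt, int ip, int unsnoop, int rtr,
				int fanout)
{
	int total = tt + ip + unsnoop + rtr;

	if (total == 0)
		return FORW_NONE;	/* nobody interested */
	if (total == 1)
		return FORW_SINGLE;	/* one unicast copy to the single node */
	if (!unsnoop && total <= fanout)
		return FORW_SOME;	/* a few unicast copies (assumed) */
	return FORW_ALL;		/* fall back to classic flooding */
}

int main(void)
{
	/* e.g. one TT entry plus one multicast router -> more than one copy */
	printf("%d\n", forw_mode(1, 0, 0, 1, 16));	/* prints 2 (FORW_SOME) */
	return 0;
}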
+ */ +static int +batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_rtr4_list, + mcast_want_all_rtr4_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 + * @bat_priv: the bat priv with all the soft interface information + * @skb: The multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_rtr6_list, + mcast_want_all_rtr6_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A + * transmission is performed via a batman-adv unicast packet for each such + * destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. + */ +static int +batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + switch (ntohs(eth_hdr(skb)->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); + case ETH_P_IPV6: + return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); + default: + /* we shouldn't be here... 
*/ + return NET_XMIT_DROP; + } +} + +/** * batadv_mcast_forw_send() - send packet to any detected multicast recpient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit @@ -1205,6 +1725,12 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, return ret; } + ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + consume_skb(skb); return ret; } @@ -1345,6 +1871,127 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } /** + * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. + */ +static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr4_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. 
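The double negation around BATADV_MCAST_WANT_NO_RTR4/6 is easy to misread, so here is a hedged userspace sketch of the toggle handling in batadv_mcast_want_rtr4_update() above. Plain variables stand in for the kernel counter, list and locks, and the bit position is assumed.

#include <stdbool.h>

#define WANT_NO_RTR4	(1 << 3)	/* assumed bit position */

struct orig { unsigned char flags; bool on_rtr4_list; };

static int num_want_all_rtr4;

static void want_rtr4_update(struct orig *o, unsigned char new_flags)
{
	bool had_no_rtr4 = o->flags & WANT_NO_RTR4;
	bool gets_no_rtr4 = new_flags & WANT_NO_RTR4;

	if (had_no_rtr4 && !gets_no_rtr4) {
		/* flag cleared: the node wants routable IPv4 multicast,
		 * so count it and put it on the want_all_rtr4 list */
		num_want_all_rtr4++;
		o->on_rtr4_list = true;
	} else if (!had_no_rtr4 && gets_no_rtr4) {
		/* flag set: the node opts out of routable IPv4 multicast */
		num_want_all_rtr4--;
		o->on_rtr4_list = false;
	}

	o->flags = new_flags;
}

This inversion is presumably also why batadv_orig_node_new() in the originator.c hunk further down now starts new originators with both WANT_NO_RTR flags set: a node that does want routable traffic then reliably triggers the set-to-unset transition on its first OGM.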
+ */ +static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr6_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV + * @enabled: whether the originator has multicast TVLV support enabled + * @tvlv_value: tvlv buffer containing the multicast flags + * @tvlv_value_len: tvlv buffer length + * + * Return: multicast flags for the given tvlv buffer + */ +static u8 +batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) +{ + u8 mcast_flags = BATADV_NO_FLAGS; + + if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) + mcast_flags = *(u8 *)tvlv_value; + + if (!enabled) { + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; + } + + /* remove redundant flags to avoid sending duplicate packets later */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; + + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; + + return mcast_flags; +} + +/** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm @@ -1359,16 +2006,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - u8 mcast_flags = BATADV_NO_FLAGS; + u8 mcast_flags; - if (orig_mcast_enabled && tvlv_value && - tvlv_value_len >= sizeof(mcast_flags)) - mcast_flags = *(u8 *)tvlv_value; - - if (!orig_mcast_enabled) { - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; - } + mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, + tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); @@ -1385,6 +2026,8 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); @@ -1417,15 +2060,16 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) static void batadv_mcast_flags_print_header(struct batadv_priv 
*bat_priv, struct seq_file *seq) { - u8 flags = bat_priv->mcast.flags; + struct batadv_mcast_mla_flags *mla_flags = &bat_priv->mcast.mla_flags; char querier4, querier6, shadowing4, shadowing6; - bool bridged = bat_priv->mcast.bridged; + bool bridged = mla_flags->bridged; + u8 flags = mla_flags->tvlv_flags; if (bridged) { - querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4'; - querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6'; - shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.'; - shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.'; + querier4 = mla_flags->querier_ipv4.exists ? '.' : '4'; + querier6 = mla_flags->querier_ipv6.exists ? '.' : '6'; + shadowing4 = mla_flags->querier_ipv4.shadowing ? '4' : '.'; + shadowing6 = mla_flags->querier_ipv6.shadowing ? '6' : '.'; } else { querier4 = '?'; querier6 = '?'; @@ -1433,10 +2077,12 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, shadowing6 = '?'; } - seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", + seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", querier4, querier6); @@ -1490,13 +2136,17 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) flags = orig_node->mcast_flags; - seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, + seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig, (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) - ? '6' : '.'); + ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) + ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) + ? "R6" : ". 
"); } rcu_read_unlock(); } @@ -1517,19 +2167,19 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { - u32 flags = bat_priv->mcast.flags; + u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; - if (bat_priv->mcast.bridged) { + if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; - if (bat_priv->mcast.querier_ipv4.exists) + if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; - if (bat_priv->mcast.querier_ipv6.exists) + if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; - if (bat_priv->mcast.querier_ipv4.shadowing) + if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; - if (bat_priv->mcast.querier_ipv6.shadowing) + if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } @@ -1770,6 +2420,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS); spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 653b9b76fabe..5d9e2bb29c97 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -9,9 +9,9 @@ #include "main.h" -struct netlink_callback; -struct seq_file; -struct sk_buff; +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> /** * enum batadv_forw_mode - the way a packet should be forwarded as diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a67720fad46c..6f08fd122a8d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -21,6 +21,7 @@ #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> @@ -30,6 +31,7 @@ #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> +#include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> @@ -49,8 +51,6 @@ #include "tp_meter.h" #include "translation-table.h" -struct net; - struct genl_family batadv_netlink_family; /* multicast groups */ diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index d1e0681b8743..ddc674e47dbb 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -9,11 +9,10 @@ #include "main.h" +#include <linux/netlink.h> #include <linux/types.h> #include <net/genetlink.h> -struct nlmsghdr; - void batadv_netlink_register(void); void batadv_netlink_unregister(void); int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c5e7906045f3..580609389f0f 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1951,34 +1951,19 @@ out: /** * batadv_nc_init_debugfs() - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information - * - * Return: 0 on success or negative error number 
in case of failure */ -int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { - struct dentry *nc_dir, *file; + struct dentry *nc_dir; nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir); - if (!nc_dir) - goto out; - file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq); - if (!file) - goto out; + debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq); - file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir, - &bat_priv->nc.max_fwd_delay); - if (!file) - goto out; + debugfs_create_u32("max_fwd_delay", 0644, nc_dir, + &bat_priv->nc.max_fwd_delay); - file = debugfs_create_u32("max_buffer_time", 0644, nc_dir, - &bat_priv->nc.max_buffer_time); - if (!file) - goto out; - - return 0; - -out: - return -ENOMEM; + debugfs_create_u32("max_buffer_time", 0644, nc_dir, + &bat_priv->nc.max_buffer_time); } #endif diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 74f56113a5d0..753fa49723cf 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> - -struct batadv_ogm_packet; -struct net_device; -struct seq_file; -struct sk_buff; +#include <uapi/linux/batadv_packet.h> #ifdef CONFIG_BATMAN_ADV_NC @@ -40,7 +39,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb); int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset); -int batadv_nc_init_debugfs(struct batadv_priv *bat_priv); +void batadv_nc_init_debugfs(struct batadv_priv *bat_priv); #else /* ifdef CONFIG_BATMAN_ADV_NC */ @@ -111,9 +110,8 @@ static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq, return 0; } -static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +static inline void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { - return 0; } #endif /* ifdef CONFIG_BATMAN_ADV_NC */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 45db798a7297..38613487fb1b 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -27,6 +27,7 @@ #include <linux/stddef.h> #include <linux/workqueue.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -1043,7 +1044,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, orig_node->bcast_seqno_reset = reset_time; #ifdef CONFIG_BATMAN_ADV_MCAST - orig_node->mcast_flags = BATADV_NO_FLAGS; + orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; + orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3829e26f9c5d..512a1f99dd75 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -12,12 +12,11 @@ #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct seq_file; -struct sk_buff; - bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv 
*bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index b96c6d06d188..c20feac95107 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 5921ee4e107c..5fc0fd1e5d08 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -10,12 +10,11 @@ #include "main.h" #include <linux/compiler.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> -struct sk_buff; - void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, bool dropped); struct batadv_forw_packet * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a7677e1d000f..c7a2e77ca1da 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -24,6 +24,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/percpu.h> #include <linux/printk.h> #include <linux/random.h> @@ -803,11 +804,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST - bat_priv->mcast.querier_ipv4.exists = false; - bat_priv->mcast.querier_ipv4.shadowing = false; - bat_priv->mcast.querier_ipv6.exists = false; - bat_priv->mcast.querier_ipv6.shadowing = false; - bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 275442a7acb6..29139ad769fe 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -9,13 +9,12 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/types.h> +#include <net/net_namespace.h> #include <net/rtnetlink.h> -struct net_device; -struct net; -struct sk_buff; - int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 80fc3253c336..1efcb97039cd 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -18,6 +18,7 @@ #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 83fa808b1871..5e466093dfa5 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/kobject.h> +#include <linux/netdevice.h> #include <linux/sysfs.h> #include <linux/types.h> -struct kobject; -struct net_device; - #define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" #define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" /** diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 820392146249..dd6a9a40dbb9 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/kthread.h> +#include <linux/limits.h> 
#include <linux/list.h> #include <linux/netdevice.h> #include <linux/param.h> diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 604b3799c972..78d310da0ad3 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - void batadv_tp_meter_init(void); void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 1ddfd5e011ee..8a482c5ec67b 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3813,6 +3813,8 @@ static void batadv_tt_purge(struct work_struct *work) */ void batadv_tt_free(struct batadv_priv *bat_priv) { + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1); + batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1); diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index c8c48d62a430..4a98860d7f0e 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -9,13 +9,12 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct net_device; -struct seq_file; -struct sk_buff; - int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 114ac01e06af..36985000a0a8 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -10,8 +10,7 @@ #include "main.h" #include <linux/types.h> - -struct batadv_ogm_packet; +#include <uapi/linux/batadv_packet.h> void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 74b644738a36..6ae139d74e0f 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,20 +14,22 @@ #include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> +#include <linux/timer.h> #include <linux/types.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> -struct seq_file; - #ifdef CONFIG_BATMAN_ADV_DAT /** @@ -402,6 +404,17 @@ struct batadv_orig_node { * list */ struct hlist_node mcast_want_all_ipv6_node; + + /** + * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4 + * list + */ + struct hlist_node mcast_want_all_rtr4_node; + /** + * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6 + * list + */ + struct hlist_node mcast_want_all_rtr6_node; #endif /** @capabilities: announced capabilities of this originator */ @@ -1169,6 +1182,26 @@ struct batadv_mcast_querier_state { }; /** + * struct batadv_mcast_mla_flags - flags for the querier, bridge and tvlv state + */ +struct batadv_mcast_mla_flags { + /** @querier_ipv4: the current state of an IGMP querier in the mesh */ + struct batadv_mcast_querier_state querier_ipv4; + + /** @querier_ipv6: the 
current state of an MLD querier in the mesh */ + struct batadv_mcast_querier_state querier_ipv6; + + /** @enabled: whether the multicast tvlv is currently enabled */ + unsigned char enabled:1; + + /** @bridged: whether the soft interface has a bridge on top */ + unsigned char bridged:1; + + /** @tvlv_flags: the flags we have last sent in our mcast tvlv */ + u8 tvlv_flags; +}; + +/** * struct batadv_priv_mcast - per mesh interface mcast data */ struct batadv_priv_mcast { @@ -1196,20 +1229,22 @@ struct batadv_priv_mcast { */ struct hlist_head want_all_ipv6_list; - /** @querier_ipv4: the current state of an IGMP querier in the mesh */ - struct batadv_mcast_querier_state querier_ipv4; - - /** @querier_ipv6: the current state of an MLD querier in the mesh */ - struct batadv_mcast_querier_state querier_ipv6; - - /** @flags: the flags we have last sent in our mcast tvlv */ - u8 flags; + /** + * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4 + * multicast traffic + */ + struct hlist_head want_all_rtr4_list; - /** @enabled: whether the multicast tvlv is currently enabled */ - unsigned char enabled:1; + /** + * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6 + * multicast traffic + */ + struct hlist_head want_all_rtr6_list; - /** @bridged: whether the soft interface has a bridge on top */ - unsigned char bridged:1; + /** + * @mla_flags: flags for the querier, bridge and tvlv state + */ + struct batadv_mcast_mla_flags mla_flags; /** * @mla_lock: a lock protecting mla_list and mla_flags @@ -1228,6 +1263,12 @@ struct batadv_priv_mcast { /** @num_want_all_ipv6: counter for items in want_all_ipv6_list */ atomic_t num_want_all_ipv6; + /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */ + atomic_t num_want_all_rtr4; + + /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */ + atomic_t num_want_all_rtr6; + /** * @want_lists_lock: lock for protecting modifications to mcasts * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked) @@ -2129,6 +2170,9 @@ struct batadv_algo_iface_ops { /** @enable: init routing info when hard-interface is enabled */ int (*enable)(struct batadv_hard_iface *hard_iface); + /** @enabled: notification when hard-interface was enabled (optional) */ + void (*enabled)(struct batadv_hard_iface *hard_iface); + /** @disable: de-init routing info when hard-interface is disabled */ void (*disable)(struct batadv_hard_iface *hard_iface); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1555b0c6f7ec..9d41de1ec90f 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -164,26 +164,21 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; + struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); - /* If we have multiple 6lowpan peers, then check where we should - * send the packet. If only one peer exists, then we can send the - * packet right away. - */ - if (count == 1) { - rcu_read_lock(); - peer = list_first_or_null_rcu(&dev->peers, struct lowpan_peer, - list); - rcu_read_unlock(); - return peer; - } - if (!rt) { - nexthop = &lowpan_cb(skb)->gw; - - if (ipv6_addr_any(nexthop)) - return NULL; + if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { + /* There is neither route nor gateway, + * probably the destination is a direct peer. 
+ */ + nexthop = daddr; + } else { + /* There is a known gateway + */ + nexthop = &lowpan_cb(skb)->gw; + } } else { nexthop = rt6_nexthop(rt, daddr); @@ -209,6 +204,20 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, } } + /* use the neighbour cache for matching addresses assigned by SLAAC + */ + neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); + if (neigh) { + list_for_each_entry_rcu(peer, &dev->peers, list) { + if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { + neigh_release(neigh); + rcu_read_unlock(); + return peer; + } + } + neigh_release(neigh); + } + rcu_read_unlock(); return NULL; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 15d1cb5aee18..ad5b0ac1f9ce 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -520,6 +520,9 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; + /* Set Default Authenticated payload timeout to 30s */ + conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; + if (conn->role == HCI_ROLE_MASTER) conn->out = true; @@ -912,7 +915,7 @@ static void hci_req_directed_advertising(struct hci_request *req, sizeof(cp), &cp); } - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, 0x00); } else { struct hci_cp_le_set_adv_param cp; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b81bf53c5ac4..b9585e7d9d2e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2827,7 +2827,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, memset(adv_instance->scan_rsp_data, 0, sizeof(adv_instance->scan_rsp_data)); } else { - if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES || + if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > HCI_MAX_ADV_INSTANCES) return -EOVERFLOW; @@ -3195,11 +3195,13 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; + hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; + hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 51f5b1efc3a5..bb67f4a5479a 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -941,6 +941,35 @@ static int adv_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, adv_max_interval_set, "%llu\n"); +static int auth_payload_timeout_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val < 0x0001 || val > 0xffff) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->auth_payload_timeout = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int auth_payload_timeout_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->auth_payload_timeout; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, + auth_payload_timeout_get, + auth_payload_timeout_set, "%llu\n"); + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); 
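A quick arithmetic note on the 30 s default used in the hci_conn.c and hci_core.c hunks above, assuming the usual HCI encoding of the authenticated payload timeout in 10 ms units; the definition of DEFAULT_AUTH_PAYLOAD_TIMEOUT is not part of this diff and the value below is an assumption.

#include <stdio.h>

#define DEFAULT_AUTH_PAYLOAD_TIMEOUT	0x0BB8	/* assumed value, 10 ms units */

int main(void)
{
	/* 0x0BB8 = 3000 slots of 10 ms -> 30000 ms -> 30 seconds */
	printf("%d ms\n", DEFAULT_AUTH_PAYLOAD_TIMEOUT * 10);
	return 0;
}

The debugfs setter added above accepts the same raw range of 0x0001 to 0xffff, i.e. anything from 10 ms up to roughly 655 seconds.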
DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -994,6 +1023,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &adv_max_interval_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, + &auth_payload_timeout_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 9e4fcf406d9c..cdb00c2ef242 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -579,6 +579,51 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } +static void hci_cc_read_auth_payload_timeout(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_auth_payload_to *rp = (void *)skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->auth_payload_timeout = __le16_to_cpu(rp->timeout); + + hci_dev_unlock(hdev); +} + +static void hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_write_auth_payload_to *rp = (void *)skb->data; + struct hci_conn *conn; + void *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO); + if (!sent) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->auth_payload_timeout = get_unaligned_le16(sent + 2); + + hci_dev_unlock(hdev); +} + static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb) { @@ -2975,6 +3020,25 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } + /* Set the default Authenticated Payload Timeout after + * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B + * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be + * sent when the link is active and Encryption is enabled, the conn + * type can be either LE or ACL and controller must support LMP Ping. + * Ensure for AES-CCM encryption as well. 
+ */ + if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + test_bit(HCI_CONN_AES_CCM, &conn->flags) && + ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) || + (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) { + struct hci_cp_write_auth_payload_to cp; + + cp.handle = cpu_to_le16(conn->handle); + cp.timeout = cpu_to_le16(hdev->auth_payload_timeout); + hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, + sizeof(cp), &cp); + } + notify: if (conn->state == BT_CONFIG) { if (!ev->status) @@ -3170,6 +3234,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_sc_support(hdev, skb); break; + case HCI_OP_READ_AUTH_PAYLOAD_TO: + hci_cc_read_auth_payload_timeout(hdev, skb); + break; + + case HCI_OP_WRITE_AUTH_PAYLOAD_TO: + hci_cc_write_auth_payload_timeout(hdev, skb); + break; + case HCI_OP_READ_LOCAL_VERSION: hci_cc_read_local_version(hdev, skb); break; @@ -5588,6 +5660,11 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (min < hcon->le_conn_min_interval || + max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e9a95ed65491..621f1a97d803 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1601,7 +1601,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; - cp.handle = 0; + cp.handle = instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; @@ -1643,11 +1643,21 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) return 0; } -void __hci_req_enable_ext_advertising(struct hci_request *req) +int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) { + struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *adv_set; u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + struct adv_info *adv_instance; + + if (instance > 0) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -EINVAL; + } else { + adv_instance = NULL; + } cp = (void *) data; adv_set = (void *) cp->data; @@ -1659,11 +1669,23 @@ void __hci_req_enable_ext_advertising(struct hci_request *req) memset(adv_set, 0, sizeof(*adv_set)); - adv_set->handle = 0; + adv_set->handle = instance; + + /* Set duration per instance since controller is responsible for + * scheduling it. 
+ */ + if (adv_instance && adv_instance->duration) { + u16 duration = adv_instance->duration * MSEC_PER_SEC; + + /* Time = N * 10 ms */ + adv_set->duration = cpu_to_le16(duration / 10); + } hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets, data); + + return 0; } int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) @@ -1679,7 +1701,7 @@ int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) return err; __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, instance); return 0; } @@ -1723,10 +1745,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, adv_instance->remaining_time = adv_instance->remaining_time - timeout; - hdev->adv_instance_timeout = timeout; - queue_delayed_work(hdev->req_workqueue, + /* Only use work for scheduling instances with legacy advertising */ + if (!ext_adv_capable(hdev)) { + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, msecs_to_jiffies(timeout * 1000)); + } /* If we're just re-scheduling the same instance again then do not * execute any HCI commands. This happens when a single instance is @@ -2744,7 +2769,8 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) if (!ext_adv_capable(hdev)) __hci_req_enable_advertising(req); else if (!err) - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, + 0x00); } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 55b2050cc9ff..a7019fbeadd3 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -83,7 +83,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); -void __hci_req_enable_ext_advertising(struct hci_request *req); +int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance); void __hci_req_clear_ext_adv_sets(struct hci_request *req); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index a442e21f3894..5abd423b55fa 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->version = req->version; hid->country = req->country; - strncpy(hid->name, req->name, sizeof(hid->name)); + strscpy(hid->name, req->name, sizeof(hid->name)); snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 2151913892ce..03be6a4baef3 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -192,6 +192,7 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne ca.version = ca32.version; ca.flags = ca32.flags; ca.idle_to = ca32.idle_to; + ca32.name[sizeof(ca32.name) - 1] = '\0'; memcpy(ca.name, ca32.name, 128); csock = sockfd_lookup(ca.ctrl_sock, &err); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 5406d7cd46ad..cc506fe99b4d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -168,11 +168,18 @@ static struct l2cap_chan 
*l2cap_get_chan_by_ident(struct l2cap_conn *conn, return c; } -static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src) +static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src, + u8 src_type) { struct l2cap_chan *c; list_for_each_entry(c, &chan_list, global_l) { + if (src_type == BDADDR_BREDR && c->src_type != BDADDR_BREDR) + continue; + + if (src_type != BDADDR_BREDR && c->src_type == BDADDR_BREDR) + continue; + if (c->sport == psm && !bacmp(&c->src, src)) return c; } @@ -185,7 +192,7 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) write_lock(&chan_list_lock); - if (psm && __l2cap_global_chan_by_addr(psm, src)) { + if (psm && __l2cap_global_chan_by_addr(psm, src, chan->src_type)) { err = -EADDRINUSE; goto done; } @@ -209,7 +216,8 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) err = -EINVAL; for (p = start; p <= end; p += incr) - if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) { + if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src, + chan->src_type)) { chan->psm = cpu_to_le16(p); chan->sport = cpu_to_le16(p); err = 0; @@ -4394,6 +4402,12 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, l2cap_chan_lock(chan); + if (chan->state != BT_DISCONN) { + l2cap_chan_unlock(chan); + mutex_unlock(&conn->chan_lock); + return 0; + } + l2cap_chan_hold(chan); l2cap_chan_del(chan, 0); @@ -5291,7 +5305,14 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (min < hcon->le_conn_min_interval || + max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index e68c715f8d37..6c2b4e6e87ba 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2579,6 +2579,19 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, goto distribute; } + /* Drop IRK if peer is using identity address during pairing but is + * providing different address as identity information. + * + * Microsoft Surface Precision Mouse is known to have this bug. 
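A hedged userspace sketch of the sanity check this smp.c hunk adds; the types and helper names below are stand-ins, not the kernel API. The idea: an IRK distribution is only accepted if the identity address it carries matches the address the pairing actually ran against.

#include <stdbool.h>
#include <string.h>

typedef struct { unsigned char b[6]; } bdaddr_t;	/* stand-in type */

static bool irk_acceptable(bool paired_with_identity_addr,
			   const bdaddr_t *pairing_addr, unsigned char pairing_type,
			   const bdaddr_t *dist_addr, unsigned char dist_type)
{
	if (!paired_with_identity_addr)
		return true;	/* peer paired with an RPA; nothing to compare */

	/* peer paired with its identity address: the distributed identity
	 * must be exactly that address and address type */
	return !memcmp(pairing_addr, dist_addr, sizeof(*dist_addr)) &&
	       pairing_type == dist_type;
}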
+ */ + if (hci_is_identity_address(&hcon->dst, hcon->dst_type) && + (bacmp(&info->bdaddr, &hcon->dst) || + info->addr_type != hcon->dst_type)) { + bt_dev_err(hcon->hdev, + "ignoring IRK with invalid identity address"); + goto distribute; + } + bacpy(&smp->id_addr, &info->bdaddr); smp->id_addr_type = info->addr_type; diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 61ce8454a88e..77396a098fbe 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -55,7 +55,7 @@ static void loop(void) int main(void) { - debug_fd = open("/dev/console", 00000002); + debug_fd = open("/dev/kmsg", 00000002); dprintf(debug_fd, "Started bpfilter\n"); loop(); close(debug_fd); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c05def8fd9cd..681b72862c16 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -52,6 +52,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; + BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 21b74e7a7b2f..09b1dd8cd853 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -74,7 +74,6 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; - const unsigned char *dest; struct net_bridge *br; u16 vid = 0; @@ -92,10 +91,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); local_rcv = !!(br->dev->flags & IFF_PROMISC); - dest = eth_hdr(skb)->h_dest; - if (is_multicast_ether_addr(dest)) { + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ - if (is_broadcast_ether_addr(dest)) { + if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { @@ -145,7 +143,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } break; case BR_PKT_UNICAST: - dst = br_fdb_find_rcu(br, dest, vid); + dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); default: break; } @@ -234,7 +232,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: - ret = nf_queue(skb, &state, e, i, verdict); + ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index de22c8fbbb15..3d8deac2353d 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -911,6 +911,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int type; int err = 0; __be32 group; + u16 nsrcs; ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); @@ -924,8 +925,9 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, grec = (void *)(skb->data + len - sizeof(*grec)); group = grec->grec_mca; type = grec->grec_type; + nsrcs = ntohs(grec->grec_nsrcs); - len += ntohs(grec->grec_nsrcs) * 4; + len += nsrcs * 4; if (!ip_mc_may_pull(skb, len)) return -EINVAL; @@ -946,7 +948,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, src = eth_hdr(skb)->h_source; if ((type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE) && - ntohs(grec->grec_nsrcs) == 0) { + nsrcs == 0) { br_ip4_multicast_leave_group(br, port, group, 
vid, src); } else { err = br_ip4_multicast_add_group(br, port, group, vid, @@ -983,7 +985,8 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { - __be16 *nsrcs, _nsrcs; + __be16 *_nsrcs, __nsrcs; + u16 nsrcs; nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); @@ -991,12 +994,13 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, nsrcs_offset + sizeof(_nsrcs)) return -EINVAL; - nsrcs = skb_header_pointer(skb, nsrcs_offset, - sizeof(_nsrcs), &_nsrcs); - if (!nsrcs) + _nsrcs = skb_header_pointer(skb, nsrcs_offset, + sizeof(__nsrcs), &__nsrcs); + if (!_nsrcs) return -EINVAL; - grec_len = struct_size(grec, grec_src, ntohs(*nsrcs)); + nsrcs = ntohs(*_nsrcs); + grec_len = struct_size(grec, grec_src, nsrcs); if (!ipv6_mc_may_pull(skb, len + grec_len)) return -EINVAL; @@ -1021,7 +1025,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && - ntohs(*nsrcs) == 0) { + nsrcs == 0) { br_ip6_multicast_leave_group(br, port, &grec->grec_mca, vid, src); } else { @@ -1275,7 +1279,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, u16 vid) { unsigned int transport_len = ipv6_transport_len(skb); - const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; struct mld2_query *mld2q; @@ -1319,7 +1322,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); - saddr.u.ip6 = ip6h->saddr; + saddr.u.ip6 = ipv6_hdr(skb)->saddr; br_multicast_query_received(br, port, &br->ip6_other_query, &saddr, max_delay); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 34fa72c72ad8..d3f9592f4ff8 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -47,25 +47,22 @@ static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; -}; #ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly; -static int brnf_filter_pppoe_tagged __read_mostly; -static int brnf_pass_vlan_indev __read_mostly; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 + struct ctl_table_header *ctl_hdr; #endif + /* default value is 1 */ + int call_iptables; + int call_ip6tables; + int call_arptables; + + /* default value is 0 */ + int filter_vlan_tagged; + int filter_pppoe_tagged; + int pass_vlan_indev; +}; + #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) @@ -85,17 +82,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb) return 0; } -#define IS_VLAN_IP(skb) \ - (vlan_proto(skb) == htons(ETH_P_IP) && \ - brnf_filter_vlan_tagged) +static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; +} -#define IS_VLAN_IPV6(skb) \ - (vlan_proto(skb) == htons(ETH_P_IPV6) && \ - brnf_filter_vlan_tagged) +static inline bool is_vlan_ipv6(const struct sk_buff *skb, + const 
struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); -#define IS_VLAN_ARP(skb) \ - (vlan_proto(skb) == htons(ETH_P_ARP) && \ - brnf_filter_vlan_tagged) + return vlan_proto(skb) == htons(ETH_P_IPV6) && + brnet->filter_vlan_tagged; +} + +static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; +} static inline __be16 pppoe_proto(const struct sk_buff *skb) { @@ -103,15 +111,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) sizeof(struct pppoe_hdr))); } -#define IS_PPPOE_IP(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IP) && \ - brnf_filter_pppoe_tagged) +static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return skb->protocol == htons(ETH_P_PPP_SES) && + pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; +} + +static inline bool is_pppoe_ipv6(const struct sk_buff *skb, + const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); -#define IS_PPPOE_IPV6(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IPV6) && \ - brnf_filter_pppoe_tagged) + return skb->protocol == htons(ETH_P_PPP_SES) && + pppoe_proto(skb) == htons(PPP_IPV6) && + brnet->filter_pppoe_tagged; +} /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) @@ -408,12 +424,16 @@ bridged_dnat: return 0; } -static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) +static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, + const struct net_device *dev, + const struct net *net) { struct net_device *vlan, *br; + struct brnf_net *brnet = net_generic(net, brnf_net_id); br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + + if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, @@ -423,7 +443,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct } /* Some common code for IPv4/IPv6 */ -struct net_device *setup_pre_routing(struct sk_buff *skb) +struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); @@ -434,7 +454,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; - skb->dev = brnf_get_logical_dev(skb, skb->dev); + skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) nf_bridge->orig_proto = BRNF_PROTO_8021Q; @@ -460,6 +480,7 @@ static unsigned int br_nf_pre_routing(void *priv, struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); + struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP; @@ -469,8 +490,10 @@ static unsigned int br_nf_pre_routing(void *priv, return NF_DROP; br = p->br; - if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && + brnet = net_generic(state->net, brnf_net_id); + if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) { + if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; @@ -478,10 
+501,11 @@ static unsigned int br_nf_pre_routing(void *priv, return br_nf_pre_routing_ipv6(priv, skb, state); } - if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) + if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; - if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) + if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && + !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); @@ -491,7 +515,7 @@ static unsigned int br_nf_pre_routing(void *priv, if (!nf_bridge_alloc(skb)) return NF_DROP; - if (!setup_pre_routing(skb)) + if (!setup_pre_routing(skb, state->net)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); @@ -514,7 +538,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; - if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { + if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) { if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; @@ -569,9 +593,11 @@ static unsigned int br_nf_forward_ip(void *priv, if (!parent) return NF_DROP; - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + if (IS_IP(skb) || is_vlan_ip(skb, state->net) || + is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; @@ -602,7 +628,7 @@ static unsigned int br_nf_forward_ip(void *priv, skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, - brnf_get_logical_dev(skb, state->in), + brnf_get_logical_dev(skb, state->in, state->net), parent, br_nf_forward_finish); return NF_STOLEN; @@ -615,23 +641,25 @@ static unsigned int br_nf_forward_arp(void *priv, struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device **)(skb->cb); + struct brnf_net *brnet; p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; - if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) + brnet = net_generic(state->net, brnf_net_id); + if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (!IS_ARP(skb)) { - if (!IS_VLAN_ARP(skb)) + if (!is_vlan_arp(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header(skb); } if (arp_hdr(skb)->ar_pln != 4) { - if (IS_VLAN_ARP(skb)) + if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); return NF_ACCEPT; } @@ -791,9 +819,11 @@ static unsigned int br_nf_post_routing(void *priv, if (!realoutdev) return NF_DROP; - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + if (IS_IP(skb) || is_vlan_ip(skb, state->net) || + is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; @@ -946,23 +976,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event, return NOTIFY_OK; } -static void __net_exit brnf_exit_net(struct net *net) -{ - struct brnf_net *brnet = net_generic(net, brnf_net_id); - - if (!brnet->enabled) - return; - - nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); - brnet->enabled = false; -} - -static struct pernet_operations brnf_net_ops __read_mostly = { - .exit = brnf_exit_net, - .id 
= &brnf_net_id, - .size = sizeof(struct brnf_net), -}; - static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; @@ -1021,49 +1034,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", - .data = &brnf_call_arptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", - .data = &brnf_call_iptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", - .data = &brnf_call_ip6tables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", - .data = &brnf_filter_vlan_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", - .data = &brnf_filter_pppoe_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", - .data = &brnf_pass_vlan_indev, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { } }; + +static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) +{ + brnf->call_iptables = 1; + brnf->call_ip6tables = 1; + brnf->call_arptables = 1; + brnf->filter_vlan_tagged = 0; + brnf->filter_pppoe_tagged = 0; + brnf->pass_vlan_indev = 0; +} + +static int br_netfilter_sysctl_init_net(struct net *net) +{ + struct ctl_table *table = brnf_table; + struct brnf_net *brnet; + + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL); + if (!table) + return -ENOMEM; + } + + brnet = net_generic(net, brnf_net_id); + table[0].data = &brnet->call_arptables; + table[1].data = &brnet->call_iptables; + table[2].data = &brnet->call_ip6tables; + table[3].data = &brnet->filter_vlan_tagged; + table[4].data = &brnet->filter_pppoe_tagged; + table[5].data = &brnet->pass_vlan_indev; + + br_netfilter_sysctl_default(brnet); + + brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table); + if (!brnet->ctl_hdr) { + if (!net_eq(net, &init_net)) + kfree(table); + + return -ENOMEM; + } + + return 0; +} + +static void br_netfilter_sysctl_exit_net(struct net *net, + struct brnf_net *brnet) +{ + struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; + + unregister_net_sysctl_table(brnet->ctl_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static int __net_init brnf_init_net(struct net *net) +{ + return br_netfilter_sysctl_init_net(net); +} +#endif + +static void __net_exit brnf_exit_net(struct net *net) +{ + struct brnf_net *brnet; + + brnet = net_generic(net, brnf_net_id); + if (brnet->enabled) { + nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + brnet->enabled = false; + } + +#ifdef CONFIG_SYSCTL + br_netfilter_sysctl_exit_net(net, brnet); #endif +} + +static struct pernet_operations brnf_net_ops __read_mostly = { +#ifdef CONFIG_SYSCTL + .init = brnf_init_net, +#endif + .exit = brnf_exit_net, + .id = &brnf_net_id, + .size = sizeof(struct brnf_net), +}; static int __init br_netfilter_init(void) { @@ -1079,16 +1167,6 @@ static int __init br_netfilter_init(void) return ret; } -#ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); - if (brnf_sysctl_header == NULL) { - printk(KERN_WARNING - "br_netfilter: can't register to sysctl.\n"); - 
unregister_netdevice_notifier(&brnf_notifier); - unregister_pernet_subsys(&brnf_net_ops); - return -ENOMEM; - } -#endif RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; @@ -1099,9 +1177,6 @@ static void __exit br_netfilter_fini(void) RCU_INIT_POINTER(nf_br_ops, NULL); unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(brnf_sysctl_header); -#endif } module_init(br_netfilter_init); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 0e63e5dc5ac4..e4e0c836c3f5 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,7 +224,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, nf_bridge = nf_bridge_alloc(skb); if (!nf_bridge) return NF_DROP; - if (!setup_pre_routing(skb)) + if (!setup_pre_routing(skb, state->net)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 159a0e2cb0f6..e8cf03b43b7d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -421,6 +421,7 @@ struct net_bridge { struct br_input_skb_cb { struct net_device *brdev; + u16 frag_max_size; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u8 igmp; u8 mrouters_only:1; diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 68a6922b4141..7796dd9d42d7 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -143,7 +143,6 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev) { - const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p; struct net_bridge *br; const unsigned char *buf; @@ -172,7 +171,7 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, if (p->state == BR_STATE_DISABLED) goto out; - if (!ether_addr_equal(dest, br->group_addr)) + if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr)) goto out; if (p->flags & BR_BPDU_GUARD) { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index f47f526b4f19..021cc9f66804 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -797,6 +797,16 @@ bool br_vlan_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_vlan_enabled); +int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) +{ + struct net_bridge *br = netdev_priv(dev); + + *p_proto = ntohs(br->vlan_proto); + + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_proto); + int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; @@ -1227,13 +1237,11 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, } } -int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +static int __br_vlan_get_pvid(const struct net_device *dev, + struct net_bridge_port *p, u16 *p_pvid) { struct net_bridge_vlan_group *vg; - struct net_bridge_port *p; - ASSERT_RTNL(); - p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) @@ -1244,8 +1252,21 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) *p_pvid = br_get_pvid(vg); return 0; } + +int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +{ + ASSERT_RTNL(); + + return __br_vlan_get_pvid(dev, br_port_get_check_rtnl(dev), p_pvid); +} EXPORT_SYMBOL_GPL(br_vlan_get_pvid); +int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) +{ + return __br_vlan_get_pvid(dev, br_port_get_check_rcu(dev), p_pvid); +} 
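The br_vlan_get_pvid_rcu() and br_vlan_get_proto() exports above give callers that only hold rcu_read_lock() (for example the bridge meta expression added later in this merge) a way to read the PVID and the configured VLAN protocol without RTNL. A hypothetical caller might look like the sketch below, where port_dev is a bridge port and br_dev its bridge master; both helpers return 0 on success:

        u16 pvid, proto;

        rcu_read_lock();
        if (!br_vlan_get_pvid_rcu(port_dev, &pvid) &&
            !br_vlan_get_proto(br_dev, &proto))
                pr_debug("pvid %u, vlan proto 0x%04x\n", pvid, proto);
        rcu_read_unlock();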
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); + int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 36a98d36d339..154fa558bb90 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -9,6 +9,12 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE + +config NFT_BRIDGE_META + tristate "Netfilter nf_table bridge meta support" + help + Add support for bridge dedicated meta key. + config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 @@ -19,6 +25,20 @@ config NF_LOG_BRIDGE tristate "Bridge packet logging" select NF_LOG_COMMON +config NF_CONNTRACK_BRIDGE + tristate "IPv4/IPV6 bridge connection tracking support" + depends on NF_CONNTRACK + default n + help + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. This is used to enhance packet filtering via + stateful policies. Enable this if you want native tracking from + the bridge. This provides a replacement for the `br_netfilter' + infrastructure. + + To compile it as a module, choose M here. If unsure, say N. + endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 9b868861f21a..8e2c5759d964 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,8 +3,12 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # +obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o +# connection tracking +obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o + # packet logging obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index eeae23a73c6a..ed91ea31978a 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ebt_nat_info *info = par->targinfo; struct net_device *dev; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 53ef08e6765f..0cad62a4052b 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_redirect_info *info = par->targinfo; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; if (xt_hooknum(par) != NF_BR_BROUTING) diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 700d338d5ddb..27443bf229a3 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN * 2)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c new file mode 100644 index 000000000000..4f5444d2a526 --- 
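The ebt_dnat/ebt_redirect/ebt_snat hunks above replace skb_make_writable(), which returns true on success, with skb_ensure_writable(), which returns 0 on success, and they now request only the bytes that are actually rewritten (ETH_ALEN for the destination MAC, ETH_ALEN * 2 so the source MAC at offset 6 is covered for snat). The inverted error test is the easy thing to get wrong; a short sketch of the new convention, mirroring the snat case:

        /* skb_ensure_writable() returns 0 once the first @len bytes are
         * private and writable, so a non-zero return means failure.
         */
        if (skb_ensure_writable(skb, ETH_ALEN * 2))
                return EBT_DROP;

        ether_addr_copy(eth_hdr(skb)->h_source, info->mac);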
/dev/null +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -0,0 +1,435 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/route.h> +#include <net/ip.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_bridge.h> + +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/netfilter/nf_tables.h> + +#include "../br_private.h" + +/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff + * has been linearized or cloned. + */ +static int nf_br_ip_fragment(struct net *net, struct sock *sk, + struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + unsigned int hlen, ll_rs, mtu; + struct ip_frag_state state; + struct iphdr *iph; + int err; + + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + iph = ip_hdr(skb); + + /* + * Setup starting values + */ + + hlen = iph->ihl * 4; + frag_max_size -= hlen; + ll_rs = LL_RESERVED_SPACE(skb->dev); + mtu = skb->dev->mtu; + + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip_fraglist_iter iter; + struct sk_buff *frag; + + if (first_len - hlen > mtu || + skb_headroom(skb) < ll_rs) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + if (frag->len > mtu || + skb_headroom(frag) < hlen + ll_rs) + goto blackhole; + + if (skb_shared(frag)) + goto slow_path; + } + + ip_fraglist_init(skb, iph, hlen, &iter); + + for (;;) { + if (iter.frag) + ip_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip_fraglist_next(&iter); + } + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} + +/* ip_defrag() expects IPCB() in place. 
*/ +static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb, + size_t inet_skb_parm_size) +{ + memcpy(cb, skb->cb, sizeof(*cb)); + memset(skb->cb, 0, inet_skb_parm_size); +} + +static void br_skb_cb_restore(struct sk_buff *skb, + const struct br_input_skb_cb *cb, + u16 fragsz) +{ + memcpy(skb->cb, cb, sizeof(*cb)); + BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz; +} + +static unsigned int nf_ct_br_defrag4(struct sk_buff *skb, + const struct nf_hook_state *state) +{ + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; + enum ip_conntrack_info ctinfo; + struct br_input_skb_cb cb; + const struct nf_conn *ct; + int err; + + if (!ip_is_fragment(ip_hdr(skb))) + return NF_ACCEPT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + + br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + local_bh_enable(); + if (!err) { + br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size); + skb->ignore_df = 1; + return NF_ACCEPT; + } + + return NF_STOLEN; +} + +static unsigned int nf_ct_br_defrag6(struct sk_buff *skb, + const struct nf_hook_state *state) +{ + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; + enum ip_conntrack_info ctinfo; + struct br_input_skb_cb cb; + const struct nf_conn *ct; + int err; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + + br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm)); + + err = nf_ipv6_br_defrag(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + /* queued */ + if (err == -EINPROGRESS) + return NF_STOLEN; + + br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size); + return err == 0 ? NF_ACCEPT : NF_DROP; +} + +static int nf_ct_br_ip_check(const struct sk_buff *skb) +{ + const struct iphdr *iph; + int nhoff, len; + + nhoff = skb_network_offset(skb); + iph = ip_hdr(skb); + if (iph->ihl < 5 || + iph->version != 4) + return -1; + + len = ntohs(iph->tot_len); + if (skb->len < nhoff + len || + len < (iph->ihl * 4)) + return -1; + + return 0; +} + +static int nf_ct_br_ipv6_check(const struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + int nhoff, len; + + nhoff = skb_network_offset(skb); + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return -1; + + len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff; + if (skb->len < len) + return -1; + + return 0; +} + +static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_hook_state bridge_state = *state; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 len; + int ret; + + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || + ctinfo == IP_CT_UNTRACKED) + return NF_ACCEPT; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return NF_ACCEPT; + + len = ntohs(ip_hdr(skb)->tot_len); + if (pskb_trim_rcsum(skb, len)) + return NF_ACCEPT; + + if (nf_ct_br_ip_check(skb)) + return NF_ACCEPT; + + bridge_state.pf = NFPROTO_IPV4; + ret = nf_ct_br_defrag4(skb, &bridge_state); + break; + case htons(ETH_P_IPV6): + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return NF_ACCEPT; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + if (pskb_trim_rcsum(skb, len)) + return NF_ACCEPT; + + if (nf_ct_br_ipv6_check(skb)) + return NF_ACCEPT; + + bridge_state.pf = NFPROTO_IPV6; + ret = nf_ct_br_defrag6(skb, &bridge_state); + break; + default: + 
nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + return NF_ACCEPT; + } + + if (ret != NF_ACCEPT) + return ret; + + return nf_conntrack_in(skb, &bridge_state); +} + +static void nf_ct_bridge_frag_save(struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data) +{ + if (skb_vlan_tag_present(skb)) { + data->vlan_present = true; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_present = false; + } + skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); +} + +static unsigned int +nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + struct nf_ct_bridge_frag_data data; + + if (!BR_INPUT_SKB_CB(skb)->frag_max_size) + return NF_ACCEPT; + + nf_ct_bridge_frag_save(skb, &data); + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_br_ip_fragment(state->net, state->sk, skb, &data, output); + break; + case htons(ETH_P_IPV6): + nf_br_ip6_fragment(state->net, state->sk, skb, &data, output); + break; + default: + WARN_ON_ONCE(1); + return NF_DROP; + } + + return NF_STOLEN; +} + +/* Actually only slow path refragmentation needs this. */ +static int nf_ct_bridge_frag_restore(struct sk_buff *skb, + const struct nf_ct_bridge_frag_data *data) +{ + int err; + + err = skb_cow_head(skb, ETH_HLEN); + if (err) { + kfree_skb(skb); + return -ENOMEM; + } + if (data->vlan_present) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); + else if (skb_vlan_tag_present(skb)) + __vlan_hwaccel_clear_tag(skb); + + skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); + skb_reset_mac_header(skb); + + return 0; +} + +static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *skb) +{ + int err; + + err = nf_ct_bridge_frag_restore(skb, data); + if (err < 0) + return err; + + return br_dev_queue_push_xmit(net, sk, skb); +} + +static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int protoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (skb->protocol) { + case htons(ETH_P_IP): + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): { + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + } + break; + default: + return NF_ACCEPT; + } + return nf_confirm(skb, protoff, ct, ctinfo); +} + +static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + int ret; + + ret = nf_ct_bridge_confirm(skb); + if (ret != NF_ACCEPT) + return ret; + + return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post); +} + +static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = { + { + .hook = nf_ct_bridge_pre, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = nf_ct_bridge_post, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +static struct nf_ct_bridge_info bridge_info = { + .ops = nf_ct_bridge_hook_ops, + .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops), + .me = THIS_MODULE, +}; + +static int __init 
nf_conntrack_l3proto_bridge_init(void) +{ + nf_ct_bridge_register(&bridge_info); + + return 0; +} + +static void __exit nf_conntrack_l3proto_bridge_fini(void) +{ + nf_ct_bridge_unregister(&bridge_info); +} + +module_init(nf_conntrack_l3proto_bridge_init); +module_exit(nf_conntrack_l3proto_bridge_fini); + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE)); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c new file mode 100644 index 000000000000..bed66f536b34 --- /dev/null +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> +#include <linux/if_bridge.h> + +static const struct net_device * +nft_meta_get_bridge(const struct net_device *dev) +{ + if (dev && netif_is_bridge_port(dev)) + return netdev_master_upper_dev_get_rcu((struct net_device *)dev); + + return NULL; +} + +static void nft_meta_bridge_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + u32 *dest = ®s->data[priv->dreg]; + const struct net_device *br_dev; + + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + break; + case NFT_META_BRI_OIFNAME: + br_dev = nft_meta_get_bridge(out); + if (!br_dev) + goto err; + break; + case NFT_META_BRI_IIFPVID: { + u16 p_pvid; + + br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_pvid_rcu(in, &p_pvid); + nft_reg_store16(dest, p_pvid); + return; + } + case NFT_META_BRI_IIFVPROTO: { + u16 p_proto; + + br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_proto(br_dev, &p_proto); + nft_reg_store16(dest, p_proto); + return; + } + default: + goto out; + } + + strncpy((char *)dest, br_dev->name, IFNAMSIZ); + return; +out: + return nft_meta_get_eval(expr, regs, pkt); +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + len = IFNAMSIZ; + break; + case NFT_META_BRI_IIFPVID: + case NFT_META_BRI_IIFVPROTO: + len = sizeof(u16); + break; + default: + return nft_meta_get_init(ctx, expr, tb); + } + + priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static struct nft_expr_type nft_meta_bridge_type; +static const struct nft_expr_ops nft_meta_bridge_get_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_bridge_get_eval, + .init = nft_meta_bridge_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_bridge_set_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .destroy = nft_meta_set_destroy, + .dump = 
nft_meta_set_dump, + .validate = nft_meta_set_validate, +}; + +static const struct nft_expr_ops * +nft_meta_bridge_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_bridge_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_bridge_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "meta", + .select_ops = nft_meta_bridge_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_bridge_module_init(void) +{ + return nft_register_expr(&nft_meta_bridge_type); +} + +static void __exit nft_meta_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_meta_bridge_type); +} + +module_init(nft_meta_bridge_module_init); +module_exit(nft_meta_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d1c4e1f3be2c..94c7f77ecb6b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) unsigned int i; u32 nbuckets; u64 cost; + int ret; smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); if (!smap) @@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); nbuckets = 1U << smap->bucket_log; + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + ret = bpf_map_charge_init(&smap->map.memory, cost); + if (ret < 0) { + kfree(smap); + return ERR_PTR(ret); + } + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); @@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; return &smap->map; } diff --git a/net/core/dev.c b/net/core/dev.c index d6edd218babd..fc676b2610e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) else name = netdev_name(dev); } - WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " - "gso_type=%d ip_summed=%d\n", + skb_dump(KERN_WARNING, skb, false); + WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, - skb->sk ? &skb->sk->sk_route_caps : &null_features, - skb->len, skb->data_len, skb_shinfo(skb)->gso_size, - skb_shinfo(skb)->gso_type, skb->ip_summed); + skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* @@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? 
dev->name : "<unknown>"); - if (dev) - pr_err("dev features: %pNF\n", &dev->features); - pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", - skb->len, skb->data_len, skb->pkt_type, - skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, - skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, - skb->csum_complete_sw, skb->csum_valid, skb->csum_level); + skb_dump(KERN_ERR, skb, true); dump_stack(); } } @@ -4689,9 +4681,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, __skb_push(skb, skb->mac_len); skb_do_redirect(skb); return NULL; - case TC_ACT_REINSERT: - /* this does not scrub the packet, and updates stats on error */ - skb_tc_reinsert(skb, &cl_res); + case TC_ACT_CONSUMED: return NULL; default: break; diff --git a/net/core/devlink.c b/net/core/devlink.c index 132f4b757963..4f40aeace902 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -17,6 +17,7 @@ #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/refcount.h> +#include <linux/workqueue.h> #include <rdma/ib_verbs.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -514,14 +515,31 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number)) + if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) { + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_pf.pf)) + return -EMSGSIZE; + } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) { + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, + attrs->pci_vf.vf)) + return -EMSGSIZE; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, + attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, - attrs->split_subport_number)) + attrs->phys.split_subport_number)) return -EMSGSIZE; return 0; } @@ -1548,7 +1566,8 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; - u8 inline_mode, encap_mode; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; void *hdr; int err = 0; u16 mode; @@ -1624,7 +1643,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; - u8 inline_mode, encap_mode; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; int err = 0; u16 mode; @@ -2668,6 +2688,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, unsigned long total) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, 
&devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + status_msg)) + goto nla_put_failure; + if (component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, + component, done, total); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + return; + +out_free_msg: + nlmsg_free(msg); +} + +void devlink_flash_update_begin_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE, + NULL, NULL, 0, 0); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); + +void devlink_flash_update_end_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_END, + NULL, NULL, 0, 0); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); + +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + status_msg, component, done, total); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { @@ -4415,6 +4537,35 @@ nla_put_failure: return err; } +static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, + struct netlink_callback *cb, + enum devlink_command cmd) +{ + int index = cb->args[0]; + int tmp_index = index; + void *hdr; + int err; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if ((err && err != -EMSGSIZE) || tmp_index == index) + goto nla_put_failure; + + cb->args[0] = index; + genlmsg_end(skb, hdr); + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return err; +} + struct devlink_health_reporter { struct list_head list; void *priv; @@ -4647,17 +4798,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter, EXPORT_SYMBOL_GPL(devlink_health_report); static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) +devlink_health_reporter_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) { struct 
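The devlink_flash_update_begin_notify(), devlink_flash_update_status_notify() and devlink_flash_update_end_notify() helpers exported above let a driver stream flash progress to userspace over the DEVLINK_MCGRP_CONFIG multicast group. A hedged sketch of how a driver's flash routine might use them; foo_write_chunk(), FOO_CHUNK and the "fw.mgmt" component name are invented for illustration:

        #define FOO_CHUNK       4096UL

        static int foo_flash_fw(struct devlink *devlink, const struct firmware *fw)
        {
                unsigned long done, total = fw->size;
                int err = 0;

                devlink_flash_update_begin_notify(devlink);
                for (done = 0; done < total && !err; done += FOO_CHUNK) {
                        devlink_flash_update_status_notify(devlink, "Flashing",
                                                           "fw.mgmt", done, total);
                        err = foo_write_chunk(fw->data + done,
                                              min(FOO_CHUNK, total - done));
                }
                devlink_flash_update_end_notify(devlink);
                return err;
        }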
devlink_health_reporter *reporter; char *reporter_name; - if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; - reporter_name = - nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); mutex_lock(&devlink->reporters_lock); reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); if (reporter) @@ -4666,6 +4816,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, return reporter; } +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_health_reporter_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_cb(struct netlink_callback *cb) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink; + struct nlattr **attrs; + int err; + + attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return NULL; + + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, + devlink_nl_family.policy, cb->extack); + if (err) + goto free; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto unlock; + + reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + mutex_unlock(&devlink_mutex); + kfree(attrs); + return reporter; +unlock: + mutex_unlock(&devlink_mutex); +free: + kfree(attrs); + return NULL; +} + static void devlink_health_reporter_put(struct devlink_health_reporter *reporter) { @@ -4901,32 +5093,40 @@ out: return err; } -static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int +devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) { - struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; + u64 start = cb->args[0]; int err; - reporter = devlink_health_reporter_get_from_info(devlink, info); + reporter = devlink_health_reporter_get_from_cb(cb); if (!reporter) return -EINVAL; if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out; } - mutex_lock(&reporter->dump_lock); - err = devlink_health_do_dump(reporter, NULL); - if (err) - goto out; - - err = devlink_fmsg_snd(reporter->dump_fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); + if (!start) { + err = devlink_health_do_dump(reporter, NULL); + if (err) + goto unlock; + cb->args[1] = reporter->dump_ts; + } + if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { + NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); + err = -EAGAIN; + goto unlock; + } -out: + err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); +unlock: mutex_unlock(&reporter->dump_lock); +out: devlink_health_reporter_put(reporter); return err; } @@ -5263,7 +5463,7 @@ static const struct genl_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_get_doit, + .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, @@ -5386,6 +5586,38 @@ void devlink_free(struct devlink *devlink) } 
EXPORT_SYMBOL_GPL(devlink_free); +static void devlink_port_type_warn(struct work_struct *work) +{ + WARN(true, "Type was not set for devlink port."); +} + +static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) +{ + /* Ignore CPU and DSA flavours. */ + return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; +} + +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30) + +static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + /* Schedule a work to WARN in case driver does not set port + * type within timeout. + */ + schedule_delayed_work(&devlink_port->type_warn_dw, + DEVLINK_PORT_TYPE_WARN_TIMEOUT); +} + +static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + cancel_delayed_work_sync(&devlink_port->type_warn_dw); +} + /** * devlink_port_register - Register devlink port * @@ -5415,6 +5647,8 @@ int devlink_port_register(struct devlink *devlink, list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -5429,6 +5663,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; + devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); list_del(&devlink_port->list); @@ -5442,6 +5677,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, { if (WARN_ON(!devlink_port->registered)) return; + devlink_port_type_warn_cancel(devlink_port); spin_lock(&devlink_port->type_lock); devlink_port->type = type; devlink_port->type_dev = type_dev; @@ -5515,9 +5751,33 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); void devlink_port_type_clear(struct devlink_port *devlink_port) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); + devlink_port_type_warn_schedule(devlink_port); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); +static int __devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + const unsigned char *switch_id, + unsigned char switch_id_len) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + if (WARN_ON(devlink_port->registered)) + return -EEXIST; + attrs->set = true; + attrs->flavour = flavour; + if (switch_id) { + attrs->switch_port = true; + if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN)) + switch_id_len = MAX_PHYS_ITEM_ID_LEN; + memcpy(attrs->switch_id.id, switch_id, switch_id_len); + attrs->switch_id.id_len = switch_id_len; + } else { + attrs->switch_port = false; + } + return 0; +} + /** * devlink_port_attrs_set - Set port attributes * @@ -5540,26 +5800,72 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, unsigned char switch_id_len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; - if (WARN_ON(devlink_port->registered)) + ret = __devlink_port_attrs_set(devlink_port, flavour, + switch_id, switch_id_len); + if (ret) return; - attrs->set = true; - attrs->flavour = flavour; - attrs->port_number = port_number; attrs->split = split; - attrs->split_subport_number = split_subport_number; - if (switch_id) { - 
attrs->switch_port = true; - if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN)) - switch_id_len = MAX_PHYS_ITEM_ID_LEN; - memcpy(attrs->switch_id.id, switch_id, switch_id_len); - attrs->switch_id.id_len = switch_id_len; - } else { - attrs->switch_port = false; - } + attrs->phys.port_number = port_number; + attrs->phys.split_subport_number = split_subport_number; } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); +/** + * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes + * + * @devlink_port: devlink port + * @pf: associated PF for the devlink port instance + * @switch_id: if the port is part of switch, this is buffer with ID, + * otherwise this is NULL + * @switch_id_len: length of the switch_id buffer + */ +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, u16 pf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_PF, + switch_id, switch_id_len); + if (ret) + return; + + attrs->pci_pf.pf = pf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); + +/** + * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes + * + * @devlink_port: devlink port + * @pf: associated PF for the devlink port instance + * @vf: associated VF of a PF for the devlink port instance + * @switch_id: if the port is part of switch, this is buffer with ID, + * otherwise this is NULL + * @switch_id_len: length of the switch_id buffer + */ +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, + u16 pf, u16 vf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_VF, + switch_id, switch_id_len); + if (ret) + return; + attrs->pci_vf.pf = pf; + attrs->pci_vf.vf = vf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -5572,10 +5878,11 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (!attrs->split) - n = snprintf(name, len, "p%u", attrs->port_number); + n = snprintf(name, len, "p%u", attrs->phys.port_number); else - n = snprintf(name, len, "p%us%u", attrs->port_number, - attrs->split_subport_number); + n = snprintf(name, len, "p%us%u", + attrs->phys.port_number, + attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -5584,6 +5891,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, */ WARN_ON(1); return -EINVAL; + case DEVLINK_PORT_FLAVOUR_PCI_PF: + n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); + break; + case DEVLINK_PORT_FLAVOUR_PCI_VF: + n = snprintf(name, len, "pf%uvf%u", + attrs->pci_vf.pf, attrs->pci_vf.vf); + break; } if (n >= len) diff --git a/net/core/dst.c b/net/core/dst.c index e46366228eaf..1325316d9eab 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = dev_net(dst->dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d1011b2e24f..6288e69e94fc 100644 --- a/net/core/ethtool.c +++ 
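With the new DEVLINK_PORT_FLAVOUR_PCI_PF/PCI_VF attributes above, the core derives phys_port_name strings such as "pf0" and "pf0vf1" for representor ports. Attributes must be set before devlink_port_register() (the attrs helpers WARN if the port is already registered), and the port type should be set soon after registration, otherwise the new 30-second type-warn timer fires. A sketch for a hypothetical VF representor; rep, pf_num, vf_num, switch_id and the port index are assumed driver state:

        devlink_port_attrs_pci_vf_set(&rep->dl_port, switch_id, switch_id_len,
                                      pf_num, vf_num);
        err = devlink_port_register(devlink, &rep->dl_port, rep->port_index);
        if (err)
                return err;
        devlink_port_type_eth_set(&rep->dl_port, rep->netdev);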
b/net/core/ethtool.c @@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->mask.basic.n_proto = htons(0xffff); switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case ETHER_FLOW: { + const struct ethhdr *ether_spec, *ether_m_spec; + + ether_spec = &fs->h_u.ether_spec; + ether_m_spec = &fs->m_u.ether_spec; + + if (!is_zero_ether_addr(ether_m_spec->h_source)) { + ether_addr_copy(match->key.eth_addrs.src, + ether_spec->h_source); + ether_addr_copy(match->mask.eth_addrs.src, + ether_m_spec->h_source); + } + if (!is_zero_ether_addr(ether_m_spec->h_dest)) { + ether_addr_copy(match->key.eth_addrs.dst, + ether_spec->h_dest); + ether_addr_copy(match->mask.eth_addrs.dst, + ether_m_spec->h_dest); + } + if (ether_m_spec->h_proto) { + match->key.basic.n_proto = ether_spec->h_proto; + match->mask.basic.n_proto = ether_m_spec->h_proto; + } + } + break; case TCP_V4_FLOW: case UDP_V4_FLOW: { const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; diff --git a/net/core/filter.c b/net/core/filter.c index f615e42cf4ef..47f6386fb17a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -62,6 +62,7 @@ #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> @@ -2157,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } @@ -2168,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb) struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); - ri->ifindex = 0; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + ri->tgt_index = 0; if (unlikely(!dev)) { kfree_skb(skb); return -EINVAL; @@ -3487,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) { struct net_device *fwd; - u32 index = ri->ifindex; + u32 index = ri->tgt_index; int err; fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; + ri->tgt_index = 0; if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -3522,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (unlikely(err)) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3531,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (unlikely(err)) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { @@ -3605,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_map *map, struct bpf_redirect_info *ri) { - u32 index = ri->ifindex; - void *fwd = NULL; + u32 index = ri->tgt_index; + void *fwd = ri->tgt_value; int err; - ri->ifindex = 0; + ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); @@ -3652,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = 
ri->ifindex; - void *fwd = NULL; + u32 index = ri->tgt_index; + void *fwd = ri->tgt_value; int err = 0; - ri->ifindex = 0; + ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct bpf_dtab_netdev *dst = fwd; @@ -3696,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->ifindex; + u32 index = ri->tgt_index; struct net_device *fwd; int err = 0; if (map) return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, map); - ri->ifindex = 0; + ri->tgt_index = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3731,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; @@ -3751,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) return XDP_ABORTED; - ri->ifindex = ifindex; + ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + ri->flags = flags; + ri->tgt_index = ifindex; WRITE_ONCE(ri->map, map); return XDP_REDIRECT; @@ -4670,7 +4673,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; - if (res.fi->fib_nhs > 1) + if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { @@ -4737,7 +4740,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + if (unlikely(!idev || !idev->cnf.forwarding)) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { @@ -5191,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ -#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ -do { \ - switch (si->off) { \ - case offsetof(md_type, snd_cwnd): \ - CONVERT(snd_cwnd); break; \ - case offsetof(md_type, srtt_us): \ - CONVERT(srtt_us); break; \ - case offsetof(md_type, snd_ssthresh): \ - CONVERT(snd_ssthresh); break; \ - case offsetof(md_type, rcv_nxt): \ - CONVERT(rcv_nxt); break; \ - case offsetof(md_type, snd_nxt): \ - CONVERT(snd_nxt); break; \ - case offsetof(md_type, snd_una): \ - CONVERT(snd_una); break; \ - case offsetof(md_type, mss_cache): \ - CONVERT(mss_cache); break; \ - case offsetof(md_type, ecn_flags): \ - CONVERT(ecn_flags); break; \ - case offsetof(md_type, rate_delivered): \ - CONVERT(rate_delivered); break; \ - case offsetof(md_type, rate_interval_us): \ - CONVERT(rate_interval_us); break; \ - case offsetof(md_type, packets_out): \ - CONVERT(packets_out); break; \ - case 
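bpf_redirect_map() above now performs the map lookup at helper-call time and, when the lookup fails, returns the lower bits of the flags argument instead of unconditionally aborting, so the program can name its own fallback action (any value up to XDP_TX). A minimal hypothetical XDP program that redirects to an AF_XDP socket when one is bound to the receive queue and otherwise passes the packet up the stack; the map name and sizes are invented:

        #include <linux/bpf.h>
        #include "bpf_helpers.h"

        struct bpf_map_def SEC("maps") xsks_map = {
                .type = BPF_MAP_TYPE_XSKMAP,
                .key_size = sizeof(int),
                .value_size = sizeof(int),
                .max_entries = 64,
        };

        SEC("xdp")
        int xdp_sock_prog(struct xdp_md *ctx)
        {
                /* XDP_PASS is returned if no socket is attached to this queue. */
                return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
        }

        char _license[] SEC("license") = "GPL";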
offsetof(md_type, retrans_out): \ - CONVERT(retrans_out); break; \ - case offsetof(md_type, total_retrans): \ - CONVERT(total_retrans); break; \ - case offsetof(md_type, segs_in): \ - CONVERT(segs_in); break; \ - case offsetof(md_type, data_segs_in): \ - CONVERT(data_segs_in); break; \ - case offsetof(md_type, segs_out): \ - CONVERT(segs_out); break; \ - case offsetof(md_type, data_segs_out): \ - CONVERT(data_segs_out); break; \ - case offsetof(md_type, lost_out): \ - CONVERT(lost_out); break; \ - case offsetof(md_type, sacked_out): \ - CONVERT(sacked_out); break; \ - case offsetof(md_type, bytes_received): \ - CONVERT(bytes_received); break; \ - case offsetof(md_type, bytes_acked): \ - CONVERT(bytes_acked); break; \ - } \ -} while (0) - #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -5589,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) return false; if (off % size != 0) @@ -5620,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, FIELD)); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, - BPF_TCP_SOCK_GET_COMMON); +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \ + FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) if (insn > insn_buf) return insn - insn_buf; @@ -5637,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; + case offsetof(struct bpf_tcp_sock, snd_cwnd): + BPF_TCP_SOCK_GET_COMMON(snd_cwnd); + break; + case offsetof(struct bpf_tcp_sock, srtt_us): + BPF_TCP_SOCK_GET_COMMON(srtt_us); + break; + case offsetof(struct bpf_tcp_sock, snd_ssthresh): + BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); + break; + case offsetof(struct bpf_tcp_sock, rcv_nxt): + BPF_TCP_SOCK_GET_COMMON(rcv_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_nxt): + BPF_TCP_SOCK_GET_COMMON(snd_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_una): + BPF_TCP_SOCK_GET_COMMON(snd_una); + break; + case offsetof(struct bpf_tcp_sock, mss_cache): + BPF_TCP_SOCK_GET_COMMON(mss_cache); + break; + case offsetof(struct bpf_tcp_sock, ecn_flags): + BPF_TCP_SOCK_GET_COMMON(ecn_flags); + break; + case offsetof(struct bpf_tcp_sock, rate_delivered): + BPF_TCP_SOCK_GET_COMMON(rate_delivered); + break; + case offsetof(struct bpf_tcp_sock, rate_interval_us): + BPF_TCP_SOCK_GET_COMMON(rate_interval_us); + break; + case offsetof(struct bpf_tcp_sock, packets_out): + BPF_TCP_SOCK_GET_COMMON(packets_out); + break; + case offsetof(struct bpf_tcp_sock, retrans_out): + BPF_TCP_SOCK_GET_COMMON(retrans_out); + break; + case offsetof(struct bpf_tcp_sock, total_retrans): + BPF_TCP_SOCK_GET_COMMON(total_retrans); + break; + case offsetof(struct bpf_tcp_sock, segs_in): + BPF_TCP_SOCK_GET_COMMON(segs_in); + break; + case offsetof(struct bpf_tcp_sock, data_segs_in): + BPF_TCP_SOCK_GET_COMMON(data_segs_in); + break; + case 
offsetof(struct bpf_tcp_sock, segs_out): + BPF_TCP_SOCK_GET_COMMON(segs_out); + break; + case offsetof(struct bpf_tcp_sock, data_segs_out): + BPF_TCP_SOCK_GET_COMMON(data_segs_out); + break; + case offsetof(struct bpf_tcp_sock, lost_out): + BPF_TCP_SOCK_GET_COMMON(lost_out); + break; + case offsetof(struct bpf_tcp_sock, sacked_out): + BPF_TCP_SOCK_GET_COMMON(sacked_out); + break; + case offsetof(struct bpf_tcp_sock, bytes_received): + BPF_TCP_SOCK_GET_COMMON(bytes_received); + break; + case offsetof(struct bpf_tcp_sock, bytes_acked): + BPF_TCP_SOCK_GET_COMMON(bytes_acked); + break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; } return insn - insn_buf; @@ -5650,7 +5692,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, @@ -5694,6 +5736,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) return INET_ECN_set_ce(skb); } +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, @@ -5896,6 +5978,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } @@ -5933,6 +6019,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; @@ -6113,6 +6203,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_sockopt_event_output_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case 
BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -6792,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (size != size_default) return false; } @@ -6800,6 +6908,13 @@ static bool sock_addr_is_valid_access(int off, int size, if (size != size_default) return false; break; + case offsetof(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; default: if (type == BPF_READ) { if (size != size_default) @@ -6843,6 +6958,11 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; + break; default: if (size != size_default) return false; @@ -7620,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * - * It doesn't support SIZE argument though since narrow stores are not - * supported for now. - * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor @@ -7630,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. 
*/ -#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ @@ -7641,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM( \ - BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ @@ -7654,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, TF) \ do { \ if (type == BPF_WRITE) { \ - SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ - TF); \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ @@ -7750,6 +7866,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; } return insn - insn_buf; @@ -7837,9 +7958,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, - SOCK_OPS_GET_TCP_SOCK_FIELD); - if (insn > insn_buf) return insn - insn_buf; @@ -8009,6 +8127,82 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); + break; + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); + break; + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); + break; + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); + break; + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); + break; + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); + break; + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); + break; + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); + break; + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); + break; + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); + break; + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); + break; + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); + break; + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); + break; + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); + break; + case offsetof(struct bpf_sock_ops, data_segs_out): + 
SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); + break; + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); + break; + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); + break; + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); + break; + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); + break; + case offsetof(struct bpf_sock_ops, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + break; } return insn - insn_buf; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index edd622956083..3e6fedb57bc1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -27,6 +27,10 @@ #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_labels.h> +#endif static DEFINE_MUTEX(flow_dissector_mutex); @@ -199,6 +203,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +void skb_flow_dissect_meta(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_meta *meta; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) + return; + + meta = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_META, + target_container); + meta->ingress_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(skb_flow_dissect_meta); + static void skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, struct flow_dissector *flow_dissector, @@ -216,6 +236,46 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, } void +skb_flow_dissect_ct(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + u16 *ctinfo_map, + size_t mapsize) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct flow_dissector_key_ct *key; + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) + return; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CT, + target_container); + + if (ctinfo < mapsize) + key->ct_state = ctinfo_map[ctinfo]; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) + key->ct_zone = ct->zone.id; +#endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + key->ct_mark = ct->mark; +#endif + + cl = nf_ct_labels_find(ct); + if (cl) + memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); +#endif /* CONFIG_NF_CONNTRACK */ +} +EXPORT_SYMBOL(skb_flow_dissect_ct); + +void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -757,7 +817,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. 
- * FLOW_DISSECTOR_F_STOP_AT_L3. + * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the @@ -922,11 +982,6 @@ proto_again: __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; - } - break; } case htons(ETH_P_IPV6): { @@ -975,9 +1030,6 @@ proto_again: __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; } case htons(ETH_P_8021AD): diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 5ce7d47a960e..76f8db3841d7 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; - rule = kzalloc(sizeof(struct flow_rule) + - sizeof(struct flow_action_entry) * num_actions, + rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); if (!rule) return NULL; @@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc); (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ +void flow_rule_match_meta(const struct flow_rule *rule, + struct flow_match_meta *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); +} +EXPORT_SYMBOL(flow_rule_match_meta); + void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out) { @@ -158,3 +164,121 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule, FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); } EXPORT_SYMBOL(flow_rule_match_enc_opts); + +struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return ERR_PTR(-ENOMEM); + + block_cb->net = net; + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + block_cb->release = release; + + return block_cb; +} +EXPORT_SYMBOL(flow_block_cb_alloc); + +void flow_block_cb_free(struct flow_block_cb *block_cb) +{ + if (block_cb->release) + block_cb->release(block_cb->cb_priv); + + kfree(block_cb); +} +EXPORT_SYMBOL(flow_block_cb_free); + +struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f, + tc_setup_cb_t *cb, void *cb_ident) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, f->driver_block_list, driver_list) { + if (block_cb->net == f->net && + block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return block_cb; + } + + return NULL; +} +EXPORT_SYMBOL(flow_block_cb_lookup); + +void *flow_block_cb_priv(struct flow_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(flow_block_cb_priv); + +void flow_block_cb_incref(struct flow_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(flow_block_cb_incref); + +unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(flow_block_cb_decref); + +bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident, + struct list_head *driver_block_list) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, driver_block_list, driver_list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return 
true; + } + + return false; +} +EXPORT_SYMBOL(flow_block_cb_is_busy); + +int flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head *driver_block_list, + tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, + bool ingress_only) +{ + struct flow_block_cb *block_cb; + + if (ingress_only && + f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + f->driver_block_list = driver_block_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) + return -EBUSY; + + block_cb = flow_block_cb_alloc(f->net, cb, cb_ident, + cb_priv, NULL); + if (IS_ERR(block_cb)) + return PTR_ERR(block_cb); + + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, driver_block_list); + return 0; + case FLOW_BLOCK_UNBIND: + block_cb = flow_block_cb_lookup(f, cb, cb_ident); + if (!block_cb) + return -ENOENT; + + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + return 0; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL(flow_block_cb_setup_simple); diff --git a/net/core/hwbm.c b/net/core/hwbm.c index fd822ca5a245..ac1a66df9adc 100644 --- a/net/core/hwbm.c +++ b/net/core/hwbm.c @@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) } EXPORT_SYMBOL_GPL(hwbm_pool_refill); -int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num) { int err, i; - unsigned long flags; - spin_lock_irqsave(&bm_pool->lock, flags); + mutex_lock(&bm_pool->buf_lock); if (bm_pool->buf_num == bm_pool->size) { pr_warn("pool already filled\n"); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return bm_pool->buf_num; } if (buf_num + bm_pool->buf_num > bm_pool->size) { pr_warn("cannot allocate %d buffers for pool\n", buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return 0; } if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { pr_warn("Adding %d buffers to the %d current buffers will overflow\n", buf_num, bm_pool->buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return 0; } for (i = 0; i < buf_num; i++) { - err = hwbm_pool_refill(bm_pool, gfp); + err = hwbm_pool_refill(bm_pool, GFP_KERNEL); if (err < 0) break; } @@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) bm_pool->buf_num += i; pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return i; } diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 04fdc9535772..f153e0601838 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -163,9 +163,16 @@ static void linkwatch_do_dev(struct net_device *dev) static void __linkwatch_run_queue(int urgent_only) { +#define MAX_DO_DEV_PER_LOOP 100 + + int do_dev = MAX_DO_DEV_PER_LOOP; struct net_device *dev; LIST_HEAD(wrk); + /* Give urgent case more budget */ + if (urgent_only) + do_dev += MAX_DO_DEV_PER_LOOP; + /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not @@ -184,7 +191,7 @@ static void __linkwatch_run_queue(int urgent_only) spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); - while (!list_empty(&wrk)) { + while (!list_empty(&wrk) && do_dev > 0) { dev = list_first_entry(&wrk, struct net_device, link_watch_list); 
list_del_init(&dev->link_watch_list); @@ -195,9 +202,13 @@ static void __linkwatch_run_queue(int urgent_only) } spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); + do_dev--; spin_lock_irq(&lweventlist_lock); } + /* Add the remaining work back to lweventlist */ + list_splice_init(&wrk, &lweventlist); + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9e7fc929bc50..742cea4ce72e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, int error; struct neigh_hash_table *nht; + trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); + if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 470b179d599e..283ddb2dbc7d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); #endif +#if IS_ENABLED(CONFIG_PAGE_POOL) +#include <trace/events/page_pool.h> +#endif + #include <trace/events/neigh.h> EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f7b6dda798e0..a0e0d298c991 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -152,6 +152,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net) } } +static void ops_pre_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + + if (ops->pre_exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->pre_exit(net); + } +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { @@ -337,6 +348,12 @@ out_undo: list_add(&net->exit_list, &net_exit_list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + + synchronize_rcu(); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); ops = saved_ops; @@ -560,10 +577,15 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } + /* Run all of the network namespace pre_exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + /* * Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so * the rcu_barrier() below isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. 
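The new ->pre_exit() pernet hook above runs before the synchronize_rcu() that now precedes ->exit(), so a subsystem can unhook itself from RCU-protected fast paths first and free its state afterwards without racing readers. A sketch under assumed names (the "foo" subsystem, foo_net_id and its helpers are illustrative, not from this patch; <net/netns/generic.h> is assumed):

static unsigned int foo_net_id;

struct foo_net {
	struct list_head entries;
};

static int __net_init foo_net_init(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	INIT_LIST_HEAD(&fn->entries);
	return 0;
}

static void __net_exit foo_net_pre_exit(struct net *net)
{
	/* Stop new lookups; in-flight RCU readers are flushed by the
	 * synchronize_rcu() the core issues between pre_exit and exit.
	 */
	foo_unregister_hooks(net);		/* hypothetical */
}

static void __net_exit foo_net_exit(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	foo_free_entries(fn);			/* hypothetical */
}

static struct pernet_operations foo_net_ops = {
	.init     = foo_net_init,
	.pre_exit = foo_net_pre_exit,
	.exit     = foo_net_exit,
	.id       = &foo_net_id,
	.size     = sizeof(struct foo_net),
};

Registration stays as before, e.g. register_pernet_subsys(&foo_net_ops); only the two-phase teardown is new.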
*/ synchronize_rcu(); @@ -1121,6 +1143,8 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); return error; @@ -1135,6 +1159,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); } @@ -1159,6 +1185,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dd8b1a460d64..2cf27da1baeb 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np) if (!np->local_ip.ip) { if (!np->ipv6) { + const struct in_ifaddr *ifa; + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) + goto put_noaddr; - if (!in_dev || !in_dev->ifa_list) { + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { +put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } - np->local_ip.ip = in_dev->ifa_list->ifa_local; + np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5b2252c6d49b..3272dc7a8c81 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -4,9 +4,11 @@ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ + #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/device.h> #include <net/page_pool.h> #include <linux/dma-direction.h> @@ -14,6 +16,8 @@ #include <linux/page-flags.h> #include <linux/mm.h> /* for __put_page() */ +#include <trace/events/page_pool.h> + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -43,6 +47,14 @@ static int page_pool_init(struct page_pool *pool, if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; + atomic_set(&pool->pages_state_release_cnt, 0); + + /* Driver calling page_pool_create() also call page_pool_destroy() */ + refcount_set(&pool->user_cnt, 1); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + get_device(pool->p.dev); + return 0; } @@ -61,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) kfree(pool); return ERR_PTR(err); } + return pool; } EXPORT_SYMBOL(page_pool_create); @@ -151,6 +164,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, page->dma_addr = dma; skip_dma_map: + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + + trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + /* When page just alloc'ed is should/must have refcnt 1. 
*/ return page; } @@ -173,6 +191,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) } EXPORT_SYMBOL(page_pool_alloc_pages); +/* Calculate distance between two u32 values, valid if distance is below 2^(31) + * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution + */ +#define _distance(a, b) (s32)((a) - (b)) + +static s32 page_pool_inflight(struct page_pool *pool) +{ + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 distance; + + distance = _distance(hold_cnt, release_cnt); + + trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); + return distance; +} + +static bool __page_pool_safe_to_destroy(struct page_pool *pool) +{ + s32 inflight = page_pool_inflight(pool); + + /* The distance should not be able to become negative */ + WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + + return (inflight == 0); +} + /* Cleanup page_pool state from page */ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) @@ -180,7 +225,7 @@ static void __page_pool_clean_page(struct page_pool *pool, dma_addr_t dma; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) - return; + goto skip_dma_unmap; dma = page->dma_addr; /* DMA unmap */ @@ -188,12 +233,27 @@ static void __page_pool_clean_page(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; +skip_dma_unmap: + atomic_inc(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, + atomic_read(&pool->pages_state_release_cnt)); +} + +/* unmap the page and clean our state */ +void page_pool_unmap_page(struct page_pool *pool, struct page *page) +{ + /* When page is unmapped, this implies page will not be + * returned to page_pool. 
+ */ + __page_pool_clean_page(pool, page); } +EXPORT_SYMBOL(page_pool_unmap_page); /* Return a page to the page allocator, cleaning up our state */ static void __page_pool_return_page(struct page_pool *pool, struct page *page) { __page_pool_clean_page(pool, page); + put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a @@ -285,21 +345,45 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __page_pool_destroy_rcu(struct rcu_head *rcu) +static void __warn_in_flight(struct page_pool *pool) { - struct page_pool *pool; + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 distance; - pool = container_of(rcu, struct page_pool, rcu); + distance = _distance(hold_cnt, release_cnt); + + /* Drivers should fix this, but only problematic when DMA is used */ + WARN(1, "Still in-flight pages:%d hold:%u released:%u", + distance, hold_cnt, release_cnt); +} + +void __page_pool_free(struct page_pool *pool) +{ + /* Only last user actually free/release resources */ + if (!page_pool_put(pool)) + return; WARN(pool->alloc.count, "API usage violation"); + WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); + + /* Can happen due to forced shutdown */ + if (!__page_pool_safe_to_destroy(pool)) + __warn_in_flight(pool); - __page_pool_empty_ring(pool); ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + kfree(pool); } +EXPORT_SYMBOL(__page_pool_free); -/* Cleanup and release resources */ -void page_pool_destroy(struct page_pool *pool) +/* Request to shutdown: release pages cached by page_pool, and check + * for in-flight pages + */ +bool __page_pool_request_shutdown(struct page_pool *pool) { struct page *page; @@ -317,7 +401,6 @@ void page_pool_destroy(struct page_pool *pool) */ __page_pool_empty_ring(pool); - /* An xdp_mem_allocator can still ref page_pool pointer */ - call_rcu(&pool->rcu, __page_pool_destroy_rcu); + return __page_pool_safe_to_destroy(pool); } -EXPORT_SYMBOL(page_pool_destroy); +EXPORT_SYMBOL(__page_pool_request_shutdown); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f975c5e2a369..bb9915291644 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2118,9 +2118,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) rcu_read_lock(); in_dev = __in_dev_get_rcu(pkt_dev->odev); if (in_dev) { - if (in_dev->ifa_list) { - pkt_dev->saddr_min = - in_dev->ifa_list->ifa_address; + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(in_dev->ifa_list); + if (ifa) { + pkt_dev->saddr_min = ifa->ifa_address; pkt_dev->saddr_max = pkt_dev->saddr_min; } } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cec60583931f..1ee6460f8275 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -751,6 +751,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) struct nlattr *mx; int i, valid = 0; + /* nothing is dumped for dst_default_metrics, so just skip the loop */ + if (metrics == dst_default_metrics.metrics) + return 0; + mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; @@ -908,6 +912,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ 
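The page_pool in-flight accounting added above relies on serial number arithmetic: pages_state_hold_cnt and pages_state_release_cnt are free-running u32 counters, and the signed cast of their difference stays correct across wraparound as long as fewer than 2^31 pages are outstanding. A worked instance of the _distance() macro (assumes linux/types.h for u32/s32):

static s32 example_inflight(void)
{
	u32 hold_cnt    = 3;		/* has wrapped past UINT_MAX */
	u32 release_cnt = 0xfffffffcU;	/* has not wrapped yet */

	/* Unsigned subtraction wraps around to 7, and the s32 cast keeps
	 * the sign meaningful: 7 pages are still held by the driver. A
	 * negative result can only mean more releases than holds, which
	 * __page_pool_safe_to_destroy() above turns into a WARN.
	 */
	return (s32)(hold_cnt - release_cnt);	/* == 7 */
}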
nla_total_size(MAX_VLAN_LIST_LEN * @@ -1197,6 +1202,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; memset(&ivi, 0, sizeof(ivi)); @@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; @@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || @@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c8cd99c3603f..6f1e31f674a3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -59,6 +59,7 @@ #include <linux/errqueue.h> #include <linux/prefetch.h> #include <linux/if_vlan.h> +#include <linux/mpls.h> #include <net/protocol.h> #include <net/dst.h> @@ -66,12 +67,14 @@ #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> +#include <net/mpls.h> #include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "datagram.h" @@ -365,18 +368,20 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc; - unsigned long flags; - void *data; + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - local_irq_save(flags); - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, gfp_mask); - local_irq_restore(flags); - return data; + return page_frag_alloc(&nc->page, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + fragsz = SKB_DATA_ALIGN(fragsz); + + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } +EXPORT_SYMBOL(napi_alloc_frag); /** * netdev_alloc_frag - allocate a page fragment @@ -387,26 +392,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - fragsz = SKB_DATA_ALIGN(fragsz); - - return __netdev_alloc_frag(fragsz, GFP_ATOMIC); -} -EXPORT_SYMBOL(netdev_alloc_frag); - -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - return page_frag_alloc(&nc->page, fragsz, gfp_mask); -} + struct page_frag_cache *nc; + void *data; -void *napi_alloc_frag(unsigned int fragsz) -{ fragsz = SKB_DATA_ALIGN(fragsz); - - return 
__napi_alloc_frag(fragsz, GFP_ATOMIC); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); + } else { + local_bh_disable(); + data = __napi_alloc_frag(fragsz, GFP_ATOMIC); + local_bh_enable(); + } + return data; } -EXPORT_SYMBOL(napi_alloc_frag); +EXPORT_SYMBOL(netdev_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -425,7 +425,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; - unsigned long flags; struct sk_buff *skb; bool pfmemalloc; void *data; @@ -446,13 +445,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - local_irq_save(flags); - - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; - - local_irq_restore(flags); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + } else { + local_bh_disable(); + nc = this_cpu_ptr(&napi_alloc_cache.page); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + local_bh_enable(); + } if (unlikely(!data)) return NULL; @@ -706,6 +709,105 @@ void kfree_skb_list(struct sk_buff *segs) } EXPORT_SYMBOL(kfree_skb_list); +/* Dump skb information and contents. + * + * Must only be called from net_ratelimit()-ed paths. + * + * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise. + */ +void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) +{ + static atomic_t can_dump_full = ATOMIC_INIT(5); + struct skb_shared_info *sh = skb_shinfo(skb); + struct net_device *dev = skb->dev; + struct sock *sk = skb->sk; + struct sk_buff *list_skb; + bool has_mac, has_trans; + int headroom, tailroom; + int i, len, seg_len; + + if (full_pkt) + full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0; + + if (full_pkt) + len = skb->len; + else + len = min_t(int, skb->len, MAX_HEADER + 128); + + headroom = skb_headroom(skb); + tailroom = skb_tailroom(skb); + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" + "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" + "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + level, skb->len, headroom, skb_headlen(skb), tailroom, + has_mac ? skb->mac_header : -1, + has_mac ? skb_mac_header_len(skb) : -1, + skb->network_header, + has_trans ? skb_network_header_len(skb) : -1, + has_trans ? 
skb->transport_header : -1, + sh->tx_flags, sh->nr_frags, + sh->gso_size, sh->gso_type, sh->gso_segs, + skb->csum, skb->ip_summed, skb->csum_complete_sw, + skb->csum_valid, skb->csum_level, + skb->hash, skb->sw_hash, skb->l4_hash, + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + + if (dev) + printk("%sdev name=%s feat=0x%pNF\n", + level, dev->name, &dev->features); + if (sk) + printk("%ssk family=%hu type=%hu proto=%hu\n", + level, sk->sk_family, sk->sk_type, sk->sk_protocol); + + if (full_pkt && headroom) + print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->head, headroom, false); + + seg_len = min_t(int, skb_headlen(skb), len); + if (seg_len) + print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, seg_len, false); + len -= seg_len; + + if (full_pkt && tailroom) + print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb_tail_pointer(skb), tailroom, false); + + for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + skb_frag_foreach_page(frag, frag->page_offset, + skb_frag_size(frag), p, p_off, p_len, + copied) { + seg_len = min_t(int, p_len, len); + vaddr = kmap_atomic(p); + print_hex_dump(level, "skb frag: ", + DUMP_PREFIX_OFFSET, + 16, 1, vaddr + p_off, seg_len, false); + kunmap_atomic(vaddr); + len -= seg_len; + if (!len) + break; + } + } + + if (full_pkt && skb_has_frag_list(skb)) { + printk("skb fraglist:\n"); + skb_walk_frags(skb, list_skb) + skb_dump(level, list_skb, true); + } +} +EXPORT_SYMBOL(skb_dump); + /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error @@ -909,6 +1011,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) } /** + * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg + * @first: first sk_buff of the msg + */ +struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) +{ + struct sk_buff *n; + + n = alloc_skb(0, GFP_ATOMIC); + if (!n) + return NULL; + + n->len = first->len; + n->data_len = first->len; + n->truesize = first->truesize; + + skb_shinfo(n)->frag_list = first; + + __copy_skb_header(n, first); + n->destructor = NULL; + + return n; +} +EXPORT_SYMBOL_GPL(alloc_skb_for_msg); + +/** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents @@ -2508,7 +2635,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = ops->update(skb->data + offset, copy, csum); + csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, + skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -2535,9 +2663,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, frag->page_offset + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = ops->update(vaddr + p_off, p_len, 0); + csum2 = INDIRECT_CALL_1(ops->update, + csum_partial_ext, + vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = ops->combine(csum, csum2, pos, p_len); + csum = INDIRECT_CALL_1(ops->combine, + csum_block_add_ext, csum, + csum2, pos, p_len); pos += p_len; } @@ -2560,7 +2692,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, copy = len; csum2 = __skb_checksum(frag_iter, offset - start, copy, 0, ops); - csum = ops->combine(csum, csum2, pos, copy); + csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, + csum, 
csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -5294,6 +5427,173 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) } EXPORT_SYMBOL(skb_vlan_push); +/* Update the ethertype of hdr and the skb csum value if required. */ +static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~hdr->h_proto, ethertype }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + hdr->h_proto = ethertype; +} + +/** + * skb_mpls_push() - push a new MPLS header after the mac header + * + * @skb: buffer + * @mpls_lse: MPLS label stack entry to push + * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) +{ + struct mpls_shim_hdr *lse; + int err; + + if (unlikely(!eth_p_mpls(mpls_proto))) + return -EINVAL; + + /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -EINVAL; + + err = skb_cow_head(skb, MPLS_HLEN); + if (unlikely(err)) + return err; + + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_protocol(skb, skb->protocol); + } + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); + + lse = mpls_hdr(skb); + lse->label_stack_entry = mpls_lse; + skb_postpush_rcsum(skb, lse, MPLS_HLEN); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) + skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); + skb->protocol = mpls_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_push); + +/** + * skb_mpls_pop() - pop the outermost MPLS header + * + * @skb: buffer + * @next_proto: ethertype of header after popped MPLS header + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + struct ethhdr *hdr; + + /* use mpls_hdr() to get ethertype to account for VLANs. */ + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); + skb_mod_eth_type(skb, hdr, next_proto); + } + skb->protocol = next_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_pop); + +/** + * skb_mpls_update_lse() - modify outermost MPLS header and update csum + * + * @skb: buffer + * @mpls_lse: new MPLS label stack entry to update to + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. 
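skb_mpls_push(), skb_mpls_pop(), skb_mpls_update_lse() and skb_mpls_dec_ttl() give core code one place to manipulate the outermost label stack entry with checksums kept consistent. A hedged caller sketch; the encapsulation function and the chosen label/TTL values are illustrative, only the skb_mpls_*() helpers and the MPLS_LS_* uapi masks are real:

/* Expects skb->data at the mac header, as the helpers document. */
static int foo_mpls_encap(struct sk_buff *skb)
{
	u32 label = 100, tc = 0, bos = 1, ttl = 64;
	__be32 lse = cpu_to_be32(label << MPLS_LS_LABEL_SHIFT |
				 tc << MPLS_LS_TC_SHIFT |
				 bos << MPLS_LS_S_SHIFT |
				 ttl << MPLS_LS_TTL_SHIFT);
	int err;

	err = skb_mpls_push(skb, lse, htons(ETH_P_MPLS_UC));
	if (err)
		return err;

	/* A later forwarding step can age the pushed header in place. */
	return skb_mpls_dec_ttl(skb);
}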
+ */ +int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + mpls_hdr(skb)->label_stack_entry = mpls_lse; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_update_lse); + +/** + * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header + * + * @skb: buffer + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_dec_ttl(struct sk_buff *skb) +{ + u32 lse; + u8 ttl; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); + ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + if (!--ttl) + return -EINVAL; + + lse &= ~MPLS_LS_TTL_MASK; + lse |= ttl << MPLS_LS_TTL_SHIFT; + + return skb_mpls_update_lse(skb, cpu_to_be32(lse)); +} +EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); + /** * alloc_skb_with_frags - allocate skb with page frags * diff --git a/net/core/sock.c b/net/core/sock.c index aa4a00d381e3..3e073ca6138f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1039,6 +1039,10 @@ set_rcvbuf: } break; + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -2843,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - RCU_INIT_POINTER(sk->sk_wq, sock->wq); + RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index be6092ac69f8..52d4faeee18b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* Make sure page count doesn't overflow. */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); + err = bpf_map_charge_init(&stab->map.memory, cost); if (err) goto free_stab; @@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (stab->sks) return &stab->map; err = -ENOMEM; + bpf_map_charge_finish(&stab->map.memory); free_stab: kfree(stab); return ERR_PTR(err); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index dc4aefdf2a08..9408f9264d05 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) return 0; } EXPORT_SYMBOL(reuseport_attach_prog); + +int reuseport_detach_prog(struct sock *sk) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return sk->sk_reuseport ? 
-ENOENT : -EINVAL; + + old_prog = NULL; + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_swap_protected(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); + spin_unlock_bh(&reuseport_lock); + + if (!old_prog) + return -ENOENT; + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_detach_prog); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8aab08b131d9..d7bf62ffbb5e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,6 +14,8 @@ #include <net/page_pool.h> #include <net/xdp.h> +#include <net/xdp_priv.h> /* struct xdp_mem_allocator */ +#include <trace/events/xdp.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN; static bool mem_id_init; /* false */ static struct rhashtable *mem_id_ht; -struct xdp_mem_allocator { - struct xdp_mem_info mem; - union { - void *allocator; - struct page_pool *page_pool; - struct zero_copy_allocator *zc_alloc; - }; - struct rhash_head node; - struct rcu_head rcu; -}; - static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; @@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); + /* Allocator have indicated safe to remove before this is called */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + page_pool_free(xa->page_pool); + /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Notice, driver is expected to free the *allocator, - * e.g. page_pool, and MUST also use RCU free. - */ - /* Poison memory */ xa->mem.id = 0xFFFF; xa->mem.type = 0xF0F0; @@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } +static bool __mem_id_disconnect(int id, bool force) +{ + struct xdp_mem_allocator *xa; + bool safe_to_remove = true; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); + return true; + } + xa->disconnect_cnt++; + + /* Detects in-flight packet-pages for page_pool */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + safe_to_remove = page_pool_request_shutdown(xa->page_pool); + + trace_mem_disconnect(xa, safe_to_remove, force); + + if ((safe_to_remove || force) && + !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); + return (safe_to_remove|force); +} + +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (30 * HZ) +#define DEFER_MAX_RETRIES 120 + +static void mem_id_disconnect_defer_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); + bool force = false; + + if (xa->disconnect_cnt > DEFER_MAX_RETRIES) + force = true; + + if (__mem_id_disconnect(xa->mem.id, force)) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, xa->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + + pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", + __func__, xa->mem.id, xa->disconnect_cnt, sec); + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); +} + void xdp_rxq_info_unreg_mem_model(struct 
xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; + if (__mem_id_disconnect(id, false)) + return; + + /* Could not disconnect, defer new disconnect attempt to later */ mutex_lock(&mem_id_lock); xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + xa->defer_start = jiffies; + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); mutex_unlock(&mem_id_lock); + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); +/* This unregister operation will also cleanup and destroy the + * allocator. The page_pool_free() operation is first called when it's + * safe to remove, possibly deferred to a workqueue. + */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -301,12 +364,18 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { + ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); + xdp_rxq->mem.id = 0; errno = PTR_ERR(ptr); goto err; } + if (type == MEM_TYPE_PAGE_POOL) + page_pool_get(xdp_alloc->page_pool); + mutex_unlock(&mem_id_lock); + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; err: mutex_unlock(&mem_id_lock); @@ -333,10 +402,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) { + if (likely(xa)) { napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); } else { + /* Hopefully stack show who to blame for late return */ + WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); + trace_mem_return_failed(mem, page); put_page(page); } rcu_read_unlock(); @@ -379,6 +451,21 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + int xdp_attachment_query(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 85c10c8f50bd..1b7381ff787b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -830,7 +830,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index d449f78c1bd0..6e942dda1bcd 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -106,6 +106,7 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" select 
NET_DSA_TAG_8021Q + select PACKING help Say Y or M if you want to enable support for tagging frames with the NXP SJA1105 switch family. Both the native tagging protocol (which diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 820dd8da57fc..3abd173ebacb 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -257,7 +257,7 @@ static int dsa_port_setup(struct dsa_port *dp) enum devlink_port_flavour flavour; struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - int err; + int err = 0; if (dp->type == DSA_PORT_TYPE_UNUSED) return 0; @@ -295,19 +295,15 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_DSA: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_USER: err = dsa_slave_create(dp); @@ -319,7 +315,10 @@ static int dsa_port_setup(struct dsa_port *dp) break; } - return 0; + if (err) + devlink_port_unregister(&dp->devlink_port); + + return err; } static void dsa_port_teardown(struct dsa_port *dp) @@ -347,7 +346,7 @@ static void dsa_port_teardown(struct dsa_port *dp) static int dsa_switch_setup(struct dsa_switch *ds) { - int err; + int err = 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -365,29 +364,41 @@ static int dsa_switch_setup(struct dsa_switch *ds) err = devlink_register(ds->devlink, ds->dev); if (err) - return err; + goto free_devlink; err = dsa_switch_register_notifier(ds); if (err) - return err; + goto unregister_devlink; err = ds->ops->setup(ds); if (err < 0) - return err; + goto unregister_notifier; if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); - if (!ds->slave_mii_bus) - return -ENOMEM; + if (!ds->slave_mii_bus) { + err = -ENOMEM; + goto unregister_notifier; + } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - return err; + goto unregister_notifier; } return 0; + +unregister_notifier: + dsa_switch_unregister_notifier(ds); +unregister_devlink: + devlink_unregister(ds->devlink); +free_devlink: + devlink_free(ds->devlink); + ds->devlink = NULL; + + return err; } static void dsa_switch_teardown(struct dsa_switch *ds) @@ -397,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); + if (ds->ops->teardown) + ds->ops->teardown(ds); + if (ds->devlink) { devlink_unregister(ds->devlink); devlink_free(ds->devlink); @@ -409,8 +423,8 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - int err; + int device, port, i; + int err = 0; for (device = 0; device < DSA_MAX_SWITCHES; device++) { ds = dst->ds[device]; @@ -419,18 +433,41 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) err = dsa_switch_setup(ds); if (err) - return err; + goto switch_teardown; for (port = 0; port < ds->num_ports; port++) { dp = &ds->ports[port]; err = dsa_port_setup(dp); if (err) - return err; + goto ports_teardown; } } return 0; + +ports_teardown: + for (i = 0; i < port; i++) + dsa_port_teardown(&ds->ports[i]); + + dsa_switch_teardown(ds); + +switch_teardown: + for (i = 0; i < device; i++) { + ds = dst->ds[i]; + if (!ds) + 
continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + dsa_port_teardown(dp); + } + + dsa_switch_teardown(ds); + } + + return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) @@ -492,17 +529,24 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_switches(dst); if (err) - return err; + goto teardown_default_cpu; err = dsa_tree_setup_master(dst); if (err) - return err; + goto teardown_switches; dst->setup = true; pr_info("DSA: tree %d setup\n", dst->index); return 0; + +teardown_switches: + dsa_tree_teardown_switches(dst); +teardown_default_cpu: + dsa_tree_teardown_default_cpu(dst); + + return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) @@ -543,8 +587,10 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, dst->ds[index] = ds; err = dsa_tree_setup(dst); - if (err) - dsa_tree_remove_switch(dst, index); + if (err) { + dst->ds[index] = NULL; + dsa_tree_put(dst); + } return err; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3986cedfafc0..12f8c7ee4dd8 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -150,6 +150,8 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, struct switchdev_trans *trans); int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, struct switchdev_trans *trans); +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, + struct switchdev_trans *trans); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); @@ -159,6 +161,23 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state); +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state); +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state); +void dsa_port_phylink_mac_an_restart(struct phylink_config *config); +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface); +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev); +extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 363eab6df51b..f071acf2842b 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -261,6 +261,18 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, return err; } +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (switchdev_trans_ph_prepare(trans)) + return ds->ops->port_egress_floods ? 
0 : -EOPNOTSUPP; + + return ds->ops->port_egress_floods(ds, port, true, mrouter); +} + int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid) { @@ -336,9 +348,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - /* Can be called from dsa_slave_port_obj_add() or - * dsa_slave_vlan_rx_add_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -354,12 +363,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - if (vlan->obj.orig_dev && netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - - /* Can be called from dsa_slave_port_obj_del() or - * dsa_slave_vlan_rx_kill_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); @@ -418,6 +421,108 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_validate) + return; + + ds->ops->phylink_validate(ds, dp->index, supported, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); + +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + /* Only called for SGMII and 802.3z */ + if (!ds->ops->phylink_mac_link_state) + return -EOPNOTSUPP; + + return ds->ops->phylink_mac_link_state(ds, dp->index, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); + +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_config) + return; + + ds->ops->phylink_mac_config(ds, dp->index, mode, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); + +void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_an_restart) + return; + + ds->ops->phylink_mac_an_restart(ds, dp->index); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); + +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct phy_device *phydev = NULL; + struct dsa_switch *ds = dp->ds; + + if (dsa_is_user_port(ds, dp->index)) + phydev = dp->slave->phydev; + + if (!ds->ops->phylink_mac_link_down) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); + +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_link_up) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + 
ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); + +const struct phylink_mac_ops dsa_port_phylink_mac_ops = { + .validate = dsa_port_phylink_validate, + .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_config = dsa_port_phylink_mac_config, + .mac_an_restart = dsa_port_phylink_mac_an_restart, + .mac_link_down = dsa_port_phylink_mac_link_down, + .mac_link_up = dsa_port_phylink_mac_link_up, +}; + static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) { struct dsa_switch *ds = dp->ds; @@ -495,8 +600,53 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) return 0; } +static int dsa_port_phylink_register(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + struct device_node *port_dn = dp->dn; + int mode, err; + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + + dp->pl_config.dev = ds->dev; + dp->pl_config.type = PHYLINK_DEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), + mode, &dsa_port_phylink_mac_ops); + if (IS_ERR(dp->pl)) { + pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); + return PTR_ERR(dp->pl); + } + + err = phylink_of_phy_connect(dp->pl, port_dn, 0); + if (err && err != -ENODEV) { + pr_err("could not attach to PHY: %d\n", err); + goto err_phy_connect; + } + + rtnl_lock(); + phylink_start(dp->pl); + rtnl_unlock(); + + return 0; + +err_phy_connect: + phylink_destroy(dp->pl); + return err; +} + int dsa_port_link_register_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) + return dsa_port_phylink_register(dp); + + dev_warn(ds->dev, + "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); + if (of_phy_is_fixed_link(dp->dn)) return dsa_port_fixed_link_register_of(dp); else @@ -505,6 +655,16 @@ int dsa_port_link_register_of(struct dsa_port *dp) void dsa_port_link_unregister_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) { + rtnl_lock(); + phylink_disconnect_phy(dp->pl); + rtnl_unlock(); + phylink_destroy(dp->pl); + return; + } + if (of_phy_is_fixed_link(dp->dn)) of_phy_deregister_fixed_link(dp->dn); else diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8157be7e162d..614c38ece104 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,7 +22,7 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(struct net_device *dev); +static bool dsa_slave_dev_check(const struct net_device *dev); /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) @@ -301,6 +301,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, trans); break; + case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: + ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, trans); + break; default: ret = -EOPNOTSUPP; break; @@ -311,7 +314,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans) + struct switchdev_trans *trans, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; @@ -323,6 +327,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, 
SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -333,6 +339,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), trans); break; @@ -352,6 +360,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -361,6 +371,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: @@ -423,6 +435,8 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; + DSA_SKB_CB(skb)->clone = clone; + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) return; @@ -460,6 +474,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) u64_stats_update_end(&s->syncp); DSA_SKB_CB(skb)->deferred_xmit = false; + DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the * switch driver @@ -930,23 +945,42 @@ static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type, return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false); } +static LIST_HEAD(dsa_slave_block_cb_list); + static int dsa_slave_setup_tc_block(struct net_device *dev, - struct tc_block_offload *f) + struct flow_block_offload *f) { + struct flow_block_cb *block_cb; tc_setup_cb_t *cb; - if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_slave_setup_tc_block_cb_ig; - else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_slave_setup_tc_block_cb_eg; else return -EOPNOTSUPP; + f->driver_block_list = &dsa_slave_block_cb_list; + switch (f->command) { - case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); - case TC_BLOCK_UNBIND: - tcf_block_cb_unregister(f->block, cb, dev); + case FLOW_BLOCK_BIND: + if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list)) + return -EBUSY; + + block_cb = flow_block_cb_alloc(f->net, cb, dev, dev, NULL); + if (IS_ERR(block_cb)) + return PTR_ERR(block_cb); + + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list); + return 0; + case FLOW_BLOCK_UNBIND: + block_cb = flow_block_cb_lookup(f, cb, dev); + if (!block_cb) + return -ENOENT; + + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; @@ -1160,98 +1194,6 @@ static struct device_type dsa_type = { .name = "dsa", }; -static void dsa_slave_phylink_validate(struct net_device *dev, - unsigned long *supported, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_validate) - return; - - ds->ops->phylink_validate(ds, dp->index, supported, state); -} - -static int dsa_slave_phylink_mac_link_state(struct net_device *dev, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - /* Only called for 
SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; - - return ds->ops->phylink_mac_link_state(ds, dp->index, state); -} - -static void dsa_slave_phylink_mac_config(struct net_device *dev, - unsigned int mode, - const struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_config) - return; - - ds->ops->phylink_mac_config(ds, dp->index, mode, state); -} - -static void dsa_slave_phylink_mac_an_restart(struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - -static void dsa_slave_phylink_mac_link_down(struct net_device *dev, - unsigned int mode, - phy_interface_t interface) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); -} - -static void dsa_slave_phylink_mac_link_up(struct net_device *dev, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); -} - -static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = { - .validate = dsa_slave_phylink_validate, - .mac_link_state = dsa_slave_phylink_mac_link_state, - .mac_config = dsa_slave_phylink_mac_config, - .mac_an_restart = dsa_slave_phylink_mac_an_restart, - .mac_link_down = dsa_slave_phylink_mac_link_down, - .mac_link_up = dsa_slave_phylink_mac_link_up, -}; - void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); @@ -1299,8 +1241,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) if (mode < 0) mode = PHY_INTERFACE_MODE_NA; - dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode, - &dsa_slave_phylink_mac_ops); + dp->pl_config.dev = &slave_dev->dev; + dp->pl_config.type = PHYLINK_NETDEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, + &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { netdev_err(slave_dev, "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); @@ -1494,7 +1439,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(struct net_device *dev) +static bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } @@ -1565,19 +1510,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, return NOTIFY_DONE; } -static int -dsa_slave_switchdev_port_attr_set_event(struct net_device *netdev, - struct switchdev_notifier_port_attr_info *port_attr_info) -{ - int err; - - err = dsa_slave_port_attr_set(netdev, port_attr_info->attr, - port_attr_info->trans); - - port_attr_info->handled = true; - return notifier_from_errno(err); -} - struct dsa_switchdev_event_work { struct work_struct work; struct switchdev_notifier_fdb_info fdb_info; @@ -1652,13 +1584,18 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, { struct 
net_device *dev = switchdev_notifier_info_to_dev(ptr); struct dsa_switchdev_event_work *switchdev_work; + int err; + + if (event == SWITCHDEV_PORT_ATTR_SET) { + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); + } if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; - if (event == SWITCHDEV_PORT_ATTR_SET) - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return NOTIFY_BAD; @@ -1688,41 +1625,28 @@ err_fdb_work_init: return NOTIFY_BAD; } -static int -dsa_slave_switchdev_port_obj_event(unsigned long event, - struct net_device *netdev, - struct switchdev_notifier_port_obj_info *port_obj_info) -{ - int err = -EOPNOTSUPP; - - switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: - err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, - port_obj_info->trans); - break; - case SWITCHDEV_PORT_OBJ_DEL: - err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); - break; - } - - port_obj_info->handled = true; - return notifier_from_errno(err); -} - static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - - if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + int err; switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_ADD: + err = switchdev_handle_port_obj_add(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_add); + return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: - return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + err = switchdev_handle_port_obj_del(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_del); + return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); } return NOTIFY_DONE; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 65a35e976d7b..6ebbd799c4eb 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, - struct packet_type *pt, u16 *tpid, u16 *tci) +/* In the DSA packet_type handler, skb->data points in the middle of the VLAN + * tag, after tpid and before tci. This is because so far, ETH_HLEN + * (DMAC, SMAC, EtherType) bytes were pulled. + * There are 2 bytes of VLAN tag left in skb->data, and upper + * layers expect the 'real' EtherType to be consumed as well. + * Coincidentally, a VLAN header is also of the same size as + * the number of bytes that need to be pulled. 
+ * + * skb_mac_header skb->data + * | | + * v v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+-------+-------+-------+ + * | Destination MAC | Source MAC | TPID | TCI | EType | + * +-----------------------+-----------------------+-------+-------+-------+ + * ^ | | + * |<--VLAN_HLEN-->to <---VLAN_HLEN---> + * from | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ ^ + * (now part of | | + * skb->head) skb_mac_header skb->data + */ +struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) { - struct vlan_ethhdr *tag; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) - return NULL; + u8 *from = skb_mac_header(skb); + u8 *dest = from + VLAN_HLEN; - tag = vlan_eth_hdr(skb); - *tpid = ntohs(tag->h_vlan_proto); - *tci = ntohs(tag->h_vlan_TCI); - - /* skb->data points in the middle of the VLAN tag, - * after tpid and before tci. This is because so far, - * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - */ - skb_pull_rcsum(skb, VLAN_HLEN); + memmove(dest, from, ETH_HLEN - VLAN_HLEN); + skb_pull(skb, VLAN_HLEN); + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + skb_pull_rcsum(skb, ETH_HLEN); return skb; } -EXPORT_SYMBOL_GPL(dsa_8021q_rcv); +EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); static const struct dsa_device_ops dsa_8021q_netdev_ops = { .name = "8021q", diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d43737e6c3fb..1d96c9d4a8e9 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -13,6 +13,8 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) const struct ethhdr *hdr = eth_hdr(skb); u64 dmac = ether_addr_to_u64(hdr->h_dest); + if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META) + return false; if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) == SJA1105_LINKLOCAL_FILTER_A) return true; @@ -22,15 +24,61 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) return false; } +struct sja1105_meta { + u64 tstamp; + u64 dmac_byte_4; + u64 dmac_byte_3; + u64 source_port; + u64 switch_id; +}; + +static void sja1105_meta_unpack(const struct sk_buff *skb, + struct sja1105_meta *meta) +{ + u8 *buf = skb_mac_header(skb) + ETH_HLEN; + + /* UM10944.pdf section 4.2.17 AVB Parameters: + * Structure of the meta-data follow-up frame. + * It is in network byte order, so there are no quirks + * while unpacking the meta frame. + * + * Also SJA1105 E/T only populates bits 23:0 of the timestamp + * whereas P/Q/R/S does 32 bits. Since the structure is the + * same and the E/T puts zeroes in the high-order byte, use + * a unified unpacking command for both device series. 
+ */ + packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0); + packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0); + packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0); + packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0); + packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); +} + +static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) +{ + const struct ethhdr *hdr = eth_hdr(skb); + u64 smac = ether_addr_to_u64(hdr->h_source); + u64 dmac = ether_addr_to_u64(hdr->h_dest); + + if (smac != SJA1105_META_SMAC) + return false; + if (dmac != SJA1105_META_DMAC) + return false; + if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META) + return false; + return true; +} + /* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it, and if we can, annotate skb->cb with how we - * plan to do that, so we don't need to check again in the rcv function. + * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { + if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + return true; if (sja1105_is_link_local(skb)) return true; - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_is_meta_frame(skb)) return true; return false; } @@ -62,25 +110,152 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static void sja1105_transfer_meta(struct sk_buff *skb, + const struct sja1105_meta *meta) +{ + struct ethhdr *hdr = eth_hdr(skb); + + hdr->h_dest[3] = meta->dmac_byte_3; + hdr->h_dest[4] = meta->dmac_byte_4; + SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; +} + +/* This is a simple state machine which follows the hardware mechanism of + * generating RX timestamps: + * + * After each timestampable skb (all traffic for which send_meta1 and + * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame + * containing a partial timestamp is immediately generated by the switch and + * sent as a follow-up to the link-local frame on the CPU port. + * + * The meta frames have no unique identifier (such as sequence number) by which + * one may pair them to the correct timestampable frame. + * Instead, the switch has internal logic that ensures no frames are sent on + * the CPU port between a link-local timestampable frame and its corresponding + * meta follow-up. It also ensures strict ordering between ports (lower ports + * have higher priority towards the CPU port). For this reason, a per-port + * data structure is not needed/desirable. + * + * This function pairs the link-local frame with its partial timestamp from the + * meta follow-up frame. The full timestamp will be reconstructed later in a + * work queue. + */ +static struct sk_buff +*sja1105_rcv_meta_state_machine(struct sk_buff *skb, + struct sja1105_meta *meta, + bool is_link_local, + bool is_meta) +{ + struct sja1105_port *sp; + struct dsa_port *dp; + + dp = dsa_slave_to_port(skb->dev); + sp = dp->priv; + + /* Step 1: A timestampable frame was received. + * Buffer it until we get its meta frame. + */ + if (is_link_local && sp->data->hwts_rx_en) { + spin_lock(&sp->data->meta_lock); + /* Was this a link-local frame instead of the meta + * that we were expecting? + */ + if (sp->data->stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Expected meta frame, is %12llx " + "in the DSA master multicast filter?\n", + SJA1105_META_DMAC); + } + + /* Hold a reference to avoid dsa_switch_rcv + * from freeing the skb. 
+ */ + sp->data->stampable_skb = skb_get(skb); + spin_unlock(&sp->data->meta_lock); + + /* Tell DSA we got nothing */ + return NULL; + + /* Step 2: The meta frame arrived. + * Time to take the stampable skb out of the closet, annotate it + * with the partial timestamp, and pretend that we received it + * just now (basically masquerade the buffered frame as the meta + * frame, which serves no further purpose). + */ + } else if (is_meta) { + struct sk_buff *stampable_skb; + + spin_lock(&sp->data->meta_lock); + + stampable_skb = sp->data->stampable_skb; + sp->data->stampable_skb = NULL; + + /* Was this a meta frame instead of the link-local + * that we were expecting? + */ + if (!stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Unexpected meta frame\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + if (stampable_skb->dev != skb->dev) { + dev_err_ratelimited(dp->ds->dev, + "Meta frame on wrong port\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + /* Free the meta frame and give DSA the buffered stampable_skb + * for further processing up the network stack. + */ + kfree_skb(skb); + + skb = skb_copy(stampable_skb, GFP_ATOMIC); + if (!skb) { + dev_err_ratelimited(dp->ds->dev, + "Failed to copy stampable skb\n"); + return NULL; + } + sja1105_transfer_meta(skb, meta); + /* The cached copy will be freed now */ + skb_unref(stampable_skb); + + spin_unlock(&sp->data->meta_lock); + } + + return skb; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - struct ethhdr *hdr = eth_hdr(skb); - u64 source_port, switch_id; - struct sk_buff *nskb; + struct sja1105_meta meta = {0}; + int source_port, switch_id; + struct vlan_ethhdr *hdr; u16 tpid, vid, tci; + bool is_link_local; bool is_tagged; + bool is_meta; - nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci); - is_tagged = (nskb && tpid == ETH_P_SJA1105); - - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - vid = tci & VLAN_VID_MASK; + hdr = vlan_eth_hdr(skb); + tpid = ntohs(hdr->h_vlan_proto); + is_tagged = (tpid == ETH_P_SJA1105); + is_link_local = sja1105_is_link_local(skb); + is_meta = sja1105_is_meta_frame(skb); skb->offload_fwd_mark = 1; - if (sja1105_is_link_local(skb)) { + if (is_tagged) { + /* Normal traffic path. */ + tci = ntohs(hdr->h_vlan_TCI); + vid = tci & VLAN_VID_MASK; + source_port = dsa_8021q_rx_source_port(vid); + switch_id = dsa_8021q_rx_switch_id(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of * the incl_srcpt options. @@ -90,10 +265,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, /* Clear the DMAC bytes that were mangled by the switch */ hdr->h_dest[3] = 0; hdr->h_dest[4] = 0; + } else if (is_meta) { + sja1105_meta_unpack(skb, &meta); + source_port = meta.source_port; + switch_id = meta.switch_id; } else { - /* Normal traffic path. */ - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); + return NULL; } skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); @@ -106,10 +283,10 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). 
*/ if (is_tagged) - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN, - ETH_HLEN - VLAN_HLEN); + skb = dsa_8021q_remove_header(skb); - return skb; + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, + is_meta); } static struct dsa_device_ops sja1105_netdev_ops = { diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 20b0bcb7e9e3..17374afee28f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -545,17 +545,10 @@ unsigned char * __weak arch_get_platform_mac_address(void) int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { - const unsigned char *addr; - struct device_node *dp; + const unsigned char *addr = NULL; - if (dev_is_pci(dev)) - dp = pci_device_to_OF_node(to_pci_dev(dev)); - else - dp = dev->of_node; - - addr = NULL; - if (dp) - addr = of_get_mac_address(dp); + if (dev->of_node) + addr = of_get_mac_address(dev->of_node); if (IS_ERR_OR_NULL(addr)) addr = arch_get_platform_mac_address(); @@ -563,6 +556,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return -ENODEV; ether_addr_copy(mac_addr, addr); + return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 15c72065df79..f0f9b493c47b 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -227,9 +227,13 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_port *master; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - skb->dev = master->dev; - hsr_forward_skb(skb, master); - + if (master) { + skb->dev = master->dev; + hsr_forward_skb(skb, master); + } else { + atomic_long_inc(&dev->tx_dropped); + dev_kfree_skb_any(skb); + } return NETDEV_TX_OK; } @@ -344,27 +348,26 @@ static void hsr_announce(struct timer_list *t) rcu_read_unlock(); } -/* According to comments in the declaration of struct net_device, this function - * is "Called from unregister, can be used to call free_netdev". Ok then... 
- */ -static void hsr_dev_destroy(struct net_device *hsr_dev) +void hsr_dev_destroy(struct net_device *hsr_dev) { struct hsr_priv *hsr; struct hsr_port *port; + struct hsr_port *tmp; hsr = netdev_priv(hsr_dev); hsr_debugfs_term(hsr); - rtnl_lock(); - hsr_for_each_port(hsr, port) + list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); - rtnl_unlock(); del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); synchronize_rcu(); + + hsr_del_self_node(&hsr->self_node_db); + hsr_del_nodes(&hsr->node_db); } static const struct net_device_ops hsr_device_ops = { @@ -391,7 +394,6 @@ void hsr_dev_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; dev->needs_free_netdev = true; - dev->priv_destructor = hsr_dev_destroy; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | @@ -428,6 +430,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], { struct hsr_priv *hsr; struct hsr_port *port; + struct hsr_port *tmp; int res; hsr = netdev_priv(hsr_dev); @@ -492,10 +495,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], return 0; fail: - hsr_for_each_port(hsr, port) + list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); err_add_port: - hsr_del_node(&hsr->self_node_db); + hsr_del_self_node(&hsr->self_node_db); return res; } diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 6d7759c4f5f9..d0fa6b0696d2 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -14,6 +14,7 @@ void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version); +void hsr_dev_destroy(struct net_device *hsr_dev); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); bool is_hsr_master(struct net_device *dev); int hsr_get_max_mtu(struct hsr_priv *hsr); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 2d7a19750436..292be446007b 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -104,7 +104,7 @@ int hsr_create_self_node(struct list_head *self_node_db, return 0; } -void hsr_del_node(struct list_head *self_node_db) +void hsr_del_self_node(struct list_head *self_node_db) { struct hsr_node *node; @@ -117,6 +117,15 @@ void hsr_del_node(struct list_head *self_node_db) } } +void hsr_del_nodes(struct list_head *node_db) +{ + struct hsr_node *node; + struct hsr_node *tmp; + + list_for_each_entry_safe(node, tmp, node_db, mac_list) + kfree(node); +} + /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. 
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index a3bdcdab469d..89a3ce38151d 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -12,7 +12,8 @@ struct hsr_node; -void hsr_del_node(struct list_head *self_node_db); +void hsr_del_self_node(struct list_head *self_node_db); +void hsr_del_nodes(struct list_head *node_db); struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], u16 seq_out); struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8f8337f893ba..160edd24de4e 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -69,6 +69,12 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, return hsr_dev_finalize(dev, link, multicast_spec, hsr_version); } +static void hsr_dellink(struct net_device *hsr_dev, struct list_head *head) +{ + hsr_dev_destroy(hsr_dev); + unregister_netdevice_queue(hsr_dev, head); +} + static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr; @@ -113,6 +119,7 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = { .priv_size = sizeof(struct hsr_priv), .setup = hsr_dev_setup, .newlink = hsr_newlink, + .dellink = hsr_dellink, .fill_info = hsr_fill_info, }; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 88b6705ded83..ee561297d8a7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -193,4 +193,5 @@ void hsr_del_port(struct hsr_port *port) if (port != master) dev_put(port->dev); + kfree(port); } diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 9133e3cede77..e4aba5d485be 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -74,7 +74,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, key.src = *src; key.dst = *dst; - q = inet_frag_find(&ieee802154_lowpan->frags, &key); + q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; @@ -134,7 +134,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { @@ -321,23 +321,18 @@ err: static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", - .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", - .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", - .data = &init_net.ieee802154_lowpan.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -372,17 +367,17 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) if (table == NULL) goto err_alloc; - table[0].data = &ieee802154_lowpan->frags.high_thresh; - table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; - table[1].data = &ieee802154_lowpan->frags.low_thresh; - table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; - table[2].data = &ieee802154_lowpan->frags.timeout; - /* Don't export sysctls to unprivileged users */ if 
(net->user_ns != &init_user_ns) table[0].procname = NULL; } + table[0].data = &ieee802154_lowpan->fqdir->high_thresh; + table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; + table[1].data = &ieee802154_lowpan->fqdir->low_thresh; + table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; + table[2].data = &ieee802154_lowpan->fqdir->timeout; + hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); if (hdr == NULL) goto err_reg; @@ -449,32 +444,42 @@ static int __net_init lowpan_frags_init_net(struct net *net) net_ieee802154_lowpan(net); int res; - ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; - ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - ieee802154_lowpan->frags.f = &lowpan_frags; - res = inet_frags_init_net(&ieee802154_lowpan->frags); + res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; + + ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = lowpan_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); return res; } +static void __net_exit lowpan_frags_pre_exit_net(struct net *net) +{ + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + fqdir_pre_exit(ieee802154_lowpan->fqdir); +} + static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { - .init = lowpan_frags_init_net, - .exit = lowpan_frags_exit_net, + .init = lowpan_frags_init_net, + .pre_exit = lowpan_frags_pre_exit_net, + .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) @@ -539,7 +544,7 @@ err_sysctl: void lowpan_net_frag_exit(void) { - inet_frags_fini(&lowpan_frags); lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); + inet_frags_fini(&lowpan_frags); } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 000a61994c8f..d57ecfaf89d4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o + metrics.o netlink.o nexthop.o obj-$(CONFIG_BPFILTER) += bpfilter/ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 52bdb881a506..ed2301ef872e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -784,10 +784,8 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_getname); -int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +int inet_send_prepare(struct sock *sk) { - struct sock *sk = sock->sk; - sock_rps_record_flow(sk); /* We may need to bind the socket. 
*/ @@ -795,7 +793,19 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->sendmsg(sk, msg, size); + return 0; +} +EXPORT_SYMBOL_GPL(inet_send_prepare); + +int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, + sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); @@ -804,11 +814,7 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, { struct sock *sk = sock->sk; - sock_rps_record_flow(sk); - - /* We may need to bind the socket. */ - if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && - inet_autobind(sk)) + if (unlikely(inet_send_prepare(sk))) return -EAGAIN; if (sk->sk_prot->sendpage) @@ -817,6 +823,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(inet_sendpage); +INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -827,8 +835,9 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); - err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, &addr_len); + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, + sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 9c3afd550612..974179b3b314 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -590,8 +590,7 @@ static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ah_type, AF_INET); } module_init(ah4_init); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c6bd0f7a020a..a4b5bd4d2c89 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -62,6 +62,11 @@ #include <net/net_namespace.h> #include <net/addrconf.h> +#define IPV6ONLY_FLAGS \ + (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ + IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ + IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -190,7 +195,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); @@ -296,8 +302,8 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { - struct in_ifaddr *ifa; struct net_device *dev; + struct in_ifaddr *ifa; ASSERT_RTNL(); @@ -307,7 +313,7 @@ static void inetdev_destroy(struct in_device *in_dev) ip_mc_destroy_dev(in_dev); - while ((ifa = in_dev->ifa_list) != NULL) { + while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); 
} @@ -323,30 +329,35 @@ static void inetdev_destroy(struct in_device *in_dev) int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { + const struct in_ifaddr *ifa; + rcu_read_lock(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } - } endfor_ifa(in_dev); + } rcu_read_unlock(); return 0; } -static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 portid) +static void __inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa, *ifa1 = *ifap; - struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *ifa, *ifa1; + struct in_ifaddr *last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); + ifa1 = rtnl_dereference(*ifap); + last_prim = rtnl_dereference(in_dev->ifa_list); if (in_dev->dead) goto no_promotions; @@ -355,9 +366,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr **ifap1 = &ifa1->ifa_next; + struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; - while ((ifa = *ifap1) != NULL) { + while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -390,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ - for (ifa = promote; ifa; ifa = ifa->ifa_next) { + for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); @@ -416,19 +427,25 @@ no_promotions: blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { - struct in_ifaddr *next_sec = promote->ifa_next; + struct in_ifaddr *next_sec; + next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { - prev_prom->ifa_next = promote->ifa_next; - promote->ifa_next = last_prim->ifa_next; - last_prim->ifa_next = promote; + struct in_ifaddr *last_sec; + + rcu_assign_pointer(prev_prom->ifa_next, next_sec); + + last_sec = rtnl_dereference(last_prim->ifa_next); + rcu_assign_pointer(promote->ifa_next, last_sec); + rcu_assign_pointer(last_prim->ifa_next, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; + ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -440,7 +457,8 @@ no_promotions: inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -453,9 +471,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { + struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, 
**ifap, **last_primary; struct in_validator_info ivi; + struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); @@ -468,8 +487,13 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + /* Don't set IPv6 only flags to IPv4 addresses */ + ifa->ifa_flags &= ~IPV6ONLY_FLAGS; + + ifap = &in_dev->ifa_list; + ifa1 = rtnl_dereference(*ifap); + + while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; @@ -485,6 +509,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + + ifap = &ifa1->ifa_next; + ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh @@ -510,8 +537,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifap = last_primary; } - ifa->ifa_next = *ifap; - *ifap = ifa; + rcu_assign_pointer(ifa->ifa_next, *ifap); + rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); @@ -576,12 +603,14 @@ EXPORT_SYMBOL(inetdev_by_index); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { + struct in_ifaddr *ifa; + ASSERT_RTNL(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; - } endfor_ifa(in_dev); + } return NULL; } @@ -609,10 +638,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; - struct in_ifaddr *ifa, **ifap; + struct in_ifaddr *ifa; + int err = -EINVAL; ASSERT_RTNL(); @@ -629,7 +660,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -717,15 +748,19 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap; + struct in_ifaddr __rcu **ifap; + struct in_ifaddr *tmp; - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &(*ifap)->ifa_next) { - if (*ifap == ifa) { + ifap = &ifa->ifa_dev->ifa_list; + tmp = rtnl_dereference(*ifap); + while (tmp) { + if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } + ifap = &tmp->ifa_next; + tmp = rtnl_dereference(*ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -869,13 +904,12 @@ errout: static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap; + struct in_ifaddr *ifa1; if (!ifa->ifa_local) return NULL; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -970,8 +1004,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; + struct in_ifaddr 
__rcu **ifap = NULL; struct in_device *in_dev; - struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; @@ -1042,7 +1076,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1055,7 +1091,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; @@ -1204,7 +1241,7 @@ out: static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; @@ -1214,7 +1251,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s if (!in_dev) goto out; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (!buf) { done += size; continue; @@ -1242,18 +1279,24 @@ out: static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; - } endfor_ifa(in_dev); + } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { + const struct in_ifaddr *ifa; __be32 addr = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net = dev_net(dev); int master_idx; @@ -1263,8 +1306,13 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!in_dev) goto no_in_dev; - for_primary_ifa(in_dev) { - if (ifa->ifa_scope > scope) + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; @@ -1272,7 +1320,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) } if (!addr) addr = ifa->ifa_local; - } endfor_ifa(in_dev); + } if (addr) goto out_unlock; @@ -1317,13 +1365,20 @@ EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { - int same = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; + const struct in_ifaddr *ifa; __be32 addr = 0; + int same = 0; + + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); - for_ifa(in_dev) { if (!addr && (local == ifa->ifa_local || !local) && - ifa->ifa_scope <= scope) { + min_scope <= scope) { addr = ifa->ifa_local; if (same) break; @@ -1338,7 +1393,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 
dst, if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ - if (ifa->ifa_scope <= scope) { + if (min_scope <= scope) { addr = ifa->ifa_local; break; } @@ -1346,7 +1401,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, same = 0; } } - } endfor_ifa(in_dev); + } return same ? addr : 0; } @@ -1420,7 +1475,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) struct in_ifaddr *ifa; int named = 0; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); @@ -1450,10 +1505,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; - for (ifa = in_dev->ifa_list; ifa; - ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, @@ -1723,15 +1777,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, int ip_idx = 0; int err; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (ip_idx < s_ip_idx) { + ip_idx++; continue; - + } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; } err = 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b9ae95576084..5c967764041f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -33,8 +33,6 @@ struct esp_output_extra { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. 
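/* [Editorial sketch, not part of the diff: the devinet.c hunks above replace
 * the old for_ifa()/for_primary_ifa() macros and open-coded ifa_list walks
 * with the in_dev_for_each_ifa_rcu()/_rtnl() iterators from
 * <linux/inetdevice.h>.  A minimal caller, with an invented function name,
 * might look like this; note that secondary addresses must now be skipped
 * explicitly, the new iterators do not filter them.]
 */
#include <linux/inetdevice.h>

static __be32 example_first_primary_addr(const struct net_device *dev, int scope)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev); /* caller holds rcu_read_lock() */
	const struct in_ifaddr *ifa;

	if (!in_dev)
		return 0;

	in_dev_for_each_ifa_rcu(ifa, in_dev) {
		if (ifa->ifa_flags & IFA_F_SECONDARY)	/* primary addresses only */
			continue;
		if (ifa->ifa_scope <= scope)
			return ifa->ifa_local;
	}
	return 0;
}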
* @@ -506,7 +504,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -788,28 +786,6 @@ out: return err; } -static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - switch (x->props.mode) { - case XFRM_MODE_TRANSPORT: - case XFRM_MODE_BEET: - net_adj = sizeof(struct iphdr); - break; - case XFRM_MODE_TUNNEL: - net_adj = 0; - break; - default: - BUG(); - } - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); @@ -1035,7 +1011,6 @@ static const struct xfrm_type esp_type = .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp_init_state, .destructor = esp_destroy, - .get_mtu = esp4_get_mtu, .input = esp_input, .output = esp_output, }; @@ -1066,8 +1041,7 @@ static void __exit esp4_fini(void) { if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp_type, AF_INET); } module_init(esp4_init); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 2e5e377f50a1..0e4a7cf6bc87 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -312,9 +312,7 @@ static int __init esp4_offload_init(void) static void __exit esp4_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e54c2bcbb465..317339cd7f03 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -39,6 +39,7 @@ #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> @@ -188,7 +189,7 @@ int fib_unmerge(struct net *net) return 0; } -static void fib_flush(struct net *net) +void fib_flush(struct net *net) { int flushed = 0; unsigned int h; @@ -230,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { - if (!dev || dev == res.fi->fib_dev) + struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); + + if (!dev || dev == nhc->nhc_dev) ret = res.type; } } @@ -317,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) #ifdef CONFIG_IP_ROUTE_MULTIPATH int ret; - for (ret = 0; ret < fi->fib_nhs; ret++) { - struct fib_nh *nh = &fi->fib_nh[ret]; + for (ret = 0; ret < fib_info_num_path(fi); ret++) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); - if (nh->fib_nh_dev == dev) { + if (nhc->nhc_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].fib_nh_dev == dev) + if 
(fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif @@ -536,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { - struct in_ifaddr *ifa; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; + *colon = ':'; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; + } + rcu_read_unlock(); + if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; @@ -641,6 +652,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, @@ -659,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, @@ -796,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (err < 0) goto errout; break; + case RTA_NH_ID: + cfg->fc_nh_id = nla_get_u32(attr); + break; + } + } + + if (cfg->fc_nh_id) { + if (cfg->fc_oif || cfg->fc_gw_family || + cfg->fc_encap || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + return -EINVAL; } } @@ -822,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + err = -EINVAL; + goto errout; + } + tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); @@ -881,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & RTM_F_CLONED) + filter->dump_routes = false; + else + filter->dump_exceptions = false; filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); filter->flags = rtm->rtm_flags; @@ -931,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct fib_dump_filter filter = { .dump_routes = true, + .dump_exceptions = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct fib_dump_filter filter = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; @@ -950,8 +987,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } - /* fib entries are never clones and ipv4 does not use prefix flag */ - if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED)) + /* ipv4 does not use prefix flag */ + if (filter.flags & RTM_F_PREFIX) return skb->len; if (filter.table_id) { @@ -1172,8 +1209,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) * * Scan address list to be sure that 
addresses are really gone. */ - - for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; @@ -1241,6 +1278,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } } + rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) @@ -1410,6 +1448,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); + struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { @@ -1424,9 +1463,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_UP: - for_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); - } endfor_ifa(in_dev); + } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 7945f0534db7..a68b5e21ec51 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/list.h> #include <net/ip_fib.h> +#include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a38e86b98e4f..b43a7ba5c6a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -27,6 +27,7 @@ #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/fib_rules.h> struct fib4_rule { @@ -141,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; - if (result->fi) - dev = result->fi->fib_dev; + if (result->fi) { + struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); + + dev = nhc->nhc_dev; + } /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index bfa49a88d03a..2db089e10ba0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -38,6 +38,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> +#include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> @@ -56,18 +57,21 @@ static unsigned int fib_info_cnt; #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +/* for_nexthops and change_nexthops only used when nexthop object + * is not set in a fib_info. The logic within can reference fib_nh. 
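/* [Editorial sketch, not part of the diff: with nexthop objects a fib_info no
 * longer necessarily carries an embedded fib_nh array, so fi->fib_nhs and
 * fi->fib_nh[i] accesses become fib_info_num_path() and fib_info_nhc() as in
 * the hunks above.  A device-match helper in that style, name invented:]
 */
#include <net/ip_fib.h>
#include <net/nexthop.h>

static bool example_fib_info_uses_dev(struct fib_info *fi,
				      const struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < fib_info_num_path(fi); i++) {
		const struct fib_nh_common *nhc = fib_info_nhc(fi, i);

		if (nhc->nhc_dev == dev)	/* works for embedded fib_nh and nexthop objects alike */
			return true;
	}
	return false;
}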
+ */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -228,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - change_nexthops(fi) { - fib_nh_release(fi->fib_net, nexthop_nh); - } endfor_nexthops(fi); + if (fi->nh) { + nexthop_put(fi->nh); + } else { + change_nexthops(fi) { + fib_nh_release(fi->fib_net, nexthop_nh); + } endfor_nexthops(fi); + } ip_fib_metrics_put(fi->fib_metrics); @@ -256,22 +264,34 @@ void fib_release_info(struct fib_info *fi) hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); - change_nexthops(fi) { - if (!nexthop_nh->fib_nh_dev) - continue; - hlist_del(&nexthop_nh->nh_hash); - } endfor_nexthops(fi) + if (fi->nh) { + list_del(&fi->nh_list); + } else { + change_nexthops(fi) { + if (!nexthop_nh->fib_nh_dev) + continue; + hlist_del(&nexthop_nh->nh_hash); + } endfor_nexthops(fi) + } fi->fib_dead = 1; fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); } -static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { - const struct fib_nh *onh = ofi->fib_nh; + const struct fib_nh *onh; + + if (fi->nh || ofi->nh) + return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; + + if (ofi->fib_nhs == 0) + return 0; for_nexthops(fi) { + onh = fib_info_nh(ofi, nhsel); + if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || @@ -292,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; - - onh++; } endfor_nexthops(fi); return 0; } @@ -307,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) (val >> (DEVINDEX_HASHBITS * 2))) & mask; } -static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, + u32 prefsrc, u32 priority) { - unsigned int mask = (fib_info_hash_size - 1); - unsigned int val = fi->fib_nhs; + unsigned int val = init_val; - val ^= (fi->fib_protocol << 8) | fi->fib_scope; - val ^= (__force u32)fi->fib_prefsrc; - val ^= fi->fib_priority; - for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); - } endfor_nexthops(fi) + val ^= (protocol << 8) | scope; + val ^= prefsrc; + val ^= priority; + + return val; +} + +static unsigned int fib_info_hashfn_result(unsigned int val) +{ + unsigned int mask = (fib_info_hash_size - 1); return (val ^ (val >> 7) ^ (val >> 12)) & mask; } -static struct fib_info *fib_find_info(const struct fib_info *nfi) +static inline unsigned int fib_info_hashfn(struct fib_info *fi) +{ + unsigned int val; + + val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, + fi->fib_scope, (__force u32)fi->fib_prefsrc, + fi->fib_priority); + + if (fi->nh) { + val ^= fib_devindex_hashfn(fi->nh->id); + } else { + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->fib_nh_oif); + } endfor_nexthops(fi) + } + + return 
fib_info_hashfn_result(val); +} + +/* no metrics, only nexthop id */ +static struct fib_info *fib_find_info_nh(struct net *net, + const struct fib_config *cfg) +{ + struct hlist_head *head; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + cfg->fc_protocol, cfg->fc_scope, + (__force u32)cfg->fc_prefsrc, + cfg->fc_priority); + hash = fib_info_hashfn_result(hash); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, head, fib_hash) { + if (!net_eq(fi->fib_net, net)) + continue; + if (!fi->nh || fi->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_protocol == fi->fib_protocol && + cfg->fc_scope == fi->fib_scope && + cfg->fc_prefsrc == fi->fib_prefsrc && + cfg->fc_priority == fi->fib_priority && + cfg->fc_type == fi->fib_type && + cfg->fc_table == fi->fib_tb_id && + !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) + return fi; + } + + return NULL; +} + +static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head; struct fib_info *fi; @@ -344,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && - (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + nh_comp(fi, nfi) == 0) return fi; } @@ -386,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - if (fi->fib_nhs) { + if (fi->nh) + payload += nla_total_size(4); /* RTA_NH_ID */ + + if (nhs) { size_t nh_encapsize = 0; - /* Also handles the special case fib_nhs == 1 */ + /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ - for_nexthops(fi) { - if (nh->fib_nh_lws) { + for (i = 0; i < fib_info_num_path(fi); i++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, i); + + if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->fib_nh_lws); + nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } - } endfor_nexthops(fi); + } /* all nexthops are packed in a nested attribute */ - payload += nla_total_size((fi->fib_nhs * nhsize) + - nh_encapsize); + payload += nla_total_size((nhs * nhsize) + nh_encapsize); } @@ -574,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, return nhs; } +/* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; + struct fib_nh *nh; int ret; change_nexthops(fi) { @@ -642,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, } endfor_nexthops(fi); ret = -EINVAL; - if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { + nh = fib_info_nh(fi, 0); + if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { - if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + if 
(cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && - fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; @@ -670,12 +753,13 @@ errout: return ret; } +/* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; - if (fi->fib_nhs < 2) + if (fib_info_num_path(fi) < 2) return; total = 0; @@ -756,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; + if (cfg->fc_nh_id) { + if (fi->nh && cfg->fc_nh_id == fi->nh->id) + return 0; + return 1; + } + if (cfg->fc_oif || cfg->fc_gw_family) { + struct fib_nh *nh = fib_info_nh(fi, 0); + if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, - fi->fib_nh, cfg, extack)) + nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && - cfg->fc_flow != fi->fib_nh->nh_tclassid) + cfg->fc_flow != nh->nh_tclassid) return 1; #endif - if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && - cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && - cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; @@ -1088,15 +1180,13 @@ out: return err; } -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, + struct netlink_ext_ack *extack) { - struct net *net = cfg->fc_nlinfo.nl_net; - u32 table = cfg->fc_table; int err; if (nh->fib_nh_gw_family == AF_INET) - err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else @@ -1187,11 +1277,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } -__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, + unsigned char scope) { - nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, - nh->fib_nh_gw4, - nh->nh_parent->fib_scope); + struct fib_nh *nh; + + if (nhc->nhc_family != AF_INET) + return inet_select_addr(nhc->nhc_dev, 0, scope); + + nh = container_of(nhc, struct fib_nh, nh_common); + nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; @@ -1200,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; - 
struct fib_nh *nh; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; - nh = container_of(nhc, struct fib_nh, nh_common); - if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) - return nh->nh_saddr; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + } - return fib_info_update_nh_saddr(net, nh); + return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) @@ -1241,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, { int err; struct fib_info *fi = NULL; + struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1260,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } + if (cfg->fc_nh_id) { + if (!cfg->fc_mx) { + fi = fib_find_info_nh(net, cfg); + if (fi) { + fi->fib_treeref++; + return fi; + } + } + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + nhs = 0; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); @@ -1295,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, extack); - if (unlikely(IS_ERR(fi->fib_metrics))) { + if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); @@ -1312,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; - change_nexthops(fi) { - nexthop_nh->nh_parent = fi; - } endfor_nexthops(fi) + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = 0; + fi->nh = nh; + } + } else { + change_nexthops(fi) { + nexthop_nh->nh_parent = fi; + } endfor_nexthops(fi) - if (cfg->fc_mp) - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - else - err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + if (cfg->fc_mp) + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, + extack); + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + } if (err != 0) goto failure; @@ -1350,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - if (cfg->fc_scope == RT_SCOPE_HOST) { + if (fi->nh) { + err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); + if (err) + goto failure; + } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. 
*/ @@ -1365,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; - nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); + nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; @@ -1373,7 +1504,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, nexthop_nh, extack); + err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, + cfg->fc_table, cfg->fc_scope, + extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) @@ -1388,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - change_nexthops(fi) { - fib_info_update_nh_saddr(net, nexthop_nh); - if (nexthop_nh->fib_nh_gw_family == AF_INET6) - fi->fib_nh_is_v6 = true; - } endfor_nexthops(fi) + if (!fi->nh) { + change_nexthops(fi) { + fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, + fi->fib_scope); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; + } endfor_nexthops(fi) - fib_rebalance(fi); + fib_rebalance(fi); + } link_it: ofi = fib_find_info(fi); @@ -1416,16 +1552,20 @@ link_it: head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; hlist_add_head(&fi->fib_lhash, head); } - change_nexthops(fi) { - struct hlist_head *head; - unsigned int hash; + if (fi->nh) { + list_add(&fi->nh_list, &nh->fi_list); + } else { + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; - if (!nexthop_nh->fib_nh_dev) - continue; - hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_add_head(&nexthop_nh->nh_hash, head); - } endfor_nexthops(fi) + if (!nexthop_nh->fib_nh_dev) + continue; + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nexthop_nh->nh_hash, head); + } endfor_nexthops(fi) + } spin_unlock_bh(&fib_info_lock); return fi; @@ -1552,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) if (!mp) goto nla_put_failure; + if (unlikely(fi->nh)) { + if (nexthop_mpath_fill_node(skb, fi->nh) < 0) + goto nla_put_failure; + goto mp_end; + } + for_nexthops(fi) { if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) goto nla_put_failure; @@ -1562,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif } endfor_nexthops(fi); +mp_end: nla_nest_end(skb, mp); return 0; @@ -1580,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { + unsigned int nhs = fib_info_num_path(fi); struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1615,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; - if (fi->fib_nhs == 1) { - struct fib_nh *nh = &fi->fib_nh[0]; + + if (fi->nh) { + if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) + goto nla_put_failure; + if (nexthop_is_blackhole(fi->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + } + + if (nhs == 1) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; - if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) + if (fib_nexthop_info(skb, nhc, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid 
&& - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; + } #endif } else { if (fib_add_multipath(skb, fi) < 0) @@ -1709,7 +1870,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh, * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ -static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) +void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; @@ -1745,7 +1906,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev) - nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1754,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + * + * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { @@ -1835,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + struct fib_nh *nh; if (fa->fa_slen != slen) continue; @@ -1856,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].fib_nh_gw4 || - next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) + + nh = fib_info_nh(next_fi, 0); + if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1899,6 +2064,8 @@ out: /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. + * + * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { @@ -1993,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; + if (unlikely(res->fi->nh)) { + nexthop_path_fib_result(res, hash); + return; + } + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { if (!fib_good_nh(nexthop_nh)) @@ -2021,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res, goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1) { + if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 868c74771fa9..2b2b3d291ab0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -338,12 +338,18 @@ static struct tnode *tnode_alloc(int bits) static inline void empty_child_inc(struct key_vector *n) { - ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; + tn_info(n)->empty_children++; + + if (!tn_info(n)->empty_children) + tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { - tn_info(n)->empty_children-- ? 
: tn_info(n)->full_children--; + if (!tn_info(n)->empty_children) + tn_info(n)->full_children--; + + tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) @@ -1449,6 +1455,7 @@ found: fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { +out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif @@ -1457,7 +1464,13 @@ found: } if (fi->fib_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + + if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; + } + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); if (nhc->nhc_flags & RTNH_F_DEAD) @@ -1931,6 +1944,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) return found; } +/* derived from fib_trie_free */ +static void __fib_info_notify_update(struct net *net, struct fib_table *tb, + struct nl_info *info) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct fib_alias *fa; + + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + pn = node_parent(pn); + cindex = get_index(pkey, pn); + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) + continue; + + rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, + info, NLM_F_REPLACE); + + /* call_fib_entry_notifiers will be removed when + * in-kernel notifier is implemented and supported + * for nexthop objects + */ + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + n->key, + KEYLENGTH - fa->fa_slen, fa, + NULL); + } + } +} + +void fib_info_notify_update(struct net *net, struct nl_info *info) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + __fib_info_notify_update(net, tb, info); + } +} + static void fib_leaf_notify(struct net *net, struct key_vector *l, struct fib_table *tb, struct notifier_block *nb) { @@ -2006,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); + int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; - int i, s_i; - if (filter->filter_set) + if (filter->filter_set || + !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; + s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - int err; + struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; + i_fa = 0; + if (tb->tb_id != fa->tb_id) goto next; @@ -2030,29 +2118,49 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, goto next; if ((filter->protocol && - fa->fa_info->fib_protocol != filter->protocol)) + fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && - !fib_info_nh_uses_dev(fa->fa_info, filter->dev)) + !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } - err = 
fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, KEYLENGTH - fa->fa_slen, - fa->fa_tos, fa->fa_info, flags); - if (err < 0) { - cb->args[4] = i; - return err; + if (filter->dump_routes) { + if (!s_fa) { + err = fib_dump_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, + KEYLENGTH - fa->fa_slen, + fa->fa_tos, fi, flags); + if (err < 0) + goto stop; + } + + i_fa++; } + + if (filter->dump_exceptions) { + err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, + &i_fa, s_fa); + if (err < 0) + goto stop; + } + next: i++; } cb->args[4] = i; return skb->len; + +stop: + cb->args[4] = i; + cb->args[5] = i_fa; + return err; } /* rcu_read_lock needs to be hold by caller from readside */ @@ -2634,14 +2742,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->fib_nh_gw4) - flags |= RTF_GATEWAY; + if (fi) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + + if (nhc->nhc_gw.ipv4) + flags |= RTF_GATEWAY; + } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; @@ -2672,7 +2784,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - const struct fib_info *fi = fa->fa_info; + struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); @@ -2685,26 +2797,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_setwidth(seq, 127); - if (fi) + if (fi) { + struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + __be32 gw = 0; + + if (nhc->nhc_gw_family == AF_INET) + gw = nhc->nhc_gw.ipv4; + seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->fib_nh_gw4, flags, 0, 0, + nhc->nhc_dev ? nhc->nhc_dev->name : "*", + prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? 
fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); - else + } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); - + } seq_pad(seq, '\n'); } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 293acfb36376..44bfeecac33e 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { if (!skb_checksum_simple_validate(skb)) { - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } else if (csum_err) { *csum_err = true; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7c857c72aad1..1510e951f451 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -201,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return *this_cpu_ptr(net->ipv4.icmp_sk); + return this_cpu_read(*net->ipv4.icmp_sk); } /* Called with BH disabled */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a57f0d69eadb..180f6896b98b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -332,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; - } endfor_ifa(in_dev); + } return htonl(INADDR_ANY); } @@ -1228,12 +1229,8 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { - im->tomb = pmc->tomb; - pmc->tomb = NULL; - - im->sources = pmc->sources; - pmc->sources = NULL; - + swap(im->tomb, pmc->tomb); + swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } else { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7fd6db3fe366..f5c163d4771b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -649,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) EXPORT_SYMBOL(inet_rtx_syn_ack); /* return true if req was found in the ehash table */ -static bool reqsk_queue_unlink(struct request_sock_queue *queue, - struct request_sock *req) +static bool reqsk_queue_unlink(struct request_sock *req) { struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; @@ -669,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { - if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + if (reqsk_queue_unlink(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ce6969896f5..d666756be5f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -106,48 +106,90 @@ int inet_frags_init(struct inet_frags *f) if (!f->frags_cachep) return -ENOMEM; + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - /* We must wait that all inet_frag_destroy_rcu() have completed. 
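/* [Editorial sketch, not part of the diff: the inet_fragment.c change above
 * replaces the rcu_barrier() in inet_frags_fini() with a reference count plus
 * a completion; every per-netns fqdir holds a reference on its inet_frags,
 * and module unload drops its own reference and waits for the last user.
 * The bare pattern, with invented names:]
 */
#include <linux/refcount.h>
#include <linux/completion.h>

struct example_owner {
	refcount_t refcnt;		/* 1 for the module itself + 1 per live user */
	struct completion completion;	/* completed when refcnt drops to zero */
};

static void example_owner_put(struct example_owner *o)
{
	if (refcount_dec_and_test(&o->refcnt))
		complete(&o->completion);
}

static void example_owner_unload(struct example_owner *o)
{
	example_owner_put(o);			/* drop the module's own reference */
	wait_for_completion(&o->completion);	/* sleep until the last user is gone */
}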
*/ - rcu_barrier(); + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; + int count; - /* If we can not cancel the timer, it means this frag_queue - * is already disappearing, we have nothing to do. - * Otherwise, we own a refcount until the end of this function. - */ - if (!del_timer(&fq->timer)) - return; + count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; - refcount_dec(&fq->refcnt); + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } spin_unlock_bh(&fq->lock); - inet_frag_put(fq); + if (refcount_sub_and_test(count, &fq->refcnt)) + inet_frag_destroy(fq); +} + +static void fqdir_work_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + struct inet_frags *f = fqdir->f; + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? :) + */ + rcu_barrier(); + + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + kfree(fqdir); } -void inet_frags_exit_net(struct netns_frags *nf) +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { - nf->high_thresh = 0; /* prevent creation of new frags */ + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; - rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); + if (!fqdir) + return -ENOMEM; + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; + return 0; } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_init); + +void fqdir_exit(struct fqdir *fqdir) +{ + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(system_wq, &fqdir->destroy_work); +} +EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -155,11 +197,23 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - struct netns_frags *nf = fq->net; + struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; - rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); - refcount_dec(&fq->refcnt); + rcu_read_lock(); + /* The RCU read lock provides a memory barrier + * guaranteeing that if fqdir->dead is false then + * the hash table destruction will not start until + * after we unlock. Paired with inet_frags_exit_net(). 
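/* [Editorial sketch, not part of the diff: a fragmentation user now allocates
 * a per-netns struct fqdir via fqdir_init() instead of embedding a
 * netns_frags; the ipv4 conversion later in this patch has this shape.  All
 * names and threshold values below are invented for illustration, and
 * example_frags is assumed to have been set up with inet_frags_init().]
 */
#include <net/inet_frag.h>

static struct inet_frags example_frags;	/* .constructor, .frag_expire, .rhash_params, ... */

static int __net_init example_frags_init_net(struct net *net)
{
	struct fqdir *fqdir;
	int res;

	res = fqdir_init(&fqdir, &example_frags, net);
	if (res < 0)
		return res;

	/* the tunables now live in the fqdir, not in a netns_frags struct */
	fqdir->high_thresh = 4 * 1024 * 1024;
	fqdir->low_thresh  = 3 * 1024 * 1024;
	fqdir->timeout     = 30 * HZ;
	/* ... stash fqdir in the protocol's per-netns state ... */
	return 0;
}
/* teardown: fqdir_pre_exit() from a pre_exit hook, then fqdir_exit() from exit */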
+ */ + if (!fqdir->dead) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + refcount_dec(&fq->refcnt); + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); @@ -168,7 +222,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); - struct inet_frags *f = q->net->f; + struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); @@ -199,7 +253,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct netns_frags *nf; + struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; struct inet_frags *f; @@ -207,18 +261,18 @@ void inet_frag_destroy(struct inet_frag_queue *q) WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ - nf = q->net; - f = nf->f; + fqdir = q->fqdir; + f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); - sub_frag_mem_limit(nf, sum); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { @@ -228,9 +282,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (!q) return NULL; - q->net = nf; + q->fqdir = fqdir; f->constructor(q, arg); - add_frag_mem_limit(nf, f->qsize); + add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); @@ -239,21 +293,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, return q; } -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { - struct inet_frags *f = nf->f; + struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; - q = inet_frag_alloc(nf, f, arg); + q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } - mod_timer(&q->timer, jiffies + nf->timeout); + mod_timer(&q->timer, jiffies + fqdir->timeout); - *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; @@ -265,18 +319,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { struct inet_frag_queue *fq = NULL, *prev; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh) return NULL; rcu_read_lock(); - prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) - fq = inet_frag_create(nf, key, &prev); + fq = inet_frag_create(fqdir, key, &prev); if (prev && !IS_ERR(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) @@ -387,7 +441,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, delta += head->truesize; if (delta) - add_frag_mem_limit(q->net, delta); + add_frag_mem_limit(q->fqdir, delta); /* If 
the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part @@ -409,7 +463,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(q->net, clone->truesize); + add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { @@ -462,7 +516,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); *nextp = NULL; skb_mark_not_on_list(head); @@ -490,7 +544,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) if (head == q->fragments_tail) q->fragments_tail = NULL; - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); return head; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c4503073248b..97824864e40d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -316,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cf2b0a6a3337..4385eb9e781f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, - frags); - struct net *net = container_of(ipv4, struct net, ipv4); + struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->net->max_dist ? + qp->peer = q->fqdir->max_dist ? 
inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -142,9 +140,13 @@ static void ip_expire(struct timer_list *t) int err; qp = container_of(frag, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); + net = qp->q.fqdir->net; rcu_read_lock(); + + if (qp->q.fqdir->dead) + goto out_rcu_unlock; + spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -211,7 +213,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->ipv4.frags, &key); + q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; @@ -222,7 +224,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = qp->q.net->max_dist; + unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; @@ -236,12 +238,8 @@ static int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments_tail && (end - start) > max; - if (rc) { - struct net *net; - - net = container_of(qp->q.net, struct net, ipv4.frags); - __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - } + if (rc) + __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } @@ -250,13 +248,13 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); + sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -273,7 +271,7 @@ static int ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. 
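/* [Editorial sketch, not part of the diff: once fqdir_pre_exit() has marked
 * the fqdir dead, expire timers must not touch the rhashtable any more.  The
 * ip_expire() hunk above shows this; generically an expire callback now
 * starts like the following (error/cleanup paths abridged, name invented).]
 */
#include <net/inet_frag.h>

static void example_frag_expire(struct timer_list *t)
{
	struct inet_frag_queue *fq = from_timer(fq, t, timer);

	rcu_read_lock();
	if (fq->fqdir->dead)		/* namespace is being dismantled, bail out */
		goto out;

	spin_lock(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE))
		inet_frag_kill(fq);	/* normal expiry handling would go here */
	spin_unlock(&fq->lock);
out:
	rcu_read_unlock();
	inet_frag_put(fq);		/* drop the reference owned by the timer */
}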
*/ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; @@ -352,7 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); + add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; @@ -399,7 +397,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; @@ -544,30 +542,24 @@ static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", - .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", - .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", - .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", - .data = &init_net.ipv4.frags.max_dist, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -600,13 +592,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv4.frags.high_thresh; - table[0].extra1 = &net->ipv4.frags.low_thresh; - table[1].data = &net->ipv4.frags.low_thresh; - table[1].extra2 = &net->ipv4.frags.high_thresh; - table[2].data = &net->ipv4.frags.timeout; - table[3].data = &net->ipv4.frags.max_dist; } + table[0].data = &net->ipv4.fqdir->high_thresh; + table[0].extra1 = &net->ipv4.fqdir->low_thresh; + table[1].data = &net->ipv4.fqdir->low_thresh; + table[1].extra2 = &net->ipv4.fqdir->high_thresh; + table[2].data = &net->ipv4.fqdir->timeout; + table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) @@ -654,6 +646,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) { int res; + res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); + if (res < 0) + return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -668,36 +663,38 @@ static int __net_init ipv4_frags_init_net(struct net *net) * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 4 * 1024 * 1024; - net->ipv4.frags.low_thresh = 3 * 1024 * 1024; + net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; + net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. 
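/* [Editorial note, illustrative only: the pernet registration further below
 * gains a .pre_exit hook.  pre_exit handlers of all pernet subsystems run
 * before any exit handler, with an RCU grace period in between, which is what
 * allows fqdir_pre_exit() to mark the fqdir dead while its frag queues are
 * still reachable.  Reusing the invented names from the sketch above, the
 * shape is:
 *
 *	static struct pernet_operations example_frags_ops = {
 *		.init	  = example_frags_init_net,	// allocate the fqdir
 *		.pre_exit = example_frags_pre_exit_net,	// fqdir_pre_exit(): set ->dead
 *		.exit	  = example_frags_exit_net,	// fqdir_exit(): free the rhashtable
 *	};
 * ]
 */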
*/ - net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.fqdir->timeout = IP_FRAG_TIME; - net->ipv4.frags.max_dist = 64; - net->ipv4.frags.f = &ip4_frags; + net->ipv4.fqdir->max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res < 0) - return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); return res; } +static void __net_exit ipv4_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv4.fqdir); +} + static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { - .init = ipv4_frags_init_net, - .exit = ipv4_frags_exit_net, + .init = ipv4_frags_init_net, + .pre_exit = ipv4_frags_pre_exit_net, + .exit = ipv4_frags_exit_net, }; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..ddaa01ec2bce 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -473,6 +473,7 @@ error: *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8c2ec35b6512..cc7ef0d05bbd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -315,14 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; - int ret; + bool do_cn = false; + int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_CN: + do_cn = true; + /* fall through */ + case NET_XMIT_SUCCESS: + break; + default: kfree_skb(skb); return ret; } @@ -338,7 +354,8 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk, skb_dst_set(skb, &new_rt->dst); } - return dev_loopback_xmit(net, sk, skb); + err = dev_loopback_xmit(net, sk, skb); + return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -537,9 +554,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_hash(to, from); - /* Copy the flags to each fragment. 
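/* [Editorial sketch, not part of the diff: the ip_finish_output() and
 * ip_mc_finish_output() hunks above handle the case where the cgroup egress
 * BPF program returns NET_XMIT_CN ("transmit, but signal congestion") rather
 * than a plain accept or drop.  Written against a generic continuation, the
 * verdict handling looks like this:]
 */
#include <linux/bpf-cgroup.h>
#include <linux/netdevice.h>

static int example_bpf_egress_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb,
				     int (*cont)(struct net *, struct sock *,
						 struct sk_buff *))
{
	int ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);

	switch (ret) {
	case NET_XMIT_SUCCESS:		/* accepted: transmit normally */
		return cont(net, sk, skb);
	case NET_XMIT_CN:		/* accepted, but report congestion to the caller */
		return cont(net, sk, skb) ? : ret; /* GNU "?:": a real error wins, else CN */
	default:			/* rejected: drop */
		kfree_skb(skb);
		return ret;
	}
}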
*/ - IPCB(to)->flags = IPCB(from)->flags; - #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif @@ -573,6 +587,175 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return ip_do_fragment(net, sk, skb, output); } +void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, + unsigned int hlen, struct ip_fraglist_iter *iter) +{ + unsigned int first_len = skb_pagelen(skb); + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->iph = iph; + iter->hlen = hlen; + + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_init); + +static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, + struct ip_fraglist_iter *iter) +{ + struct sk_buff *to = iter->frag; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(skb)->flags; + + if (iter->offset == 0) + ip_options_fragment(to); +} + +void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) +{ + unsigned int hlen = iter->hlen; + struct iphdr *iph = iter->iph; + struct sk_buff *frag; + + frag = iter->frag; + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iter->iph = ip_hdr(frag); + iph = iter->iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + iter->offset += skb->len - hlen; + iph->frag_off = htons(iter->offset >> 3); + if (frag->next) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_prepare); + +void ip_frag_init(struct sk_buff *skb, unsigned int hlen, + unsigned int ll_rs, unsigned int mtu, + struct ip_frag_state *state) +{ + struct iphdr *iph = ip_hdr(skb); + + state->hlen = hlen; + state->ll_rs = ll_rs; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + state->not_last_frag = iph->frag_off & htons(IP_MF); +} +EXPORT_SYMBOL(ip_frag_init); + +static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, + bool first_frag, struct ip_frag_state *state) +{ + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + + if (IPCB(from)->flags & IPSKB_FRAG_PMTU) + state->iph->frag_off |= htons(IP_DF); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. 
+ */ + if (first_frag) + ip_options_fragment(from); +} + +struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) +{ + unsigned int len = state->left; + struct sk_buff *skb2; + struct iphdr *iph; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) { + len &= ~7; + } + + /* Allocate buffer */ + skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); + if (!skb2) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, state->ll_rs); + skb_put(skb2, len + state->hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + state->hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) + BUG(); + state->left -= len; + + /* + * Fill in the new header fields. + */ + iph = ip_hdr(skb2); + iph->frag_off = htons((state->offset >> 3)); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (state->left > 0 || state->not_last_frag) + iph->frag_off |= htons(IP_MF); + state->ptr += len; + state->offset += len; + + iph->tot_len = htons(len + state->hlen); + + ip_send_check(iph); + + return skb2; +} +EXPORT_SYMBOL(ip_frag_next); + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -584,12 +767,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; - int ptr; struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs; - int offset; - __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); + unsigned int mtu, hlen, ll_rs; + struct ip_fraglist_iter iter; + struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ @@ -654,49 +836,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } /* Everything is OK. Generate! */ - - err = 0; - offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - iph->tot_len = htons(first_len); - iph->frag_off = htons(IP_MF); - ip_send_check(iph); + ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. 
*/ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), iph, hlen); - iph = ip_hdr(frag); - iph->tot_len = htons(frag->len); - ip_copy_metadata(frag, skb); - if (offset == 0) - ip_options_fragment(frag); - offset += skb->len - hlen; - iph->frag_off = htons(offset>>3); - if (frag->next) - iph->frag_off |= htons(IP_MF); - /* Ready, complete checksum */ - ip_send_check(iph); + if (iter.frag) { + ip_fraglist_ipcb_prepare(skb, &iter); + ip_fraglist_prepare(skb, &iter); } err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip_fraglist_next(&iter); } if (err == 0) { @@ -704,7 +861,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; @@ -720,105 +877,29 @@ slow_path_clean: } slow_path: - iph = ip_hdr(skb); - - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; - not_last_frag = iph->frag_off & htons(IP_MF); + ip_frag_init(skb, hlen, ll_rs, mtu, &state); /* * Keep copying data until we run out. */ - while (left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } + while (state.left > 0) { + bool first_frag = (state.offset == 0); - /* Allocate buffer */ - skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); - if (!skb2) { - err = -ENOMEM; + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); goto fail; } - - /* - * Set up data on packet - */ - - ip_copy_metadata(skb2, skb); - skb_reserve(skb2, ll_rs); - skb_put(skb2, len + hlen); - skb_reset_network_header(skb2); - skb2->transport_header = skb2->network_header + hlen; - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - - skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); - - /* - * Copy a block of the IP datagram. - */ - if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) - BUG(); - left -= len; - - /* - * Fill in the new header fields. - */ - iph = ip_hdr(skb2); - iph->frag_off = htons((offset >> 3)); - - if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) - iph->frag_off |= htons(IP_DF); - - /* ANK: dirty, but effective trick. Upgrade options only if - * the segment to be fragmented was THE FIRST (otherwise, - * options are already fixed) and make it ONCE - * on the initial skb, so that all the following fragments - * will inherit fixed options. - */ - if (offset == 0) - ip_options_fragment(skb); - - /* - * Added AC : If we are fragmenting a fragment that's not the - * last fragment then keep MF on each bit - */ - if (left > 0 || not_last_frag) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; + ip_frag_ipcb(skb, skb2, first_frag, &state); /* * Put this fragment into the sending queue. 
*/ - iph->tot_len = htons(len + hlen); - - ip_send_check(iph); - err = output(net, sk, skb2); if (err) goto fail; @@ -1568,7 +1649,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len) + unsigned int len, u64 transmit_time) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1584,6 +1665,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipcm_init(&ipc); ipc.addr = daddr; + ipc.sockc.transmit_time = transmit_time; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2f4cdcc13d53..59bfa3825810 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -186,8 +186,7 @@ static void __exit ipcomp4_fini(void) { if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp_type, AF_INET); } module_init(ipcomp4_init); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 87ca2c42359b..a4e07e5e9c11 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par) unsigned char *arpptr; int pln, hln; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 5f116c3749b4..5930d3b02555 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -29,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; iph = ip_hdr(skb); oldtos = iph->tos; @@ -58,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr)) return true; - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) return false; tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 64d9563c0218..8e7f84ec783d 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -3,258 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct iphdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, - __be32 daddr) -{ - struct iphdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - iph->version = 4; - iph->ihl = sizeof(*iph) / 4; - iph->tos = 0; - iph->id = 0; - iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; - iph->protocol = IPPROTO_TCP; - iph->check = 0; - iph->saddr = saddr; - iph->daddr = daddr; - - return iph; -} - 
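/* Illustrative sketch only (not part of the patch): the netfilter hunks above
 * (arpt_mangle, ipt_ECN, nf_nat_snmp_basic) all invert their writability test
 * because skb_ensure_writable() returns 0 on success and a negative errno on
 * failure, whereas the removed skb_make_writable() returned true on success.
 * "example_tg" below is a hypothetical xt target used only to show the new
 * call pattern; skb_ensure_writable(), ip_hdr() and ip_send_check() are the
 * real kernel APIs.
 */
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
#include <net/inet_ecn.h>
#include <net/ip.h>

static unsigned int example_tg(struct sk_buff *skb,
			       const struct xt_action_param *par)
{
	struct iphdr *iph;

	/* 0 on success, negative errno otherwise - polarity is the
	 * opposite of the old skb_make_writable().
	 */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		return NF_DROP;

	iph = ip_hdr(skb);		/* re-read: data may have been reallocated */
	iph->tos &= ~INET_ECN_MASK;	/* arbitrary example mangling */
	ip_send_check(iph);		/* refresh the header checksum */
	return XT_CONTINUE;
}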
-static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct iphdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - skb_dst_set_noref(nskb, skb_dst(skb)); - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) - goto free_nskb; - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. 
- */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) @@ -306,135 +59,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(void *priv, - struct sk_buff *skb, - 
const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - unsigned int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb) || - ip_hdr(skb)->protocol != IPPROTO_TCP) - return NF_ACCEPT; - - thoff = ip_hdrlen(skb); - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv4_synproxy_ops[] = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -449,16 +73,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref4 == 0) { - err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err) { - 
nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv4_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref4++; return err; } @@ -466,10 +86,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref4--; - if (snet->hook_ref4 == 0) - nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); + nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6eefde5bc468..69697eb4bfc6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -2,7 +2,7 @@ /* * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index dfea10f13878..87b711fd5a44 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> */ #include <linux/module.h> @@ -58,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } - /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index 657d2dcec3cc..717b726504fe 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b6dd39636bea..b2bae0b0e42a 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { + const struct in_ifaddr *ifa; struct in_device *indev; __be32 laddr; @@ -57,10 +58,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { + + in_dev_for_each_ifa_rcu(ifa, indev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + laddr = ifa->ifa_local; break; - } endfor_ifa(indev); + } return laddr ? 
laddr : daddr; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c new file mode 100644 index 000000000000..5fe5a3981d43 --- /dev/null +++ b/net/ipv4/nexthop.c @@ -0,0 +1,1828 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Generic nexthop implementation + * + * Copyright (c) 2017-19 Cumulus Networks + * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> + */ + +#include <linux/nexthop.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <net/arp.h> +#include <net/ipv6_stubs.h> +#include <net/lwtunnel.h> +#include <net/ndisc.h> +#include <net/nexthop.h> +#include <net/route.h> +#include <net/sock.h> + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo); + +#define NH_DEV_HASHBITS 8 +#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) + +static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { + [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, + [NHA_ID] = { .type = NLA_U32 }, + [NHA_GROUP] = { .type = NLA_BINARY }, + [NHA_GROUP_TYPE] = { .type = NLA_U16 }, + [NHA_BLACKHOLE] = { .type = NLA_FLAG }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_GATEWAY] = { .type = NLA_BINARY }, + [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, + [NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_GROUPS] = { .type = NLA_FLAG }, + [NHA_MASTER] = { .type = NLA_U32 }, +}; + +static unsigned int nh_dev_hashfn(unsigned int val) +{ + unsigned int mask = NH_DEV_HASHSIZE - 1; + + return (val ^ + (val >> NH_DEV_HASHBITS) ^ + (val >> (NH_DEV_HASHBITS * 2))) & mask; +} + +static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) +{ + struct net_device *dev = nhi->fib_nhc.nhc_dev; + struct hlist_head *head; + unsigned int hash; + + WARN_ON(!dev); + + hash = nh_dev_hashfn(dev->ifindex); + head = &net->nexthop.devhash[hash]; + hlist_add_head(&nhi->dev_hash, head); +} + +static void nexthop_free_mpath(struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) + WARN_ON(nhg->nh_entries[i].nh); + + kfree(nhg); +} + +static void nexthop_free_single(struct nexthop *nh) +{ + struct nh_info *nhi; + + nhi = rcu_dereference_raw(nh->nh_info); + switch (nhi->family) { + case AF_INET: + fib_nh_release(nh->net, &nhi->fib_nh); + break; + case AF_INET6: + ipv6_stub->fib6_nh_release(&nhi->fib6_nh); + break; + } + kfree(nhi); +} + +void nexthop_free_rcu(struct rcu_head *head) +{ + struct nexthop *nh = container_of(head, struct nexthop, rcu); + + if (nh->is_group) + nexthop_free_mpath(nh); + else + nexthop_free_single(nh); + + kfree(nh); +} +EXPORT_SYMBOL_GPL(nexthop_free_rcu); + +static struct nexthop *nexthop_alloc(void) +{ + struct nexthop *nh; + + nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + if (nh) { + INIT_LIST_HEAD(&nh->fi_list); + INIT_LIST_HEAD(&nh->f6i_list); + INIT_LIST_HEAD(&nh->grp_list); + } + return nh; +} + +static struct nh_group *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nhg; + + nhg = kzalloc(sz, GFP_KERNEL); + if (nhg) + nhg->num_nh = num_nh; + + return nhg; +} + +static void nh_base_seq_inc(struct net *net) +{ + while (++net->nexthop.seq == 0) + ; +} + +/* no reference taken; rcu lock or rtnl must be held */ +struct nexthop *nexthop_find_by_id(struct net *net, u32 id) +{ + struct rb_node **pp, *parent = NULL, *next; + + pp = &net->nexthop.rb_root.rb_node; + while (1) { + struct nexthop *nh; + + next = rcu_dereference_raw(*pp); + if (!next) + break; + 
parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (id < nh->id) + pp = &next->rb_left; + else if (id > nh->id) + pp = &next->rb_right; + else + return nh; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nexthop_find_by_id); + +/* used for auto id allocation; called with rtnl held */ +static u32 nh_find_unused_id(struct net *net) +{ + u32 id_start = net->nexthop.last_id_allocated; + + while (1) { + net->nexthop.last_id_allocated++; + if (net->nexthop.last_id_allocated == id_start) + break; + + if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) + return net->nexthop.last_id_allocated; + } + return 0; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nexthop_grp *p; + size_t len = nhg->num_nh * sizeof(*p); + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nhg->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nhg->num_nh; ++i) { + p->id = nhg->nh_entries[i].nh->id; + p->weight = nhg->nh_entries[i].weight - 1; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, + int event, u32 portid, u32 seq, unsigned int nlflags) +{ + struct fib6_nh *fib6_nh; + struct fib_nh *fib_nh; + struct nlmsghdr *nlh; + struct nh_info *nhi; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = nh->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nhg)) + goto nla_put_failure; + goto out; + } + + nhi = rtnl_dereference(nh->nh_info); + nhm->nh_family = nhi->family; + if (nhi->reject_nh) { + if (nla_put_flag(skb, NHA_BLACKHOLE)) + goto nla_put_failure; + goto out; + } else { + const struct net_device *dev; + + dev = nhi->fib_nhc.nhc_dev; + if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) + goto nla_put_failure; + } + + nhm->nh_scope = nhi->fib_nhc.nhc_scope; + switch (nhi->family) { + case AF_INET: + fib_nh = &nhi->fib_nh; + if (fib_nh->fib_nh_gw_family && + nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + goto nla_put_failure; + break; + + case AF_INET6: + fib6_nh = &nhi->fib6_nh; + if (fib6_nh->fib_nh_gw_family && + nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) + goto nla_put_failure; + break; + } + + if (nhi->fib_nhc.nhc_lwtstate && + lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, + NHA_ENCAP, NHA_ENCAP_TYPE) < 0) + goto nla_put_failure; + +out: + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t nh_nlmsg_size_grp(struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + + return nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ +} + +static size_t nh_nlmsg_size_single(struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + size_t sz; + + /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE + * are mutually exclusive + */ + sz = nla_total_size(4); /* NHA_OIF */ + + switch (nhi->family) { + case AF_INET: + if (nhi->fib_nh.fib_nh_gw_family) + sz 
+= nla_total_size(4); /* NHA_GATEWAY */ + break; + + case AF_INET6: + /* NHA_GATEWAY */ + if (nhi->fib6_nh.fib_nh_gw_family) + sz += nla_total_size(sizeof(const struct in6_addr)); + break; + } + + if (nhi->fib_nhc.nhc_lwtstate) { + sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); + sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ + } + + return sz; +} + +static size_t nh_nlmsg_size(struct nexthop *nh) +{ + size_t sz = nla_total_size(4); /* NHA_ID */ + + if (nh->is_group) + sz += nh_nlmsg_size_grp(nh); + else + sz += nh_nlmsg_size_single(nh); + + return sz; +} + +static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) +{ + unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); + if (!skb) + goto errout; + + err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); + if (err < 0) { + /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); +} + +static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, + struct netlink_ext_ack *extack) +{ + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + /* nested multipath (group within a group) is not + * supported + */ + if (nhg->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh && npaths > 1) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group with more than 1 path"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + unsigned int i, j; + + if (len & (sizeof(struct nexthop_grp) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + if (nhg[i].resvd1 || nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + return -EINVAL; + } + if (nhg[i].weight > 254) { + NL_SET_ERR_MSG(extack, "Invalid value for weight"); + return -EINVAL; + } + for (j = i + 1; j < len; ++j) { + if (nhg[i].id == nhg[j].id) { + NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); + return -EINVAL; + } + } + } + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg[i].id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, len, extack)) + return -EINVAL; + } + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static bool ipv6_good_nh(const struct fib6_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state 
& NUD_VALID); +} + +static bool ipv4_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nexthop *rc = NULL; + struct nh_group *nhg; + int i; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + nhi = rcu_dereference(nhge->nh->nh_info); + switch (nhi->family) { + case AF_INET: + if (ipv4_good_nh(&nhi->fib_nh)) + return nhge->nh; + break; + case AF_INET6: + if (ipv6_good_nh(&nhi->fib6_nh)) + return nhge->nh; + break; + } + + if (!rc) + rc = nhge->nh; + } + + return rc; +} +EXPORT_SYMBOL_GPL(nexthop_select_path); + +int nexthop_for_each_fib6_nh(struct nexthop *nh, + int (*cb)(struct fib6_nh *nh, void *arg), + void *arg) +{ + struct nh_info *nhi; + int err; + + if (nh->is_group) { + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_rtnl(nh->nh_grp); + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + nhi = rcu_dereference_rtnl(nhge->nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + } else { + nhi = rcu_dereference_rtnl(nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); + +static int check_src_addr(const struct in6_addr *saddr, + struct netlink_ext_ack *extack) +{ + if (!ipv6_addr_any(saddr)) { + NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); + return -EINVAL; + } + return 0; +} + +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + /* fib6_src is unique to a fib6_info and limits the ability to cache + * routes in fib6_nh within a nexthop that is potentially shared + * across multiple fib entries. If the config wants to use source + * routing it can not use nexthop objects. mlxsw also does not allow + * fib6_src on routes. 
+ */ + if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) + return -EINVAL; + + if (nh->is_group) { + struct nh_group *nhg; + + nhg = rtnl_dereference(nh->nh_grp); + if (nhg->has_v4) + goto no_v4_nh; + } else { + nhi = rtnl_dereference(nh->nh_info); + if (nhi->family == AF_INET) + goto no_v4_nh; + } + + return 0; +no_v4_nh: + NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(fib6_check_nexthop); + +/* if existing nexthop has ipv6 routes linked to it, need + * to verify this new spec works with ipv6 + */ +static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib6_info *f6i; + + if (list_empty(&old->f6i_list)) + return 0; + + list_for_each_entry(f6i, &old->f6i_list, nh_list) { + if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) + return -EINVAL; + } + + return fib6_check_nexthop(new, NULL, extack); +} + +static int nexthop_check_scope(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); + return -EINVAL; + } + + if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); + return -EINVAL; + } + + return 0; +} + +/* Invoked by fib add code to verify nexthop by id is ok with + * config for prefix; parts of fib_check_nh not done when nexthop + * object is used. + */ +int fib_check_nexthop(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + int err = 0; + + if (nh->is_group) { + struct nh_group *nhg; + + if (scope == RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); + err = -EINVAL; + goto out; + } + + nhg = rtnl_dereference(nh->nh_grp); + /* all nexthops in a group have the same scope */ + err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + } else { + err = nexthop_check_scope(nh, scope, extack); + } +out: + return err; +} + +static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib_info *fi; + + list_for_each_entry(fi, &old->fi_list, nh_list) { + int err; + + err = fib_check_nexthop(new, fi->fib_scope, extack); + if (err) + return err; + } + return 0; +} + +static void nh_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, + struct nh_group *nhg, + struct nl_info *nlinfo) +{ + struct nexthop *nh = nhge->nh; + struct nh_grp_entry *nhges; + bool found = false; + int i; + + WARN_ON(!nh); + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; ++i) { + if (found) { + nhges[i-1].nh = nhges[i].nh; + nhges[i-1].weight = nhges[i].weight; + list_del(&nhges[i].nh_list); + list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); + } else if (nhg->nh_entries[i].nh == nh) { + found = true; + } + } + + if (WARN_ON(!found)) + return; + + nhg->num_nh--; + nhg->nh_entries[nhg->num_nh].nh = NULL; + + nh_group_rebalance(nhg); + + nexthop_put(nh); + + if 
(nlinfo) + nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nhg; + + list_del(&nhge->nh_list); + nhg = rtnl_dereference(nhge->nh_parent->nh_grp); + remove_nh_grp_entry(nhge, nhg, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhge->nh_parent, nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) +{ + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + int i, num_nh = nhg->num_nh; + + for (i = 0; i < num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (WARN_ON(!nhge->nh)) + continue; + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + nhg->num_nh--; + } +} + +/* not called for nexthop replace */ +static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i, *tmp; + bool do_flush = false; + struct fib_info *fi; + + list_for_each_entry(fi, &nh->fi_list, nh_list) { + fi->fib_flags |= RTNH_F_DEAD; + do_flush = true; + } + if (do_flush) + fib_flush(net); + + /* ip6_del_rt removes the entry from this list hence the _safe */ + list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + /* __ip6_del_rt does a release, so do a hold here */ + fib6_info_hold(f6i); + ipv6_stub->ip6_del_rt(net, f6i); + } +} + +static void __remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + __remove_nexthop_fib(net, nh); + + if (nh->is_group) { + remove_nexthop_group(nh, nlinfo); + } else { + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fib_nhc.nhc_dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, nlinfo); + } +} + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + /* remove from the tree */ + rb_erase(&nh->rb_node, &net->nexthop.rb_root); + + if (nlinfo) + nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); + + __remove_nexthop(net, nh, nlinfo); + nh_base_seq_inc(net); + + nexthop_put(nh); +} + +/* if any FIB entries reference this nexthop, any dst entries + * need to be regenerated + */ +static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) + rt_cache_flush(net); + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_update_sernum(net, f6i); +} + +static int replace_nexthop_grp(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_group *oldg, *newg; + int i; + + if (!new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); + return -EINVAL; + } + + oldg = rtnl_dereference(old->nh_grp); + newg = rtnl_dereference(new->nh_grp); + + /* update parents - used by nexthop code for cleanup */ + for (i = 0; i < newg->num_nh; i++) + newg->nh_entries[i].nh_parent = old; + + rcu_assign_pointer(old->nh_grp, newg); + + for (i = 0; i < oldg->num_nh; i++) + oldg->nh_entries[i].nh_parent = new; + + rcu_assign_pointer(new->nh_grp, oldg); + + return 0; +} + +static int replace_nexthop_single(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi, *newi; + + if (new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop 
group."); + return -EINVAL; + } + + oldi = rtnl_dereference(old->nh_info); + newi = rtnl_dereference(new->nh_info); + + newi->nh_parent = old; + oldi->nh_parent = new; + + old->protocol = new->protocol; + old->nh_flags = new->nh_flags; + + rcu_assign_pointer(old->nh_info, newi); + rcu_assign_pointer(new->nh_info, oldi); + + return 0; +} + +static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) { + struct fib_info *fi; + + /* expectation is a few fib_info per nexthop and then + * a lot of routes per fib_info. So mark the fib_info + * and then walk the fib tables once + */ + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = true; + + fib_info_notify_update(net, info); + + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = false; + } + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_rt_update(net, f6i, info); +} + +/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries + * linked to this nexthop and for all groups that the nexthop + * is a member of + */ +static void nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct nh_grp_entry *nhge; + + __nexthop_replace_notify(net, nh, info); + + list_for_each_entry(nhge, &nh->grp_list, nh_list) + __nexthop_replace_notify(net, nhge->nh_parent, info); +} + +static int replace_nexthop(struct net *net, struct nexthop *old, + struct nexthop *new, struct netlink_ext_ack *extack) +{ + bool new_is_reject = false; + struct nh_grp_entry *nhge; + int err; + + /* check that existing FIB entries are ok with the + * new nexthop definition + */ + err = fib_check_nh_list(old, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(old, new, extack); + if (err) + return err; + + if (!new->is_group) { + struct nh_info *nhi = rtnl_dereference(new->nh_info); + + new_is_reject = nhi->reject_nh; + } + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + /* if new nexthop is a blackhole, any groups using this + * nexthop cannot have more than 1 path + */ + if (new_is_reject && + nexthop_num_path(nhge->nh_parent) > 1) { + NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); + return -EINVAL; + } + + err = fib_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + } + + if (old->is_group) + err = replace_nexthop_grp(net, old, new, extack); + else + err = replace_nexthop_single(net, old, new, extack); + + if (!err) { + nh_rt_cache_flush(net, old); + + __remove_nexthop(net, new, NULL); + nexthop_put(new); + } + + return err; +} + +/* called with rtnl_lock held */ +static int insert_nexthop(struct net *net, struct nexthop *new_nh, + struct nh_config *cfg, struct netlink_ext_ack *extack) +{ + struct rb_node **pp, *parent = NULL, *next; + struct rb_root *root = &net->nexthop.rb_root; + bool replace = !!(cfg->nlflags & NLM_F_REPLACE); + bool create = !!(cfg->nlflags & NLM_F_CREATE); + u32 new_id = new_nh->id; + int replace_notify = 0; + int rc = -EEXIST; + + pp = &root->rb_node; + while (1) { + struct nexthop *nh; + + next = rtnl_dereference(*pp); + if (!next) + break; + + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (new_id < nh->id) { + pp = &next->rb_left; + } else if (new_id > nh->id) { + pp = &next->rb_right; + } else if (replace) { + rc = replace_nexthop(net, nh, new_nh, extack); + 
if (!rc) { + new_nh = nh; /* send notification with old nh */ + replace_notify = 1; + } + goto out; + } else { + /* id already exists and not a replace */ + goto out; + } + } + + if (replace && !create) { + NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); + rc = -ENOENT; + goto out; + } + + rb_link_node_rcu(&new_nh->rb_node, parent, pp); + rb_insert_color(&new_nh->rb_node, root); + rc = 0; +out: + if (!rc) { + nh_base_seq_inc(net); + nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); + if (replace_notify) + nexthop_replace_notify(net, new_nh, &cfg->nlinfo); + } + + return rc; +} + +/* rtnl */ +/* remove all nexthops tied to a device being deleted */ +static void nexthop_flush_dev(struct net_device *dev) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev != dev) + continue; + + remove_nexthop(net, nhi->nh_parent, NULL); + } +} + +/* rtnl; called when net namespace is deleted */ +static void flush_all_nexthops(struct net *net) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + struct nexthop *nh; + + while ((node = rb_first(root))) { + nh = rb_entry(node, struct nexthop, rb_node); + remove_nexthop(net, nh, NULL); + cond_resched(); + } +} + +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nhg; + struct nexthop *nh; + int i; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh->is_group = 1; + + nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nhg) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nhg->num_nh; ++i) { + struct nexthop *nhe; + struct nh_info *nhi; + + nhe = nexthop_find_by_id(net, entry[i].id); + if (!nexthop_get(nhe)) + goto out_no_nh; + + nhi = rtnl_dereference(nhe->nh_info); + if (nhi->family == AF_INET) + nhg->has_v4 = true; + + nhg->nh_entries[i].nh = nhe; + nhg->nh_entries[i].weight = entry[i].weight + 1; + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); + nhg->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->mpath = 1; + nh_group_rebalance(nhg); + } + + rcu_assign_pointer(nh->nh_grp, nhg); + + return nh; + +out_no_nh: + for (; i >= 0; --i) + nexthop_put(nhg->nh_entries[i].nh); + + kfree(nhg); + kfree(nh); + + return ERR_PTR(-ENOENT); +} + +static int nh_create_ipv4(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib_nh *fib_nh = &nhi->fib_nh; + struct fib_config fib_cfg = { + .fc_oif = cfg->nh_ifindex, + .fc_gw4 = cfg->gw.ipv4, + .fc_gw_family = cfg->gw.ipv4 ? 
AF_INET : 0, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + u32 tb_id = l3mdev_fib_table(cfg->dev); + int err = -EINVAL; + + err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); + if (err) { + fib_nh_release(net, fib_nh); + goto out; + } + + /* sets nh_dev if successful */ + err = fib_check_nh(net, fib_nh, tb_id, 0, extack); + if (!err) { + nh->nh_flags = fib_nh->fib_nh_flags; + fib_info_update_nhc_saddr(net, &fib_nh->nh_common, + fib_nh->fib_nh_scope); + } else { + fib_nh_release(net, fib_nh); + } +out: + return err; +} + +static int nh_create_ipv6(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib6_nh *fib6_nh = &nhi->fib6_nh; + struct fib6_config fib6_cfg = { + .fc_table = l3mdev_fib_table(cfg->dev), + .fc_ifindex = cfg->nh_ifindex, + .fc_gateway = cfg->gw.ipv6, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + int err; + + if (!ipv6_addr_any(&cfg->gw.ipv6)) + fib6_cfg.fc_flags |= RTF_GATEWAY; + + /* sets nh_dev if successful */ + err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, + extack); + if (err) + ipv6_stub->fib6_nh_release(fib6_nh); + else + nh->nh_flags = fib6_nh->fib_nh_flags; + + return err; +} + +static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + struct nexthop *nh; + int err = 0; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); + if (!nhi) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + nh->nh_flags = cfg->nh_flags; + nh->net = net; + + nhi->nh_parent = nh; + nhi->family = cfg->nh_family; + nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + + if (cfg->nh_blackhole) { + nhi->reject_nh = 1; + cfg->nh_ifindex = net->loopback_dev->ifindex; + } + + switch (cfg->nh_family) { + case AF_INET: + err = nh_create_ipv4(net, nh, nhi, cfg, extack); + break; + case AF_INET6: + err = nh_create_ipv6(net, nh, nhi, cfg, extack); + break; + } + + if (err) { + kfree(nhi); + kfree(nh); + return ERR_PTR(err); + } + + /* add the entry to the device based hash */ + nexthop_devhash_add(net, nhi); + + rcu_assign_pointer(nh->nh_info, nhi); + + return nh; +} + +/* called with rtnl lock held */ +static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nexthop *nh; + int err; + + if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + return ERR_PTR(-EINVAL); + } + + if (!cfg->nh_id) { + cfg->nh_id = nh_find_unused_id(net); + if (!cfg->nh_id) { + NL_SET_ERR_MSG(extack, "No unused id"); + return ERR_PTR(-EINVAL); + } + } + + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + + if (IS_ERR(nh)) + return nh; + + refcount_set(&nh->refcnt, 1); + nh->id = cfg->nh_id; + nh->protocol = cfg->nh_protocol; + nh->net = net; + + err = insert_nexthop(net, nh, cfg, extack); + if (err) { + __remove_nexthop(net, nh, NULL); + nexthop_put(nh); + nh = ERR_PTR(err); + } + + return nh; +} + +static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 
0) + return err; + + err = -EINVAL; + if (nhm->resvd || nhm->nh_scope) { + NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); + goto out; + } + if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); + goto out; + } + + switch (nhm->nh_family) { + case AF_INET: + case AF_INET6: + break; + case AF_UNSPEC: + if (tb[NHA_GROUP]) + break; + /* fallthrough */ + default: + NL_SET_ERR_MSG(extack, "Invalid address family"); + goto out; + } + + if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { + NL_SET_ERR_MSG(extack, "Invalid attributes in request"); + goto out; + } + + memset(cfg, 0, sizeof(*cfg)); + cfg->nlflags = nlh->nlmsg_flags; + cfg->nlinfo.portid = NETLINK_CB(skb).portid; + cfg->nlinfo.nlh = nlh; + cfg->nlinfo.nl_net = net; + + cfg->nh_family = nhm->nh_family; + cfg->nh_protocol = nhm->nh_protocol; + cfg->nh_flags = nhm->nh_flags; + + if (tb[NHA_ID]) + cfg->nh_id = nla_get_u32(tb[NHA_ID]); + + if (tb[NHA_GROUP]) { + if (nhm->nh_family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid family for group"); + goto out; + } + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + err = nh_check_attr_group(net, tb, extack); + + /* no other attributes should be set */ + goto out; + } + + if (tb[NHA_BLACKHOLE]) { + if (tb[NHA_GATEWAY] || tb[NHA_OIF] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + goto out; + } + + cfg->nh_blackhole = 1; + err = 0; + goto out; + } + + if (!tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + goto out; + } + + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } + + err = -EINVAL; + if (tb[NHA_GATEWAY]) { + struct nlattr *gwa = tb[NHA_GATEWAY]; + + switch (cfg->nh_family) { + case AF_INET: + if (nla_len(gwa) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv4 = nla_get_be32(gwa); + break; + case AF_INET6: + if (nla_len(gwa) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv6 = nla_get_in6_addr(gwa); + break; + default: + NL_SET_ERR_MSG(extack, + "Unknown address family for gateway"); + goto out; + } + } else { + /* device only nexthop (no gateway) */ + if (cfg->nh_flags & RTNH_F_ONLINK) { + NL_SET_ERR_MSG(extack, + "ONLINK flag can not be set for nexthop without a gateway"); + goto out; + } + } + + if (tb[NHA_ENCAP]) { + cfg->nh_encap = tb[NHA_ENCAP]; + + if (!tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); + goto out; + } + + cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + if (err < 0) + goto out; + + } else if (tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); + goto out; + } + + + err = 0; +out: + return 
err; +} + +/* rtnl */ +static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nh_config cfg; + struct nexthop *nh; + int err; + + err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); + if (!err) { + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + } + + return err; +} + +static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err, i; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + for (i = 0; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_ID: + break; + default: + NL_SET_ERR_MSG_ATTR(extack, tb[i], + "Unexpected attribute in request"); + goto out; + } + } + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header"); + goto out; + } + + if (!tb[NHA_ID]) { + NL_SET_ERR_MSG(extack, "Nexthop id is missing"); + goto out; + } + + *id = nla_get_u32(tb[NHA_ID]); + if (!(*id)) + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + else + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nl_info nlinfo = { + .nlh = nlh, + .nl_net = net, + .portid = NETLINK_CB(skb).portid, + }; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return -ENOENT; + + remove_nexthop(net, nh, &nlinfo); + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *skb = NULL; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + err = -ENOBUFS; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + err = -ENOENT; + nh = nexthop_find_by_id(net, id); + if (!nh) + goto errout_free; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + return err; +errout_free: + kfree_skb(skb); + goto out; +} + +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, + bool group_filter, u8 family) +{ + const struct net_device *dev; + const struct nh_info *nhi; + + if (group_filter && !nh->is_group) + return true; + + if (!dev_idx && !master_idx && !family) + return false; + + if (nh->is_group) + return true; + + nhi = rtnl_dereference(nh->nh_info); + if (family && nhi->family != family) + return true; + + dev = nhi->fib_nhc.nhc_dev; + if (dev_idx && (!dev || dev->ifindex != dev_idx)) + return true; + + if (master_idx) { + struct net_device *master; + + if (!dev) + return true; + + master = netdev_master_upper_dev_get((struct net_device *)dev); + if (!master || master->ifindex != master_idx) + return true; + } + + return false; +} + +static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, + int *master_idx, bool *group_filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr 
*tb[NHA_MAX + 1]; + struct nhmsg *nhm; + int err, i; + u32 idx; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + NULL); + if (err < 0) + return err; + + for (i = 0; i <= NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_OIF: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + *dev_idx = idx; + break; + case NHA_MASTER: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); + return -EINVAL; + } + *master_idx = idx; + break; + case NHA_GROUPS: + *group_filter = true; + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + nhm = nlmsg_data(nlh); + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); + return -EINVAL; + } + + return 0; +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + int dev_filter_idx = 0, master_idx = 0; + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + bool group_filter = false; + struct rb_node *node; + int idx = 0, s_idx; + int err; + + err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, + &group_filter, cb); + if (err < 0) + return err; + + s_idx = cb->args[0]; + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + if (idx < s_idx) + goto cont; + + nh = rb_entry(node, struct nexthop, rb_node); + if (nh_dump_filtered(nh, dev_filter_idx, master_idx, + group_filter, nhm->nh_family)) + goto cont; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } +cont: + idx++; + } + +out: + err = skb->len; +out_err: + cb->args[0] = idx; + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return err; +} + +static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev == dev) { + if (nhi->family == AF_INET) + fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, + orig_mtu); + } + } +} + +/* rtnl */ +static int nh_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_info_ext *info_ext; + + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGE: + if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGEMTU: + info_ext = ptr; + nexthop_sync_mtu(dev, info_ext->ext.mtu); + rt_cache_flush(dev_net(dev)); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nh_netdev_notifier = { + .notifier_call = nh_netdev_event, +}; + +static void __net_exit nexthop_net_exit(struct net *net) +{ + rtnl_lock(); + flush_all_nexthops(net); + rtnl_unlock(); + kfree(net->nexthop.devhash); +} + +static int __net_init nexthop_net_init(struct net *net) +{ + size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; + + net->nexthop.rb_root = RB_ROOT; + 
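/*
 * Illustrative sketch (not part of this patch): a dump request of the shape
 * nh_valid_dump_req() above accepts - RTM_GETNEXTHOP with NLM_F_DUMP, all
 * nhmsg fields zero apart from an optional nh_family filter, and here the
 * zero-length NHA_GROUPS flag so only nexthop groups are returned.  Reading
 * the multipart reply is omitted.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/nexthop.h>

static int request_nexthop_group_dump(void)
{
	struct {
		struct nlmsghdr nlh;
		struct nhmsg nhm;
		struct rtattr groups;	/* flag attribute, no payload */
	} req;
	int fd, ret;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_type = RTM_GETNEXTHOP;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.nhm.nh_family = AF_UNSPEC;		/* no family filter */
	req.groups.rta_type = NHA_GROUPS;
	req.groups.rta_len = RTA_LENGTH(0);
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct nhmsg)) +
			    RTA_ALIGN(req.groups.rta_len);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	ret = send(fd, &req, req.nlh.nlmsg_len, 0);
	close(fd);
	return ret < 0 ? -1 : 0;
}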
net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); + if (!net->nexthop.devhash) + return -ENOMEM; + + return 0; +} + +static struct pernet_operations nexthop_net_ops = { + .init = nexthop_net_init, + .exit = nexthop_net_exit, +}; + +static int __init nexthop_init(void) +{ + register_pernet_subsys(&nexthop_net_ops); + + register_netdevice_notifier(&nh_netdev_notifier); + + rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, + rtm_dump_nexthop, 0); + + rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + return 0; +} +subsys_initcall(nexthop_init); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 073273b751f8..cc90243ccf76 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -68,8 +68,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", - atomic_read(&net->ipv4.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv4.frags)); + atomic_read(&net->ipv4.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv4.fqdir)); return 0; } @@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), + SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 899e34ceb560..e35736b99300 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -24,9 +24,6 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r) return &raw_v6_hashinfo; #endif } else { - pr_warn_once("Unexpected inet family %d\n", - r->sdiag_family); - WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ea0735a6754..517300d587a7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -95,6 +95,7 @@ #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> @@ -447,7 +448,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, n = ip_neigh_gw4(dev, pkey); } - if (n && !refcount_inc_not_zero(&n->refcnt)) + if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock_bh(); @@ -1531,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1542,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev) list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } @@ -1580,7 +1580,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID - { + if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); @@ -1962,6 +1962,36 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, 
hash_keys.basic.ip_proto = fl4->flowi4_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + /* skb is currently provided only when forwarding */ + if (skb) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, 0); + /* Inner can be v4 or v6 */ + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + hash_keys.tags.flow_label = keys.tags.flow_label; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + ip_multipath_l3_keys(skb, &hash_keys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1979,7 +2009,7 @@ static int ip_mkroute_input(struct sk_buff *skb, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); @@ -2714,7 +2744,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = fl4->flowi4_tos; + r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; r->rtm_table = table_id < 256 ? 
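/*
 * Illustrative sketch (not part of this patch): enabling the new multipath
 * hash policy handled by the "case 2:" branch above, which hashes on the
 * inner IPv4/IPv6 addresses dissected from the forwarded skb and falls back
 * to the outer addresses otherwise.  Equivalent to
 * "sysctl -w net.ipv4.fib_multipath_hash_policy=2".
 */
#include <fcntl.h>
#include <unistd.h>

static int set_multipath_hash_policy_inner(void)
{
	int fd = open("/proc/sys/net/ipv4/fib_multipath_hash_policy", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "2", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}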
table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -2742,7 +2772,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (!rt_is_input_route(rt) && + if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; @@ -2782,36 +2812,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (fl4->flowi4_mark && - nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) - goto nla_put_failure; - - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; + if (fl4) { + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - error = rt->dst.error; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), + fl4->flowi4_uid))) + goto nla_put_failure; - if (rt_is_input_route(rt)) { + if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, - fl4->saddr, fl4->daddr, - r, portid); - - if (err <= 0) { - if (err == 0) - return 0; - goto nla_put_failure; - } - } else + if (ipv4_is_multicast(dst) && + !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, portid); + + if (err <= 0) { + if (err == 0) + return 0; + goto nla_put_failure; + } + } else #endif - if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) - goto nla_put_failure; + if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) + goto nla_put_failure; + } } + error = rt->dst.error; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; @@ -2823,6 +2857,80 @@ nla_put_failure: return -EMSGSIZE; } +static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, u32 table_id, + struct fnhe_hash_bucket *bucket, int genid, + int *fa_index, int fa_start) +{ + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference(bucket[i].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + struct rtable *rt; + int err; + + if (*fa_index < fa_start) + goto next; + + if (fnhe->fnhe_genid != genid) + goto next; + + if (fnhe->fnhe_expires && + time_after(jiffies, fnhe->fnhe_expires)) + goto next; + + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (!rt) + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (!rt) + goto next; + + err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, + table_id, NULL, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err) + return err; +next: + (*fa_index)++; + } + } + + return 0; +} + +int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, + u32 table_id, struct fib_info *fi, + int *fa_index, int fa_start) +{ + struct net *net = sock_net(cb->skb->sk); + int nhsel, genid = fnhe_genid(net); + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); + struct fnhe_hash_bucket *bucket; + int err; + + if (nhc->nhc_flags & RTNH_F_DEAD) + continue; + + rcu_read_lock(); + bucket = rcu_dereference(nhc->nhc_exceptions); + err = 0; + if (bucket) + err = 
fnhe_dump_bucket(net, skb, cb, table_id, bucket, + genid, fa_index, fa_start); + rcu_read_unlock(); + if (err) + return err; + } + + return 0; +} + static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) @@ -3230,9 +3338,11 @@ static struct ctl_table ipv4_route_table[] = { { } }; +static const char ipv4_route_flush_procname[] = "flush"; + static struct ctl_table ipv4_route_flush_table[] = { { - .procname = "flush", + .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, @@ -3250,9 +3360,11 @@ static __net_init int sysctl_route_net_init(struct net *net) if (!tbl) goto err_dup; - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - tbl[0].procname = NULL; + /* Don't export non-whitelisted sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) { + if (tbl[0].procname != ipv4_route_flush_procname) + tbl[0].procname = NULL; + } } tbl[0].extra1 = net; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b6f14af926fa..7d66306b5f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -279,55 +279,96 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } +static int sscanf_key(char *buf, __le32 *key) +{ + u32 user_key[4]; + int i, ret = 0; + + if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, + user_key + 2, user_key + 3) != 4) { + ret = -EINVAL; + } else { + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + } + pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", + user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); + + return ret; +} + static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); - struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; - struct tcp_fastopen_context *ctxt; - u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ - __le32 key[4]; - int ret, i; + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. 
+ */ + struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)) }; + struct tcp_fastopen_context *ctx; + u32 user_key[TCP_FASTOPEN_KEY_MAX * 4]; + __le32 key[TCP_FASTOPEN_KEY_MAX * 4]; + char *backup_data; + int ret, i = 0, off = 0, n_keys = 0; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); - if (ctxt) - memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); - else - memset(key, 0, sizeof(key)); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctx) { + n_keys = tcp_fastopen_context_len(ctx); + memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys); + } rcu_read_unlock(); - for (i = 0; i < ARRAY_SIZE(key); i++) + if (!n_keys) { + memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); + n_keys = 1; + } + + for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); - snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", - user_key[0], user_key[1], user_key[2], user_key[3]); + for (i = 0; i < n_keys; i++) { + off += snprintf(tbl.data + off, tbl.maxlen - off, + "%08x-%08x-%08x-%08x", + user_key[i * 4], + user_key[i * 4 + 1], + user_key[i * 4 + 2], + user_key[i * 4 + 3]); + if (i + 1 < n_keys) + off += snprintf(tbl.data + off, tbl.maxlen - off, ","); + } + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { - if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, - user_key + 2, user_key + 3) != 4) { + backup_data = strchr(tbl.data, ','); + if (backup_data) { + *backup_data = '\0'; + backup_data++; + } + if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } - - for (i = 0; i < ARRAY_SIZE(user_key); i++) - key[i] = cpu_to_le32(user_key[i]); - + if (backup_data) { + if (sscanf_key(backup_data, key + 4)) { + ret = -EINVAL; + goto bad_key; + } + } tcp_fastopen_reset_cipher(net, NULL, key, - TCP_FASTOPEN_KEY_LENGTH); + backup_data ? key + 4 : NULL); } bad_key: - pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], - (char *)tbl.data, ret); kfree(tbl.data); return ret; } @@ -956,7 +997,12 @@ static struct ctl_table ipv4_net_table[] = { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. 
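/*
 * Illustrative sketch (not part of this patch): rotating the server TFO keys
 * through the sysctl handler above, which now accepts "<primary>,<backup>".
 * Writing the previous primary as the backup keeps cookies minted under it
 * valid while new cookies are generated with the new primary.  The key values
 * below are examples only.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int rotate_tfo_keys(void)
{
	const char *keys =
		"11111111-22222222-33333333-44444444,"	/* new primary key */
		"aaaaaaaa-bbbbbbbb-cccccccc-dddddddd";	/* old key kept as backup */
	int fd = open("/proc/sys/net/ipv4/tcp_fastopen_key", O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, keys, strlen(keys)) == (ssize_t)strlen(keys))
		ret = 0;
	close(fd);
	return ret;
}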
+ */ + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { @@ -984,7 +1030,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, - .extra2 = &one, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dc9ab84bb69..7846afacdf0b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2614,6 +2614,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->bytes_sent = 0; + tp->bytes_acked = 0; + tp->bytes_received = 0; tp->bytes_retrans = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; @@ -2741,6 +2743,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -2791,15 +2808,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; + __u8 *backup_key = NULL; - if (optlen != sizeof(key)) + /* Allow a backup key as well to facilitate key rotation + * First key is the active one. + */ + if (optlen != TCP_FASTOPEN_KEY_LENGTH && + optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_user(key, optval, optlen)) return -EFAULT; - return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) + backup_key = key + TCP_FASTOPEN_KEY_LENGTH; + + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ @@ -3083,6 +3108,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3453,21 +3483,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; struct tcp_fastopen_context *ctx; + unsigned int key_len = 0; if (get_user(len, optlen)) return -EFAULT; rcu_read_lock(); ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); - if (ctx) - memcpy(key, ctx->key, sizeof(key)); - else - len = 0; + if (ctx) { + key_len = tcp_fastopen_context_len(ctx) * + TCP_FASTOPEN_KEY_LENGTH; + memcpy(&key[0], &ctx->key[0], key_len); + } rcu_read_unlock(); - len = min_t(unsigned int, len, sizeof(key)); + len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, key, len)) @@ -3540,6 +3572,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f5a45e1e1182..3fd451271a70 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -30,15 +30,15 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. 
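/*
 * Illustrative sketch (not part of this patch): per-socket use of the two
 * setsockopt paths added above - a 32-byte TCP_FASTOPEN_KEY buffer whose
 * first 16 bytes are the primary key and last 16 the backup key, plus
 * TCP_TX_DELAY in microseconds.  The fallback option numbers mirror the uapi
 * of this series and, along with the key/delay values, are assumptions for
 * the example.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_FASTOPEN_KEY
#define TCP_FASTOPEN_KEY 33
#endif
#ifndef TCP_TX_DELAY
#define TCP_TX_DELAY 37
#endif

static int setup_listener_opts(int fd)
{
	unsigned char keys[32];	/* [0..15] primary, [16..31] backup */
	int delay_us = 10000;	/* add ~10ms of tx delay for testing */

	memset(keys, 0x41, 16);		/* example primary key */
	memset(keys + 16, 0x42, 16);	/* example backup key */

	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_KEY, keys, sizeof(keys)) < 0)
		return -1;
	return setsockopt(fd, IPPROTO_TCP, TCP_TX_DELAY,
			  &delay_us, sizeof(delay_us));
}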
*/ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); - crypto_free_cipher(ctx->tfm); - kfree(ctx); + + kzfree(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) @@ -67,31 +67,27 @@ void tcp_fastopen_ctx_destroy(struct net *net) } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, - void *key, unsigned int len) + void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; - int err; + int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->tfm = crypto_alloc_cipher("aes", 0, 0); - - if (IS_ERR(ctx->tfm)) { - err = PTR_ERR(ctx->tfm); -error: kfree(ctx); - pr_err("TCP: TFO aes cipher alloc error: %d\n", err); - return err; + if (!ctx) { + err = -ENOMEM; + goto out; } - err = crypto_cipher_setkey(ctx->tfm, key, len); - if (err) { - pr_err("TCP: TFO cipher key error: %d\n", err); - crypto_free_cipher(ctx->tfm); - goto error; - } - memcpy(ctx->key, key, len); + ctx->key[0].key[0] = get_unaligned_le64(primary_key); + ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); + if (backup_key) { + ctx->key[1].key[0] = get_unaligned_le64(backup_key); + ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); + ctx->num = 2; + } else { + ctx->num = 1; + } spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { @@ -108,66 +104,58 @@ error: kfree(ctx); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); +out: return err; } -static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, + struct sk_buff *syn, + const siphash_key_t *key, + struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_context *ctx; - bool ok = false; - - rcu_read_lock(); - - ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); - if (!ctx) - ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); - if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); - foc->len = TCP_FASTOPEN_COOKIE_SIZE; - ok = true; - } - rcu_read_unlock(); - return ok; -} - -/* Generate the fastopen cookie by doing aes128 encryption on both - * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 - * addresses. For the longer IPv6 addresses use CBC-MAC. - * - * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
- */ -static bool tcp_fastopen_cookie_gen(struct sock *sk, - struct request_sock *req, - struct sk_buff *syn, - struct tcp_fastopen_cookie *foc) -{ if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); - __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(sk, path, foc); + foc->val[0] = cpu_to_le64(siphash(&iph->saddr, + sizeof(iph->saddr) + + sizeof(iph->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } - #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); - struct tcp_fastopen_cookie tmp; - - if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { - struct in6_addr *buf = &tmp.addr; - int i; - for (i = 0; i < 4; i++) - buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(sk, buf, foc); - } + foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, + sizeof(ip6h->saddr) + + sizeof(ip6h->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } #endif return false; } +/* Generate the fastopen cookie by applying SipHash to both the source and + * destination addresses. + */ +static void tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + struct tcp_fastopen_context *ctx; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (ctx) + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); + rcu_read_unlock(); +} /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. @@ -212,6 +200,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_fin(sk); } +/* returns 0 - no key match, 1 for primary, 2 for backup */ +static int tcp_fastopen_cookie_gen_check(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *orig, + struct tcp_fastopen_cookie *valid_foc) +{ + struct tcp_fastopen_cookie search_foc = { .len = -1 }; + struct tcp_fastopen_cookie *foc = valid_foc; + struct tcp_fastopen_context *ctx; + int i, ret = 0; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (!ctx) + goto out; + for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); + if (tcp_fastopen_cookie_match(foc, orig)) { + ret = i + 1; + goto out; + } + foc = &search_foc; + } +out: + rcu_read_unlock(); + return ret; +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -327,6 +344,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; + int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -342,31 +360,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; - if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && - foc->len == TCP_FASTOPEN_COOKIE_SIZE && - foc->len == valid_foc.len && - !memcmp(foc->val, valid_foc.val, foc->len)) { - /* Cookie is valid. Create a (full) child socket to accept - * the data in SYN before returning a SYN-ACK to ack the - * data. If we fail to create the socket, fall back and - * ack the ISN only but includes the same cookie. 
- * - * Note: Data-less SYN with valid cookie is allowed to send - * data in SYN_RECV state. - */ + if (foc->len == 0) { + /* Client requests a cookie. */ + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); + } else if (foc->len > 0) { + ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, + &valid_foc); + if (!ret) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + } else { + /* Cookie is valid. Create a (full) child socket to + * accept the data in SYN before returning a SYN-ACK to + * ack the data. If we fail to create the socket, fall + * back and ack the ISN only but includes the same + * cookie. + * + * Note: Data-less SYN with valid cookie is allowed to + * send data in SYN_RECV state. + */ fastopen: - child = tcp_fastopen_create_child(sk, skb, req); - if (child) { - foc->len = -1; + child = tcp_fastopen_create_child(sk, skb, req); + if (child) { + if (ret == 2) { + valid_foc.exp = foc->exp; + *foc = valid_foc; + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); + } else { + foc->len = -1; + } + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return child; + } NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVE); - return child; + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - + } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d95ee40df6c2..c21e8a22fb3b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { icsk->icsk_clean_acked = cad; - static_branch_inc(&clean_acked_data_enabled.key); + static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); @@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); + + tcp_bpf_rtt(sk); } } else { /* no previous measure. */ @@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; + + tcp_bpf_rtt(sk); } tp->srtt_us = max(1U, srtt); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cfa81190a1b1..d57641cb3477 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - struct net *net; + u64 transmit_time = 0; struct sock *ctl_sk; + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -766,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -808,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; + u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -858,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c35731816e2..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); + tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0ebc33d1c9e5..4af1f5dae9d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -2239,6 +2241,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if (static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. 
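/*
 * Worked example (not part of this patch) of the tcp_small_queue_check()
 * budget math above: extra_bytes = pacing_rate * tx_delay_usec, doubled for
 * the ~100% truesize overhead and divided by USEC_PER_SEC, with 2^20 standing
 * in for 10^6 so the division becomes ">> (20 - 1)".  Values are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long pacing_rate = 12500000;	/* 100 Mbit/s in bytes/sec */
	unsigned long long tx_delay_us = 10000;		/* 10 ms */
	unsigned long long exact = 2 * pacing_rate * tx_delay_us / 1000000;
	unsigned long long approx = (pacing_rate * tx_delay_us) >> (20 - 1);

	/* approx lands ~5% below exact because 2^20 is larger than 10^6 */
	printf("exact=%llu approx=%llu\n", exact, approx);
	return 0;
}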
* No need to wait for TX completion to call us back, @@ -3217,6 +3231,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3248,13 +3263,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3297,8 +3313,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eed59c847722..c21862ba9c02 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) -/* IPCB reference means this can not be used from early demux */ -static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -364,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -420,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, + int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -432,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, @@ -460,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - bool exact_dif = udp_lib_exact_dif_match(net, skb); hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -468,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); if (!result) { hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -476,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); } - if 
(unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -2236,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - inet_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 9763464a75d7..a3908e55ed89 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -208,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, gso_skb->destructor = NULL; segs = skb_segment(gso_skb, features); - if (unlikely(IS_ERR_OR_NULL(segs))) { + if (IS_ERR_OR_NULL(segs)) { if (copy_dtor) gso_skb->destructor = sock_wfree; return segs; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 80c40b4981bb..f8ed3c3bb928 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -15,46 +15,6 @@ #include <linux/netfilter_ipv4.h> #include <linux/export.h> -static int xfrm4_init_flags(struct xfrm_state *x) -{ - if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) - x->props.flags |= XFRM_STATE_NOPMTUDISC; - return 0; -} - -static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi4 *fl4 = &fl->u.ip4; - - sel->daddr.a4 = fl4->daddr; - sel->saddr.a4 = fl4->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl4->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl4->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET; - sel->prefixlen_d = 32; - sel->prefixlen_s = 32; - sel->proto = fl4->flowi4_proto; - sel->ifindex = fl4->flowi4_oif; -} - -static void -xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (x->id.daddr.a4 == 0) - x->id.daddr.a4 = daddr->a4; - x->props.saddr = tmpl->saddr; - if (x->props.saddr.a4 == 0) - x->props.saddr.a4 = saddr->a4; - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET; -} - int xfrm4_extract_header(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); @@ -74,11 +34,6 @@ int xfrm4_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, .proto = IPPROTO_IPIP, - .eth_proto = htons(ETH_P_IP), - .owner = THIS_MODULE, - .init_flags = xfrm4_init_flags, - .init_tempsel = __xfrm4_init_tempsel, - .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 5d00e54cd319..dc19aff7c2e0 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -108,8 +108,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) pr_info("%s: can't remove xfrm handler for AF_INET\n", __func__); - if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipip_type, AF_INET); } module_init(ipip_init); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 081bb517e40d..521e3203e83a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != 
dev->ifindex) + /* prefix routes only use builtin fib6_nh */ + if (rt->nh) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr *ifa; - int flag = scope; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - + in_dev_for_each_ifa_rtnl(ifa, in_dev) { addr.s6_addr32[3] = ifa->ifa_local; if (ifa->ifa_scope == RT_SCOPE_LINK) @@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + /* host routes only use builtin fib6_nh */ + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5352708b7b2d..ef37e0574f54 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -208,7 +208,7 @@ lookup_protocol: np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect; + np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, + size_t)); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + sk, msg, size); +} + +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); + + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + sk, msg, 
size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif @@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, @@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 68b9e92e469e..25e1172fd1c3 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -793,9 +793,7 @@ static void __exit ah6_fini(void) if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); - + xfrm_unregister_type(&ah6_type, AF_INET6); } module_init(ah6_init); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ae6a739c5f52..a3b403ba8f8f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -41,8 +41,6 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. 
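/*
 * Illustrative sketch (not the kernel's macro): the idea behind the
 * INDIRECT_CALL_2() uses in inet6_sendmsg()/inet6_recvmsg() above.  With
 * retpolines, indirect calls are expensive, so comparing the pointer against
 * the expected targets turns the common cases into direct, predictable calls.
 * This simplified macro only mirrors the shape of the real helper.
 */
#include <stdio.h>

#define CALL_2(fptr, f2, f1, ...)				\
	((fptr) == (f2) ? (f2)(__VA_ARGS__) :			\
	 (fptr) == (f1) ? (f1)(__VA_ARGS__) : (fptr)(__VA_ARGS__))

static int tcp_like_send(int len) { return len; }
static int udp_like_send(int len) { return len + 1; }

int main(void)
{
	int (*send_fn)(int) = tcp_like_send;

	/* compiles to a compare plus direct call for the two known targets */
	printf("%d\n", CALL_2(send_fn, tcp_like_send, udp_like_send, 100));
	return 0;
}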
* @@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -687,21 +685,6 @@ out: return ret; } -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - if (x->props.mode != XFRM_MODE_TUNNEL) - net_adj = sizeof(struct ipv6hdr); - else - net_adj = 0; - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = { .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, - .get_mtu = esp6_get_mtu, .input = esp6_input, .output = esp6_output, .hdr_offset = xfrm6_find_1stfragopt, @@ -951,8 +933,7 @@ static void __exit esp6_fini(void) { if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp6_type, AF_INET6); } module_init(esp6_init); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d0d8528b294a..e31626ffccd1 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -336,9 +336,7 @@ static int __init esp6_offload_init(void) static void __exit esp6_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375b4b4f9bf5..62c997201970 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -75,9 +75,9 @@ * * On SMP we have one ICMP socket per-cpu. 
*/ -static inline struct sock *icmpv6_sk(struct net *net) +static struct sock *icmpv6_sk(struct net *net) { - return *this_cpu_ptr(net->ipv6.icmp_sk); + return this_cpu_read(*net->ipv6.icmp_sk); } static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; memset(&fl6, 0, sizeof(fl6)); + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); + fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b2a55f300318..cf60fae9533b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, &in6addr_any, hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9180c8b6f764..49884f96232b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - + /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; @@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack 
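/*
 * Illustrative sketch (not part of this patch): net.ipv6.flowlabel_reflect is
 * now treated as a bitmask, with FLOWLABEL_REFLECT_ESTABLISHED consulted for
 * established sockets (af_inet6.c above) and
 * FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES checked in icmpv6_echo_reply() above.
 * The numeric bit values used here (1 and 4) are assumptions taken from the
 * uapi/Documentation of this series.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int reflect_labels_for_established_and_echo(void)
{
	const char *val = "5";	/* bit 0: established sockets, bit 2: echo replies */
	int fd = open("/proc/sys/net/ipv6/flowlabel_reflect", O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) == (ssize_t)strlen(val))
		ret = 0;
	close(fd);
	return ret;
}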
*extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; @@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w) struct fib6_info *rt; for_each_fib6_walker_rt(w) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } + w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the @@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); @@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; @@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, + .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; @@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); - arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; } - /* fib entries are never clones */ - if (arg.filter.flags & RTM_F_CLONED) - goto out; - w = (void *)cb->args[2]; if (!w) { /* New dump: @@ -895,16 +897,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. 
The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +struct fib6_nh_pcpu_arg { + struct fib6_info *from; + const struct fib6_table *table; +}; + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_pcpu_arg *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg->from, arg->table); + return 0; +} + +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + struct fib6_nh_pcpu_arg arg = { + .from = f6i, + .table = table + }; + + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, + &arg); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); + + if (rt->nh && !list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1101,11 +1147,13 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); @@ -1130,11 +1178,13 @@ add: return -ENOENT; } - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_REPLACE, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); @@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. 
* <destination addr>/<source addr> @@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { + if (rt->nh) + list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, if (plen == fn->fn_bit) return fn; - prev = fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; next: /* @@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + fib6_info_release(rt); } @@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; @@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? 
dev->name : ""); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 545e339b8c4f..ad284b1fd308 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> +#include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock); static DEFINE_SPINLOCK(ip6_sk_fl_lock); +DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); +EXPORT_SYMBOL(ipv6_flowlabel_exclusive); + #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ @@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static bool fl_shared_exclusive(struct ip6_flowlabel *fl) +{ + return fl->share == IPV6_FL_S_EXCL || + fl->share == IPV6_FL_S_PROCESS || + fl->share == IPV6_FL_S_USER; +} + static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); @@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head) static void fl_free(struct ip6_flowlabel *fl) { - if (fl) - call_rcu(&fl->rcu, fl_free_rcu); + if (!fl) + return; + + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); + + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) @@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(fl6_sock_lookup); +EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { @@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_deferred_inc(&ipv6_flowlabel_exclusive); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: @@ -854,6 +872,7 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { + static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 21efcd02f337..8e49fd62eea9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) 
{ + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on 
packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. 
*/ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -766,91 +891,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. 
*/ err = output(net, sk, frag); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 51fd33294c7c..3752bd3e92ce 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void) { if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); } module_init(ipcomp6_init); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 91801432878c..878fcec14949 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -499,10 +499,8 @@ static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); - if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); - if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(destopt)\n", __func__); + xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); + xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09dd2edfb868..083cc1c94cd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - + /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); - if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..61819ed858b1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, 
&ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. + */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, +#if IS_ENABLED(CONFIG_SYN_COOKIES) + .cookie_init_sequence = __cookie_v6_init_sequence, + .cookie_v6_check = __cookie_v6_check, +#endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + .br_defrag = nf_ct_frag6_gather, +#endif +#if IS_MODULE(CONFIG_IPV6) + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 41325d517478..e77ea1ed5edd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -3,272 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = 
net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. 
- */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, 
skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. 
- */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 84322ce81d70..398e1df41406 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -54,26 +54,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = 
&init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. 
@@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } +static void nf_ct_net_pre_exit(struct net *net) +{ + fqdir_pre_exit(net->nf_frag.fqdir); +} + static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { - .init = nf_ct_net_init, - .exit = nf_ct_net_exit, + .init = nf_ct_net_init, + .pre_exit = nf_ct_net_pre_exit, + .exit = nf_ct_net_exit, }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..bbff3e02e302 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 70693bc7ad9d..8a6131991e38 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2b2c0c38b87..ca05b16f1bb9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = 
-skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -250,7 +248,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } +static void __net_exit ipv6_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv6.fqdir); +} + static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { - .init = ipv6_frags_init_net, - .exit = ipv6_frags_exit_net, + .init = ipv6_frags_init_net, + .pre_exit = ipv6_frags_pre_exit_net, + .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { @@ -587,8 +585,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a843cf164c..4d2e6b31a8d6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh 
*nh, u32 fib6_flags, int oif, int strict); -static size_t rt6_nlmsg_size(struct fib6_info *rt); +static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } @@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if (!match->fib6_nsiblings || have_oif_match) + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ - if (!fl6->mp_hash) + if (!fl6->mp_hash && + (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (unlikely(match->nh)) { + nexthop_path_fib6_result(res, fl6->mp_hash); + return; + } + + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, return false; } +struct fib6_nh_dm_arg { + struct net *net; + const struct in6_addr *saddr; + int oif; + int flags; + struct fib6_nh *nh; +}; + +static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_dm_arg *arg = _arg; + + arg->nh = nh; + return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, + arg->flags); +} + +/* returns fib6_nh from nexthop or NULL */ +static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, + struct fib6_result *res, + const struct in6_addr *saddr, + int oif, int flags) +{ + struct fib6_nh_dm_arg arg = { + .net = net, + .saddr = saddr, + .oif = oif, + .flags = flags, + }; + + if (nexthop_is_blackhole(nh)) + return NULL; + + if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) + return arg.nh; + + return NULL; +} + static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { @@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; - if (__rt6_device_match(net, nh, saddr, oif, flags)) { + bool matched = false; + + if (unlikely(spf6i->nh)) { + nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, + oif, flags); + if (nh) + matched = true; + } else { + nh = spf6i->fib6_nh; + if 
(__rt6_device_match(net, nh, saddr, oif, flags)) + matched = true; + } + if (matched) { res->f6i = spf6i; goto out; } @@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } + if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; + return; + +out_blackhole: + res->fib6_flags |= RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -691,6 +766,24 @@ out: return rc; } +struct fib6_nh_frl_arg { + u32 flags; + int oif; + int strict; + int *mpri; + bool *do_rr; + struct fib6_nh *nh; +}; + +static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_frl_arg *arg = _arg; + + arg->nh = nh; + return find_match(nh, arg->flags, arg->oif, arg->strict, + arg->mpri, arg->do_rr); +} + static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, @@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { + bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { @@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; - if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + if (unlikely(f6i->nh)) { + struct fib6_nh_frl_arg arg = { + .flags = f6i->fib6_flags, + .oif = oif, + .strict = strict, + .mpri = mpri, + .do_rr = do_rr + }; + + if (nexthop_is_blackhole(f6i->nh)) { + res->fib6_flags = RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->f6i = f6i; + res->nh = nexthop_fib6_nh(f6i->nh); + return; + } + if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, + &arg)) { + matched = true; + nh = arg.nh; + } + } else { + nh = f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, + mpri, do_rr)) + matched = true; + } + if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; @@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1114,6 +1234,8 @@ restart: rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; + } else if (res.fib6_flags & RTF_REJECT) { + goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, @@ -1125,6 +1247,7 @@ restart: if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { +do_create: rt = ip6_create_rt_rcu(&res); } @@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { - struct rt6_info *pcpu_rt, **p; - - p = this_cpu_ptr(res->f6i->rt6i_pcpu); - pcpu_rt = *p; + struct rt6_info *pcpu_rt; - if (pcpu_rt) - ip6_hold_safe(NULL, 
&pcpu_rt); + pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); return pcpu_rt; } @@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1539,7 +1707,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent 
rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_info *f6i = arg; + + fib6_nh_flush_exceptions(nh, f6i); + + return 0; +} + +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + if (f6i->nh) + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, + f6i); + else + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1612,25 +1801,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +struct fib6_nh_excptn_arg { + struct rt6_info *rt; + int plen; +}; + +static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_excptn_arg *arg = _arg; + int err; + + err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); + if (err == 0) + return 1; + + return 0; +} + +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + if (from->nh) { + struct fib6_nh_excptn_arg arg = { + .rt = rt, + .plen = from->fib6_src.plen + }; + int rc; + + /* rc = 1 means an entry was found */ + rc = nexthop_for_each_fib6_nh(from->nh, + rt6_nh_remove_exception_rt, + &arg); + return rc ? 0 : -ENOENT; + } + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} + +struct fib6_nh_match_arg { + const struct net_device *dev; + const struct in6_addr *gw; + struct fib6_nh *match; +}; + +/* determine if fib6_nh has given device and gateway */ +static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_match_arg *arg = _arg; + + if (arg->dev != nh->fib_nh_dev || + (arg->gw && !nh->fib_nh_gw_family) || + (!arg->gw && nh->fib_nh_gw_family) || + (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) + return 0; + + arg->match = nh; + + /* found a match, break the loop */ + return 1; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + struct fib6_nh *fib6_nh; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + if (from->nh) { + struct fib6_nh_match_arg arg = { + .dev = rt->dst.dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); + + if (!arg.match) + return; + fib6_nh = arg.match; + } else { + fib6_nh = from->fib6_nh; + } + fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if 
(bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +struct fib6_nh_age_excptn_arg { + struct fib6_gc_args *gc_args; + unsigned long now; +}; + +static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_age_excptn_arg *arg = _arg; + + fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); + return 0; +} + +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + if (f6i->nh) { + struct fib6_nh_age_excptn_arg arg = { + .gc_args = gc_args, + .now = now + }; + + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, + &arg); + } else { + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); + } +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - rcu_read_unlock(); - - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). + * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. 
*/ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.basic.ip_proto = fl6->flowi6_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + struct flow_keys keys; + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, 0); + flkeys = &keys; + } + + /* Inner can be v4 or v6 */ + if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.tags.flow_label = flkeys->tags.flow_label; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2138,6 +2462,7 @@ struct 
dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt6->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev + gw. Should be impossible. 
+ */ + if (!arg.match) { + rcu_read_unlock(); + return; + } + + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res, return true; } +struct fib6_nh_rd_arg { + struct fib6_result *res; + struct flowi6 *fl6; + const struct in6_addr *gw; + struct rt6_info **ret; +}; + +static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_rd_arg *arg = _arg; + + arg->res->nh = nh; + return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; + struct fib6_nh_rd_arg arg = { + .res = &res, + .fl6 = fl6, + .gw = &rdfl->gateway, + .ret = &ret + }; struct fib6_info *rt; struct fib6_node *fn; @@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; - if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) - goto out; + if (unlikely(rt->nh)) { + if (nexthop_is_blackhole(rt->nh)) + continue; + /* on match, res->nh is filled in and potentially ret */ + if (nexthop_for_each_fib6_nh(rt->nh, + fib6_nh_redirect_match, + &arg)) + goto out; + } else { + res.nh = rt->fib6_nh; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, + &ret)) + goto out; + } } if (!rt) @@ -2554,7 +2954,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -2781,10 +3181,9 @@ out: return entries > rt_max_size; } -static struct rt6_info *ip6_nh_lookup_table(struct net *net, - struct fib6_config *cfg, - const struct in6_addr *gw_addr, - u32 tbid, int flags) +static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, + const struct in6_addr *gw_addr, u32 tbid, + int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; - struct rt6_info *rt; + int err; table = fib6_get_table(net, tbid); if (!table) - return NULL; + return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); - /* if table lookup failed, fall back to full lookup */ - if (rt == net->ipv6.ip6_null_entry) { - ip6_rt_put(rt); - rt = NULL; - } + err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) + fib6_select_path(net, res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); - return rt; + return err; } static int ip6_route_check_nh_onlink(struct net *net, @@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net, const struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; - u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; - struct fib6_info *from; - struct rt6_info *grt; + struct fib6_result res = {}; int err; - err = 0; - grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); - if (grt) { - rcu_read_lock(); - from = rcu_dereference(grt->from); - if (!grt->dst.error && - /* ignore match if it is the default route */ - from && !ipv6_addr_any(&from->fib6_dst.addr) && - (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); - err = -EINVAL; - } - rcu_read_unlock(); - - ip6_rt_put(grt); + err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); + if (!err && !(res.fib6_flags & RTF_REJECT) && + /* ignore match if it is the default route */ + !ipv6_addr_any(&res.f6i->fib6_dst.addr) && + (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); + err = -EINVAL; } return err; @@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net, { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; - struct rt6_info *grt = NULL; + int flags = RT6_LOOKUP_F_IFACE; + struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { - int flags = RT6_LOOKUP_F_IFACE; - - grt = ip6_nh_lookup_table(net, cfg, gw_addr, - cfg->fc_table, flags); - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } + err = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags, &res); + /* gw_addr can not require a gateway or resolve to a reject + * route. If a device is given, it must match the result. 
+ */ + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family || + (dev && dev != res.nh->fib_nh_dev)) + err = -EHOSTUNREACH; } - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); + if (err < 0) { + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + }; - if (!grt) - goto out; + err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family) + err = -EHOSTUNREACH; + if (err) + return err; + + fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); + } + + err = 0; if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (dev != res.nh->fib_nh_dev) + err = -EHOSTUNREACH; } else { - *_dev = dev = grt->dst.dev; - *idev = grt->rt6i_idev; + *_dev = dev = res.nh->fib_nh_dev; dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + *idev = in6_dev_get(dev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - - ip6_rt_put(grt); - -out: return err; } @@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, goto out; } + rcu_read_lock(); + if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, idev); + rcu_read_unlock(); + if (err) goto out; } @@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; } } - goto set_dev; + goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { @@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; -set_dev: + +pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; @@ -3095,6 +3496,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; + struct nexthop *nh = NULL; struct fib6_table *table; + struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; @@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif + if (cfg->fc_nh_id) { + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out; + } + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out; + } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; @@ -3197,19 +3642,35 @@ 
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); - if (err) - goto out; + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + goto out; + } + if (rt->fib6_src.plen) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto out; + } + rt->nh = nh; + fib6_nh = nexthop_fib6_nh(rt->nh); + } else { + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes - */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + fib6_nh = rt->fib6_nh; + + /* We cannot add true routes via loopback here, they would + * result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, + addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + info->skip_notify_kernel = 1; + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, + rt->fib6_nsiblings, + NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -3323,7 +3790,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3339,10 +3806,49 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + +struct fib6_nh_del_cached_rt_arg { + struct fib6_config *cfg; + struct fib6_info *f6i; +}; + +static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_del_cached_rt_arg *arg = _arg; + int rc; + + rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); + return rc != -ESRCH ? 
rc : 0; +} + +static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) +{ + struct fib6_nh_del_cached_rt_arg arg = { + .cfg = cfg, + .f6i = f6i + }; + + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + if (rt->nh && cfg->fc_nh_id && + rt->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; - int rc; - - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + int rc = 0; + + if (rt->nh) { + rc = ip6_del_cached_rt_nh(cfg, rt); + } else if (cfg->fc_nh_id) { + continue; + } else { + nh = rt->fib6_nh; + rc = ip6_del_cached_rt(cfg, rt, nh); + } + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) + continue; + if (cfg->fc_protocol && + cfg->fc_protocol != rt->fib6_protocol) + continue; + + if (rt->nh) { + if (!fib6_info_hold_safe(rt)) + continue; + rcu_read_unlock(); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + if (cfg->fc_nh_id) + continue; + + nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) - continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) - continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); @@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev. 
Should be impossible + */ + if (!arg.match) + goto out; + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + /* these routes do not use nexthops */ + if (rt->nh) + continue; + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh; + + /* RA routes do not use nexthops */ + if (rt->nh) + continue; + nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) @@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (!rt->nh && + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh; + /* RA routes do not use nexthops */ + if (rt->nh) + return 0; + + nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. 
*/ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) return NULL; } +/* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && !rt->nh && + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +/* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if 
(iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.fib6_null_entry) + if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. 
jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); + arg->f6i = f6i; + if (f6i->nh) { + /* fib6_nh_mtu_change only returns 0, so this is safe */ + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, + arg); } - return 0; + + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + if (tb[RTA_NH_ID]) { + if (tb[RTA_GATEWAY] || tb[RTA_OIF] || + tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + goto errout; + } + cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); + } + if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } + if (list_empty(&rt6_nh_list)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; + /* For add and replace, send one notification with all nexthops. For + * append, send one notification with all appended nexthops. + */ + info->skip_notify_kernel = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); @@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; + err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, + rt_notif, nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + /* success ... 
tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; @@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_nh_id && + !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } + if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { @@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct fib6_info *rt) +/* add the overhead of this fib6_nh to nexthop_len */ +static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { - int nexthop_len = 0; + int *nexthop_len = arg; - if (rt->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16); /* RTA_GATEWAY */ - nexthop_len *= rt->fib6_nsiblings; + if (nh->fib_nh_lws) { + /* RTA_ENCAP_TYPE */ + *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); + /* RTA_ENCAP */ + *nexthop_len += nla_total_size(2); + } + + return 0; +} + +static size_t rt6_nlmsg_size(struct fib6_info *f6i) +{ + int nexthop_len; + + if (f6i->nh) { + nexthop_len = nla_total_size(4); /* RTA_NH_ID */ + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, + &nexthop_len); + } else { + struct fib6_nh *nh = f6i->fib6_nh; + + nexthop_len = 0; + if (f6i->fib6_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + lwtunnel_get_encap_size(nh->fib_nh_lws); + + nexthop_len *= f6i->fib6_nsiblings; + } + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, + unsigned char *flags) +{ + if (nexthop_is_multipath(nh)) { + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (nexthop_mpath_fill_node(skb, nh)) + goto nla_put_failure; + + nla_nest_end(skb, mp); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = nexthop_fib6_nh(nh); + if (fib_nexthop_info(skb, &fib6_nh->nh_common, + flags, false) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt6 = (struct rt6_info *)dst; struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; + unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; @@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, 
&rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); - } else { - unsigned char nh_flags = 0; + } else if (rt->nh) { + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) + goto nla_put_failure; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (nexthop_is_blackhole(rt->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; + } else { + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4836,10 +5516,28 @@ nla_put_failure: return -EMSGSIZE; } +static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + if (nh->fib_nh_dev == dev) + return 1; + + return 0; +} + static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->nh) { + struct net_device *_dev = (struct net_device *)dev; + + return !!nexthop_for_each_fib6_nh(f6i->nh, + fib6_info_nh_uses_dev, + _dev); + } + + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, return false; } -int rt6_dump_route(struct fib6_info *rt, void *p_arg) +struct fib6_nh_exception_dump_walker { + struct rt6_rtnl_dump_arg *dump; + struct fib6_info *rt; + unsigned int flags; + unsigned int skip; + unsigned int count; +}; + +static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_nh_exception_dump_walker *w = arg; + struct rt6_rtnl_dump_arg *dump = w->dump; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i, err; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); + if (!bucket) + return 0; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (w->skip) { + w->skip--; + continue; + } + + /* Expiration of entries doesn't bump sernum, insertion + * does. Removal is triggered by insertion, so we can + * rely on the fact that if entries change between two + * partial dumps, this node is scanned again completely, + * see rt6_insert_exception() and fib6_dump_table(). + * + * Count expired entries we go through as handled + * entries that we'll skip next time, in case of partial + * node dump. Otherwise, if entries expire meanwhile, + * we'll skip the wrong amount. 
+ */ + if (rt6_check_expired(rt6_ex->rt6i)) { + w->count++; + continue; + } + + err = rt6_fill_node(dump->net, dump->skb, w->rt, + &rt6_ex->rt6i->dst, NULL, NULL, 0, + RTM_NEWROUTE, + NETLINK_CB(dump->cb->skb).portid, + dump->cb->nlh->nlmsg_seq, w->flags); + if (err) + return err; + + w->count++; + } + bucket++; + } + + return 0; +} + +/* Return -1 if done with node, number of handled routes on partial dump */ +int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; + int count = 0; if (rt == net->ipv6.fib6_null_entry) - return 0; + return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ - return 1; + return -1; } - if (filter->filter_set) { - if ((filter->rt_type && rt->fib6_type != filter->rt_type) || - (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || - (filter->protocol && rt->fib6_protocol != filter->protocol)) { - return 1; - } + if (filter->filter_set && + ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol))) { + return -1; + } + + if (filter->filter_set || + !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } - return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, - RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, flags); + if (filter->dump_routes) { + if (skip) { + skip--; + } else { + if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, + 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, flags)) { + return 0; + } + count++; + } + } + + if (filter->dump_exceptions) { + struct fib6_nh_exception_dump_walker w = { .dump = arg, + .rt = rt, + .flags = flags, + .skip = skip, + .count = 0 }; + int err; + + rcu_read_lock(); + if (rt->nh) { + err = nexthop_for_each_fib6_nh(rt->nh, + rt6_nh_dump_exceptions, + &w); + } else { + err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); + } + rcu_read_unlock(); + + if (err) + return count += w.count; + } + + return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, @@ -5126,6 +5922,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; @@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 
e15cd37024fd..dc4c91e0bfb8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ static int zero; static int one = 1; +static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a14ea37d2df..d56a9019a0fe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } @@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - if (sk) - mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) { + mark = inet_twsk(sk)->tw_mark; + /* autoflowlabel relies on buff->hash */ + skb_set_hash(buff, inet_twsk(sk)->tw_txhash, + PKT_HASH_TYPE_L4); + } else { + mark = sk->sk_mark; + } + buff->tstamp = tcp_transmit_time(sk); + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif + __be32 label = 0; + struct net *net; int oif = 0; if (th->rst) @@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); @@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
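The flowlabel_reflect sysctl above now carries individual flag bits, so its handler switches from proc_dointvec to proc_dointvec_minmax and is clamped to the 0..7 bitmask range via extra1/extra2. A generic sketch of that ctl_table pattern, using a hypothetical knob name rather than the entry in this file:

/* Clamping an integer sysctl with proc_dointvec_minmax: extra1/extra2
 * point at the allowed minimum and maximum. Hypothetical example entry.
 */
#include <linux/sysctl.h>

static int example_min;			/* 0 */
static int example_max = 0x7;		/* three flag bits */
static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_flags",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{ }
};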
*/ - sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = inet6_lookup_listener(net, &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, @@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) trace_tcp_send_reset(sk, skb); + if (sk->sk_state == TCP_TIME_WAIT) + label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + } else { + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, + label); #ifdef CONFIG_TCP_MD5SIG out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70b01bd95022..827fe7385078 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,16 +54,6 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, - struct udp_hslot *hslot2, struct sk_buff *skb) + int dif, int sdif, struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness; @@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, @@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result; - bool exact_dif = udp6_lib_exact_dif_match(net, skb); hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, + daddr, hnum, dif, sdif, hslot2, skb); if (!result) { hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); @@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, - exact_dif, hslot2, - skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); @@ -1332,7 +1319,7 @@ do_udp_sendmsg: fl6.flowlabel = 
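When the FLOWLABEL_REFLECT_TCP_RESET bit is set, the RST sent for an unmatched segment above copies the flow label from the incoming IPv6 header via ip6_flowlabel(), which just masks the first 32 bits of the header. An equivalent standalone sketch of that extraction, written as a hypothetical userspace helper:

/* Extracting the 20-bit IPv6 flow label from the first word of the
 * header, equivalent to the kernel's ip6_flowlabel() helper. The result
 * stays in network byte order, like the kernel's __be32.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define EXAMPLE_FLOWLABEL_MASK htonl(0x000FFFFF)

static uint32_t example_ipv6_flowlabel(const void *ipv6_header)
{
	uint32_t first_word;

	/* Version (4 bits), traffic class (8 bits), flow label (20 bits). */
	memcpy(&first_word, ipv6_header, sizeof(first_word));
	return first_word & EXAMPLE_FLOWLABEL_MASK;
}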
sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -1384,7 +1371,7 @@ do_udp_sendmsg: } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 5bdca3d5d6b7..78daadecbdef 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -21,137 +21,6 @@ #include <net/ipv6.h> #include <net/addrconf.h> -static void -__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi6 *fl6 = &fl->u.ip6; - - /* Initialize temporary selector matching only - * to current session. */ - *(struct in6_addr *)&sel->daddr = fl6->daddr; - *(struct in6_addr *)&sel->saddr = fl6->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl6->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl6->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET6; - sel->prefixlen_d = 128; - sel->prefixlen_s = 128; - sel->proto = fl6->flowi6_proto; - sel->ifindex = fl6->flowi6_oif; -} - -static void -xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) - memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); - memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) - memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET6; -} - -/* distribution counting sort function for xfrm_state and xfrm_tmpl */ -static int -__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) -{ - int count[XFRM_MAX_DEPTH] = { }; - int class[XFRM_MAX_DEPTH]; - int i; - - for (i = 0; i < n; i++) { - int c; - class[i] = c = cmp(src[i]); - count[c]++; - } - - for (i = 2; i < maxclass; i++) - count[i] += count[i - 1]; - - for (i = 0; i < n; i++) { - dst[count[class[i] - 1]++] = src[i]; - src[i] = NULL; - } - - return 0; -} - -/* - * Rule for xfrm_state: - * - * rule 1: select IPsec transport except AH - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec transport AH - * rule 4: select IPsec tunnel - * rule 5: others - */ -static int __xfrm6_state_sort_cmp(void *p) -{ - struct xfrm_state *v = p; - - switch (v->props.mode) { - case XFRM_MODE_TRANSPORT: - if (v->id.proto != IPPROTO_AH) - return 1; - else - return 3; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 4; - } - return 5; -} - -static int -__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_state_sort_cmp, 6); -} - -/* - * Rule for xfrm_tmpl: - * - * rule 1: select IPsec transport - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec tunnel - * rule 4: others - */ -static int __xfrm6_tmpl_sort_cmp(void *p) -{ - struct xfrm_tmpl *v = p; - switch (v->mode) { - case XFRM_MODE_TRANSPORT: - return 1; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - 
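The fl6_sock_lookup() conversions in tcp_ipv6.c and udp.c above (and in l2tp_ip6.c further down) all follow from the helper now returning an ERR_PTR-encoded error instead of NULL, so callers test with IS_ERR(). A minimal sketch of the converted call pattern, wrapped in a hypothetical caller:

/* Callers of fl6_sock_lookup() after this series: the result is either a
 * valid ip6_flowlabel or an ERR_PTR, so check with IS_ERR() rather than
 * against NULL. Hypothetical surrounding function.
 */
#include <linux/err.h>
#include <net/ipv6.h>

static int example_resolve_flowlabel(struct sock *sk, struct flowi6 *fl6)
{
	struct ip6_flowlabel *flowlabel;

	if (!(fl6->flowlabel & IPV6_FLOWLABEL_MASK))
		return 0;

	flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
	if (IS_ERR(flowlabel))		/* was: if (!flowlabel) */
		return -EINVAL;

	fl6_sock_release(flowlabel);
	return 0;
}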
case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 3; - } - return 4; -} - -static int -__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_tmpl_sort_cmp, 5); -} - int xfrm6_extract_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, - .eth_proto = htons(ETH_P_IPV6), - .owner = THIS_MODULE, - .init_tempsel = __xfrm6_init_tempsel, - .init_temprop = xfrm6_init_temprop, - .tmpl_sort = __xfrm6_tmpl_sort, - .state_sort = __xfrm6_state_sort, .output = xfrm6_output, .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, diff --git a/net/key/af_key.c b/net/key/af_key.c index a50dd6f34b91..b67ed3a8486c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -928,8 +928,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); @@ -944,8 +943,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { @@ -2438,8 +2436,10 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); - if (err < 0) + if (err < 0) { + kfree_skb(out_skb); goto out; + } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; @@ -2690,8 +2690,10 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); - if (err < 0) + if (err < 0) { + kfree_skb(out_skb); return err; + } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 6e2b4b9267e1..35bb4f3bdbe0 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -31,7 +31,6 @@ #include "l2tp_core.h" static struct dentry *rootdir; -static struct dentry *tunnels; struct l2tp_dfs_seq_data { struct net *net; @@ -326,32 +325,18 @@ static const struct file_operations l2tp_dfs_fops = { static int __init l2tp_debugfs_init(void) { - int rc = 0; - rootdir = debugfs_create_dir("l2tp", NULL); - if (IS_ERR(rootdir)) { - rc = PTR_ERR(rootdir); - rootdir = NULL; - goto out; - } - tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); - if (tunnels == NULL) - rc = -EIO; + debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); pr_info("L2TP debugfs support\n"); -out: - if (rc) - pr_warn("unable to init\n"); - - return rc; + return 0; } static void __exit l2tp_debugfs_exit(void) { - debugfs_remove(tunnels); - debugfs_remove(rootdir); + debugfs_remove_recursive(rootdir); } module_init(l2tp_debugfs_init); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 1a76a0a4e3ab..687e23a8b326 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -536,7 +536,7 @@ static int l2tp_ip6_sendmsg(struct sock 
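The l2tp_debugfs change above (and the mac80211 debugfs changes later in this diff) drops the return-value checks around debugfs_create_dir()/debugfs_create_file(): callers are expected to ignore debugfs creation errors, and a single debugfs_remove_recursive() on the root directory tears everything down. A sketch of the resulting module skeleton, using a hypothetical module rather than l2tp:

/* Simplified debugfs usage: no checks on creation, one recursive removal
 * on exit. Hypothetical example module following the pattern above.
 */
#include <linux/module.h>
#include <linux/debugfs.h>

static struct dentry *example_root;
static u32 example_value;

static int __init example_init(void)
{
	example_root = debugfs_create_dir("example", NULL);
	debugfs_create_u32("value", 0600, example_root, &example_value);
	return 0;
}

static void __exit example_exit(void)
{
	debugfs_remove_recursive(example_root);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");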
*sk, struct msghdr *msg, size_t len) fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -577,7 +577,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index cfc9fcb97465..f35899d45a9a 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup + * This function does not hold refcnt on the returned dst. + * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, @@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct dst_entry *dst = NULL; struct net_device *dev; + WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); @@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); - - rcu_read_unlock(); } return dst; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 5d2d1f746b91..3c03f6512c5f 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -68,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb) lapb_put(lapb); } } -EXPORT_SYMBOL(lapb_register); /* * Add a socket to the bound sockets list. 
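With the l3mdev change above, l3mdev_link_scope_lookup() no longer takes rcu_read_lock() itself; the added comment and WARN_ON_ONCE() make the caller responsible for holding it, since the returned dst is not reference counted. A sketch of how a caller would now use it; the surrounding function is hypothetical:

/* Callers must hold rcu_read_lock() across l3mdev_link_scope_lookup()
 * and across any use of the returned, non-refcounted dst.
 */
#include <linux/rcupdate.h>
#include <net/flow.h>
#include <net/l3mdev.h>

static bool example_has_l3mdev_dst(struct net *net, struct flowi6 *fl6)
{
	struct dst_entry *dst;
	bool found;

	rcu_read_lock();
	dst = l3mdev_link_scope_lookup(net, fl6);
	found = dst != NULL;
	rcu_read_unlock();

	return found;
}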
@@ -115,7 +114,6 @@ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); - if (!lapb) goto out; @@ -167,6 +165,7 @@ out: write_unlock_bh(&lapb_list_lock); return rc; } +EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 3fae902937fd..76cc9e967fa6 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,6 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018-2019 Intel Corporation * Copyright (C) 2018 Intel Corporation */ @@ -974,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_BEACON | BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_TWT; int err; int prev_beacon_int; @@ -1044,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.dtim_period = params->dtim_period; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; + sdata->vif.bss_conf.twt_responder = params->twt_responder; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1465,7 +1468,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, return ret; } - if (params->supported_rates) { + if (params->supported_rates && params->supported_rates_len) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, sband, params->supported_rates, params->supported_rates_len, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 271bc2b676a4..2e7f75938c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -272,6 +272,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(EXT_KEY_ID_NATIVE), + FLAG(NO_AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 3509ce0daea3..7b8735ced2a1 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -339,9 +339,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); - if (!key->debugfs.dir) - return; - sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f1f2e1c7ac0c..b1438fd4d876 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -815,9 +815,8 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) sprintf(buf, "netdev:%s", sdata->name); sdata->vif.debugfs_dir = debugfs_create_dir(buf, sdata->local->hw.wiphy->debugfsdir); - if (sdata->vif.debugfs_dir) - sdata->debugfs.subdir_stations = debugfs_create_dir("stations", - sdata->vif.debugfs_dir); + sdata->debugfs.subdir_stations = debugfs_create_dir("stations", + sdata->vif.debugfs_dir); add_files(sdata); } @@ -842,8 +841,5 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) return; sprintf(buf, "netdev:%s", sdata->name); - if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf)) - sdata_err(sdata, - "debugfs: failed to rename debugfs dir to %s\n", - buf); + debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3fd79ccb293b..c8ad20c28c43 100644 --- 
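The lapb hunks above simply relocate a misplaced EXPORT_SYMBOL(lapb_register): it had been sitting after the static helper __lapb_remove_cb() and now follows lapb_register() itself, matching the usual convention that the export annotation immediately follows the function it exports. A trivial illustration with a hypothetical function:

/* Convention restored by the lapb change: EXPORT_SYMBOL() directly after
 * the exported function. Hypothetical example.
 */
#include <linux/module.h>

int example_register(void)
{
	return 0;
}
EXPORT_SYMBOL(example_register);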
a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -957,8 +957,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) * dir might still be around. */ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); - if (!sta->debugfs_dir) - return; DEBUGFS_ADD(flags); DEBUGFS_ADD(aid); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 157ff5f890d2..dd60f6428049 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -269,50 +269,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) assert_key_lock(local); sta->ptk_idx = key->conf.keyidx; + + if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT)) + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } -static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, - struct ieee80211_key *new_key, - bool pairwise) +static void ieee80211_pairwise_rekey(struct ieee80211_key *old, + struct ieee80211_key *new) { - struct ieee80211_sub_if_data *sdata; - struct ieee80211_local *local; - struct sta_info *sta; - int ret; - - /* Aggregation sessions are OK when running on SW crypto. - * A broken remote STA may cause issues not observed with HW - * crypto, though. - */ - if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) - return 0; + struct ieee80211_local *local = new->local; + struct sta_info *sta = new->sta; + int i; - assert_key_lock(old_key->local); - sta = old_key->sta; + assert_key_lock(local); - /* Unicast rekey without Extended Key ID needs special handling */ - if (new_key && sta && pairwise && - rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) { - local = old_key->local; - sdata = old_key->sdata; + if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { + /* Extended Key ID key install, initial one or rekey */ + + if (sta->ptk_idx != INVALID_PTK_KEYIDX && + ieee80211_hw_check(&local->hw, + NO_AMPDU_KEYBORDER_SUPPORT)) { + /* Aggregation Sessions with Extended Key ID must not + * mix MPDUs with different keyIDs within one A-MPDU. + * Tear down any running Tx aggregation and all new + * Rx/Tx aggregation request during rekey if the driver + * asks us to do so. (Blocking Tx only would be + * sufficient but WLAN_STA_BLOCK_BA gets the job done + * for the few ms we need it.) + */ + set_sta_flag(sta, WLAN_STA_BLOCK_BA); + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + ___ieee80211_stop_tx_ba_session(sta, i, + AGG_STOP_LOCAL_REQUEST); + mutex_unlock(&sta->ampdu_mlme.mtx); + } + } else if (old) { + /* Rekey without Extended Key ID. + * Aggregation sessions are OK when running on SW crypto. + * A broken remote STA may cause issues not observed with HW + * crypto, though. + */ + if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + return; - /* Stop TX till we are on the new key */ - old_key->flags |= KEY_FLAG_TAINTED; + /* Stop Tx till we are on the new key */ + old->flags |= KEY_FLAG_TAINTED; ieee80211_clear_fast_xmit(sta); - - /* Aggregation sessions during rekey are complicated due to the - * reorder buffer and retransmits. Side step that by blocking - * aggregation during rekey and tear down running sessions. 
- */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_LOCAL_REQUEST); } - if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", @@ -320,18 +331,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, /* Flushing the driver queues *may* help prevent * the clear text leaks and freezes. */ - ieee80211_flush_queues(local, sdata, false); + ieee80211_flush_queues(local, old->sdata, false); } } - - ieee80211_key_disable_hw_accel(old_key); - - if (new_key) - ret = ieee80211_key_enable_hw_accel(new_key); - else - ret = 0; - - return ret; } static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, @@ -389,7 +391,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, mutex_unlock(&sdata->local->key_mtx); } - static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, bool pairwise, @@ -397,7 +398,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *new) { int idx; - int ret; + int ret = 0; bool defunikey, defmultikey, defmgmtkey; /* caller must provide at least one old/new */ @@ -409,16 +410,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); + if (new && sta && pairwise) { + /* Unicast rekey needs special handling. With Extended Key ID + * old is still NULL for the first rekey. + */ + ieee80211_pairwise_rekey(old, new); + } + if (old) { idx = old->conf.keyidx; - ret = ieee80211_hw_key_replace(old, new, pairwise); + + if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + ieee80211_key_disable_hw_accel(old); + + if (new) + ret = ieee80211_key_enable_hw_accel(new); + } } else { /* new must be provided in case old is not */ idx = new->conf.keyidx; if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); - else - ret = 0; } if (ret) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 087c90e461b5..4c2702f128f3 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -352,11 +352,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata_lock(sdata); /* Copy the addresses to the bss_conf list */ - ifa = idev->ifa_list; + ifa = rtnl_dereference(idev->ifa_list); while (ifa) { if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN) bss_conf->arp_addr_list[c] = ifa->ifa_address; - ifa = ifa->ifa_next; + ifa = rtnl_dereference(ifa->ifa_next); c++; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a00b703bfdfd..a99ad0325309 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3156,6 +3156,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta, IEEE80211_HE_MAC_CAP0_TWT_RES; } +static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee802_11_elems *elems) +{ + bool twt = ieee80211_twt_req_supported(sta, elems); + + if (sdata->vif.bss_conf.twt_requester != twt) { + sdata->vif.bss_conf.twt_requester = twt; + return BSS_CHANGED_TWT; + } + return 0; +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -3338,8 +3351,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - bss_conf->twt_requester = - 
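The new ieee80211_recalc_twt_req() above centralizes the TWT requester decision: it compares the freshly computed capability against bss_conf and returns BSS_CHANGED_TWT only when the value actually changed, so both the association and beacon paths can OR it into their accumulated change mask. The same "recalculate and report a change bit" idiom in isolation, as a hypothetical standalone example:

/* Recalculate cached state and return a change flag only when the value
 * differs, so callers can accumulate change bits cheaply.
 */
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_CHANGED_TWT 0x1

struct example_conf {
	bool twt_requester;
};

static unsigned int recalc_twt(struct example_conf *conf, bool supported)
{
	if (conf->twt_requester != supported) {
		conf->twt_requester = supported;
		return EXAMPLE_CHANGED_TWT;
	}
	return 0;
}

int main(void)
{
	struct example_conf conf = { .twt_requester = false };
	unsigned int changed = 0;

	changed |= recalc_twt(&conf, true);	/* sets the change bit */
	changed |= recalc_twt(&conf, true);	/* no change, no bit */
	printf("changed mask: %#x\n", changed);
	return 0;
}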
ieee80211_twt_req_supported(sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3999,6 +4011,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, @@ -4949,7 +4963,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, basic_rates = BIT(min_rate_index); } - new_sta->sta.supp_rates[cbss->channel->band] = rates; + if (rates) + new_sta->sta.supp_rates[cbss->channel->band] = rates; + else + sdata_info(sdata, + "No rates found, keeping mandatory only\n"); + sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 6e5961d7f639..60ef8972b254 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -199,6 +199,10 @@ static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); + else + cfg80211_tx_mgmt_expired(&roc->sdata->wdev, + roc->mgmt_tx_cookie, + roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 47ee36677c2b..a1e9fc7878aa 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -354,8 +354,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, break; } WARN_ONCE(i == sband->n_bitrates, - "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n", + "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", + sta ? sta->addr : NULL, sta ? sta->supp_rates[sband->band] : -1, + sband->band, rate_mask, rate_flags); info->control.rates[0].count = @@ -366,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, } -bool rate_control_send_low(struct ieee80211_sta *pubsta, - void *priv_sta, - struct ieee80211_tx_rate_control *txrc) +static bool rate_control_send_low(struct ieee80211_sta *pubsta, + struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; @@ -376,7 +377,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) { + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -402,7 +403,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, } return false; } -EXPORT_SYMBOL(rate_control_send_low); static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { @@ -885,26 +885,29 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; - if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { - ista = &sta->sta; - priv_sta = sta->rate_ctrl_priv; - } - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } + if (rate_control_send_low(sta ? 
&sta->sta : NULL, txrc)) + return; + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; + if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { + ista = &sta->sta; + priv_sta = sta->rate_ctrl_priv; + } + if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { - ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index a34e9c2ca626..ee86c3333999 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, int delta; int sampling_ratio; - /* management/no-ack frames do not use rate control */ - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - /* check multi-rate-retry capabilities & adjust lookaround_rate */ mrr_capable = mp->has_mrr && !txrc->rts && diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 298a1acb3ce5..5a882da82f0e 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1093,9 +1093,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct minstrel_priv *mp = priv; int sample_idx; - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - if (!msp->is_ht) return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 187f62a48b2b..95eb8220e2e4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation */ #include <linux/module.h> @@ -401,6 +401,47 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); + for (i = 0; i < NUM_NL80211_BANDS; i++) { + u32 mandatory = 0; + int r; + + if (!hw->wiphy->bands[i]) + continue; + + switch (i) { + case NL80211_BAND_2GHZ: + /* + * We use both here, even if we cannot really know for + * sure the station will support both, but the only use + * for this is when we don't know anything yet and send + * management frames, and then we'll pick the lowest + * possible rate anyway. 
+ * If we don't include _G here, we cannot find a rate + * in P2P, and thus trigger the WARN_ONCE() in rate.c + */ + mandatory = IEEE80211_RATE_MANDATORY_B | + IEEE80211_RATE_MANDATORY_G; + break; + case NL80211_BAND_5GHZ: + mandatory = IEEE80211_RATE_MANDATORY_A; + break; + case NL80211_BAND_60GHZ: + WARN_ON(1); + mandatory = 0; + break; + } + + for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { + struct ieee80211_rate *rate; + + rate = &hw->wiphy->bands[i]->bitrates[r]; + + if (!(rate->flags & mandatory)) + continue; + sta->sta.supp_rates[i] |= BIT(r); + } + } + sta->sta.smps_mode = IEEE80211_SMPS_OFF; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1837734ce85b..32a45c03786e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -651,6 +651,17 @@ config NFT_TPROXY help This makes transparent proxy support available in nftables. +config NFT_SYNPROXY + tristate "Netfilter nf_tables SYNPROXY expression support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY expression allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 72cca6b48960..9270a7fae484 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,7 +78,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nft_chain_route.o + nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o @@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o +obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b96fd3f54705..5d5bdf450091 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -520,7 +520,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, ret = -EPERM; return ret; case NF_QUEUE: - ret = nf_queue(skb, state, e, s, verdict); + ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; @@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); - -int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) -{ - if (writable_len > skb->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (!skb_cloned(skb)) { - if (writable_len <= skb_headlen(skb)) - return 1; - } else if (skb_clone_writable(skb, writable_len)) - return 1; - - if (writable_len <= skb_headlen(skb)) - writable_len = 0; - else - writable_len -= skb_headlen(skb); - - return !!__pskb_pull_tail(skb, writable_len); -} -EXPORT_SYMBOL(skb_make_writable); - /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. 
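The removal of skb_make_writable() from nf_core above is paired with call-site conversions to skb_ensure_writable() later in this diff (ip_vs_app.c, ip_vs_core.c): note the return convention flips, from non-zero on success to 0 on success and a negative errno on failure, so every `if (!skb_make_writable(...))` becomes `if (skb_ensure_writable(...))`. A minimal sketch of a converted call site, with a hypothetical function around it:

/* skb_make_writable() returned true on success; skb_ensure_writable()
 * returns 0 on success, so the sense of the test flips at each call site.
 */
#include <linux/skbuff.h>
#include <linux/tcp.h>

static int example_mangle_tcp(struct sk_buff *skb, unsigned int thoff)
{
	struct tcphdr *th;

	/* was: if (!skb_make_writable(skb, thoff + sizeof(*th))) return 0; */
	if (skb_ensure_writable(skb, thoff + sizeof(*th)))
		return 0;

	th = (struct tcphdr *)(skb_network_header(skb) + thoff);
	th->check = 0;		/* a real caller would recompute the checksum */
	return 1;
}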
*/ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8acc4e173167..063df74b4647 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e3884b0cca91..11ff9d4a7006 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:ip type */ @@ -28,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index b73c37b3a791..ca7ac4a25ada 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -2,7 +2,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ @@ -28,7 +27,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index d8c140553379..704a0dda1609 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:port type */ @@ -23,7 +22,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3cdf171cd468..2e151856ad99 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 
2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ @@ -48,7 +48,7 @@ static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); @@ -1290,11 +1290,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; + int ret; - /* Second pass, so parser can't fail */ - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, ip_set_setname_policy, - NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_setname_policy, NULL); + if (ret) + return ret; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { @@ -1541,10 +1543,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, + ip_set_adt_policy, NULL); + if (ret) { + nlmsg_free(skb2); + return ret; + } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; @@ -1558,10 +1564,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_ad(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + enum ipset_adt adt, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1590,18 +1598,17 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1610,56 +1617,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, return ret; } -static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int ip_set_uadd(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { - struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; - const struct nlattr *nla; - u32 flags = flag_exist(nlh); - bool use_lineno; - int ret = 0; - - 
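The ip_set_core.c hunks above stop ignoring the return value of nla_parse_deprecated() in dump_init() and in the error-report branch of call_ad(): even a "second pass" parse can fail, so the error is now propagated (and the reply skb freed in the call_ad() case). A sketch of the guarded pattern with a hypothetical attribute set and policy:

/* Propagate nla_parse_deprecated() failures instead of assuming a second
 * parse cannot fail. Hypothetical attributes and policy.
 */
#include <net/netlink.h>

enum {
	EXAMPLE_ATTR_UNSPEC,
	EXAMPLE_ATTR_NAME,
	__EXAMPLE_ATTR_MAX,
};
#define EXAMPLE_ATTR_MAX (__EXAMPLE_ATTR_MAX - 1)

static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = 31 },
};

static int example_parse(struct nlattr **tb, const struct nlattr *head, int len)
{
	int ret;

	ret = nla_parse_deprecated(tb, EXAMPLE_ATTR_MAX, head, len,
				   example_policy, NULL);
	if (ret)	/* previously ignored */
		return ret;

	return 0;
}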
if (unlikely(protocol_min_failed(attr) || - !attr[IPSET_ATTR_SETNAME] || - !((attr[IPSET_ATTR_DATA] != NULL) ^ - (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] && - !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] && - (!flag_nested(attr[IPSET_ATTR_ADT]) || - !attr[IPSET_ATTR_LINENO])))) - return -IPSET_ERR_PROTOCOL; - - set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (!set) - return -ENOENT; - - use_lineno = !!attr[IPSET_ATTR_LINENO]; - if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, - use_lineno); - } else { - int nla_rem; + return ip_set_ad(net, ctnl, skb, + IPSET_ADD, nlh, attr, extack); +} - nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(*tb)); - if (nla_type(nla) != IPSET_ATTR_DATA || - !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, - flags, use_lineno); - if (ret < 0) - return ret; - } - } - return ret; +static int ip_set_udel(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + return ip_set_ad(net, ctnl, skb, + IPSET_DEL, nlh, attr, extack); } static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2384e36aef5c..2b8f959574b4 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
*/ /* Get Layer-4 data from the packets */ diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 10f619625abd..0feb77fa9edc 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef _IP_SET_HASH_GEN_H #define _IP_SET_HASH_GEN_H @@ -622,7 +621,7 @@ retry: goto cleanup; } m->size = AHASH_INIT_SIZE; - extsize = ext_size(AHASH_INIT_SIZE, dsize); + extsize += ext_size(AHASH_INIT_SIZE, dsize); RCU_INIT_POINTER(hbucket(t, key), m); } else if (m->pos >= m->size) { struct hbucket *ht; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 69d7576be2e6..f4432d9fcad0 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip type */ @@ -27,7 +26,7 @@ #define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 6fe1ec0d2154..7a1734aad0c5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * Copyright (C) 2013 Smoothwall Ltd. 
<vytas.dauksa@smoothwall.net> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 74ec7e097e34..32e240658334 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ced57d63b01f..15d419353179 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 905f6cf0f55e..7a4d7afd4121 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ @@ -31,7 +30,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 853e772ab4d9..d94c585d33c5 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ @@ -20,7 +19,7 @@ #define IPSET_TYPE_REV_MAX 0 MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); diff --git 
a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 06c91e49bf25..c259cbc3ef45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net type */ @@ -28,7 +27,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 0a8cbcdfb42b..87b29f971226 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,iface type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 832e4f5491cb..a3ae69bfee66 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> */ diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index a4f3f15b874a..799f2272cc65 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,port type */ @@ -30,7 +29,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index e54d415405f3..a82b70e8b9a6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ diff 
--git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8ada318bf09d..6f9ead6319e0 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the list:set type */ @@ -19,7 +18,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index bfd4365a8d73..4515056ef1c2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -358,7 +358,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); @@ -435,7 +435,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7138556b206b..46f06f92ab8f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -34,6 +34,8 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ +#include <net/gue.h> +#include <net/gre.h> #include <net/route.h> #include <net/ip6_checksum.h> #include <net/netns/generic.h> /* net_generic() */ @@ -892,7 +894,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 @@ -1282,7 +1284,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); - if (!skb_make_writable(skb, iph->len)) + if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ @@ -1574,6 +1576,73 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 1; } +/* Check the UDP tunnel and return its header length */ +static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct udphdr _udph, *udph; + struct ip_vs_dest *dest; + + udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!udph) + goto unk; + offset += sizeof(struct udphdr); + dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + struct guehdr _gueh, *gueh; + + gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); + if (!gueh) + goto unk; + if (gueh->control != 0 || gueh->version != 0) + goto unk; + /* Later we can support also IPPROTO_IPV6 */ + if 
(gueh->proto_ctype != IPPROTO_IPIP) + goto unk; + *proto = gueh->proto_ctype; + return sizeof(struct udphdr) + sizeof(struct guehdr) + + (gueh->hlen << 2); + } + +unk: + return 0; +} + +/* Check the GRE tunnel and return its header length */ +static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct gre_base_hdr _greh, *greh; + struct ip_vs_dest *dest; + + greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); + if (!greh) + goto unk; + dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 type; + + /* Only support version 0 and C (csum) */ + if ((greh->flags & ~GRE_CSUM) != 0) + goto unk; + type = greh->protocol; + /* Later we can support also IPPROTO_IPV6 */ + if (type != htons(ETH_P_IP)) + goto unk; + *proto = IPPROTO_IPIP; + return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + } + +unk: + return 0; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1593,6 +1662,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool ipip, new_cp = false; + union nf_inet_addr *raddr; *related = 1; @@ -1631,20 +1701,56 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { + struct ip_vs_dest *dest; + if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; + dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); + /* Only for known tunnel */ + if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) + return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ ipip = true; + } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ + cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ + /* Error for our tunnel must arrive at LOCAL_IN */ + (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { + __u8 iproto; + int ulen; + + /* Non-first fragment has no UDP header */ + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + offset2 = offset + cih->ihl * 4; + if (cih->protocol == IPPROTO_UDP) + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + else + ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + if (ulen > 0) { + /* Skip IP and UDP/GRE tunnel headers */ + offset = offset2 + ulen; + /* Now we should be at the original IP header */ + cih = skb_header_pointer(skb, offset, sizeof(_ciph), + &_ciph); + if (cih && cih->version == 4 && cih->ihl >= 5 && + iproto == IPPROTO_IPIP) + ipip = true; + else + return NF_ACCEPT; + } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); @@ -2245,7 +2351,6 @@ static const struct nf_hook_ops ip_vs_ops[] = { static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; - int ret; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) @@ -2277,17 +2382,11 @@ static int 
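Both new IPVS decap helpers above lean on skb_header_pointer() so that headers beyond the skb's linear area are copied into an on-stack buffer before being inspected, and they return the total encapsulation length so ip_vs_in_icmp() can step to the embedded inner IP header. A stripped-down sketch of that skb_header_pointer() pattern; the surrounding function is hypothetical and omits the tunnel endpoint match that ip_vs_find_tunnel() performs in this diff:

/* Safely read a UDP header at an arbitrary offset (copied to a stack
 * buffer when the skb is non-linear) and return the bytes to skip.
 */
#include <linux/skbuff.h>
#include <linux/udp.h>

static int example_udp_encap_len(struct sk_buff *skb, unsigned int offset)
{
	struct udphdr _udph, *udph;

	udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
	if (!udph)
		return 0;	/* truncated packet: not our tunnel */

	/* A real implementation would also match udph->dest against a
	 * configured tunnel endpoint before trusting the encapsulation.
	 */
	return sizeof(struct udphdr);
}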
__net_init __ip_vs_init(struct net *net) if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; - ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) - goto hook_fail; - return 0; /* * Error handling */ -hook_fail: - ip_vs_sync_net_cleanup(ipvs); sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: @@ -2317,6 +2416,19 @@ static void __net_exit __ip_vs_cleanup(struct net *net) net->ipvs = NULL; } +static int __net_init __ip_vs_dev_init(struct net *net) +{ + int ret; + + ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) + goto hook_fail; + return 0; + +hook_fail: + return ret; +} + static void __net_exit __ip_vs_dev_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -2336,6 +2448,7 @@ static struct pernet_operations ipvs_core_ops = { }; static struct pernet_operations ipvs_core_dev_ops = { + .init = __ip_vs_dev_init, .exit = __ip_vs_dev_cleanup, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 776c87ed4813..07e0967bf129 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -510,15 +510,37 @@ static inline unsigned int ip_vs_rs_hashkey(int af, static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; + __be16 port; if (dest->in_rs_table) return; + switch (IP_VS_DFWD_METHOD(dest)) { + case IP_VS_CONN_F_MASQ: + port = dest->port; + break; + case IP_VS_CONN_F_TUNNEL: + switch (dest->tun_type) { + case IP_VS_CONN_F_TUNNEL_TYPE_GUE: + port = dest->tun_port; + break; + case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + case IP_VS_CONN_F_TUNNEL_TYPE_GRE: + port = 0; + break; + default: + return; + } + break; + default: + return; + } + /* * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; @@ -550,7 +572,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } @@ -580,7 +603,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* Find real service record by <af,addr,tun_port>. + * In case of multiple records with the same <af,addr,tun_port>, only + * the first found record is returned. + * + * To be called under RCU lock. 
+ */ +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port) +{ + struct ip_vs_dest *dest; + unsigned int hash; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, tun_port); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->tun_port == tun_port && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } @@ -826,24 +879,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* Need to rehash? */ + if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != + IP_VS_DFWD_METHOD(dest) || + udest->tun_type != dest->tun_type || + udest->tun_port != dest->tun_port) + ip_vs_rs_unhash(dest); + /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { - /* - * Put the real service in rs_table if not present. - * For now only for NAT! - */ - ip_vs_rs_hash(ipvs, dest); /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); + /* Put the real service in rs_table if not present. */ + ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); @@ -2396,9 +2454,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { - mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(ipvs, dm->state); - mutex_unlock(&ipvs->sync_mutex); } goto out_dec; } @@ -2906,6 +2962,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3212,6 +3269,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3332,7 +3391,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh, *nla_tun_type, *nla_tun_port; + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; @@ -3340,6 +3400,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ 
-3355,6 +3416,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; @@ -3515,10 +3579,8 @@ static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); - mutex_unlock(&ipvs->sync_mutex); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index c244b2545e24..cf925906f59b 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -267,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { @@ -433,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index b58ddb7dffd1..a0921adc31a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { @@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 915ac8206076..000d961b97e4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -159,7 +159,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { @@ -237,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 379140075e95..153d89647c87 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -148,7 +148,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { @@ -231,7 +231,7 @@ udp_dnat_handler(struct sk_buff *skb, struct 
ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 2526be6b3d90..a4a78c4b06de 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -195,6 +195,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { + struct task_struct *task; struct netns_ipvs *ipvs; struct socket *sock; char *buf; @@ -374,8 +375,11 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs, max(IPVS_SYNC_SEND_DELAY, 1)); ms->sync_queue_len++; list_add_tail(&sb->list, &ms->sync_queue); - if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) - wake_up_process(ms->master_thread); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + + wake_up_process(ipvs->master_tinfo[id].task); + } } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); @@ -1636,8 +1640,10 @@ static void master_wakeup_work_handler(struct work_struct *work) spin_lock_bh(&ipvs->sync_lock); if (ms->sync_queue_len && ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; - wake_up_process(ms->master_thread); + wake_up_process(ipvs->master_tinfo[id].task); } spin_unlock_bh(&ipvs->sync_lock); } @@ -1703,10 +1709,6 @@ done: if (sb) ip_vs_sync_buff_release(sb); - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - return 0; } @@ -1740,11 +1742,6 @@ static int sync_thread_backup(void *data) } } - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - return 0; } @@ -1752,8 +1749,8 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, int state) { - struct ip_vs_sync_thread_data *tinfo = NULL; - struct task_struct **array = NULL, *task; + struct ip_vs_sync_thread_data *ti = NULL, *tinfo; + struct task_struct *task; struct net_device *dev; char *name; int (*threadfn)(void *data); @@ -1822,7 +1819,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { result = -EEXIST; - if (ipvs->backup_threads) + if (ipvs->backup_tinfo) goto out_early; ipvs->bcfg = *c; @@ -1849,28 +1846,22 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, master_wakeup_work_handler); ms->ipvs = ipvs; } - } else { - array = kcalloc(count, sizeof(struct task_struct *), - GFP_KERNEL); - result = -ENOMEM; - if (!array) - goto out; } + result = -ENOMEM; + ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), + GFP_KERNEL); + if (!ti) + goto out; for (id = 0; id < count; id++) { - result = -ENOMEM; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto out; + tinfo = &ti[id]; tinfo->ipvs = ipvs; - tinfo->sock = NULL; if (state == IP_VS_STATE_BACKUP) { + result = -ENOMEM; tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto out; - } else { - tinfo->buf = NULL; } tinfo->id = id; if (state == IP_VS_STATE_MASTER) @@ -1885,17 +1876,15 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, result = PTR_ERR(task); goto out; } - tinfo = NULL; - if (state == 
IP_VS_STATE_MASTER) - ipvs->ms[id].master_thread = task; - else - array[id] = task; + tinfo->task = task; } /* mark as active */ - if (state == IP_VS_STATE_BACKUP) - ipvs->backup_threads = array; + if (state == IP_VS_STATE_MASTER) + ipvs->master_tinfo = ti; + else + ipvs->backup_tinfo = ti; spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; spin_unlock_bh(&ipvs->sync_buff_lock); @@ -1910,29 +1899,31 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, out: /* We do not need RTNL lock anymore, release it here so that - * sock_release below and in the kthreads can use rtnl_lock - * to leave the mcast group. + * sock_release below can use rtnl_lock to leave the mcast group. */ rtnl_unlock(); - count = id; - while (count-- > 0) { - if (state == IP_VS_STATE_MASTER) - kthread_stop(ipvs->ms[count].master_thread); - else - kthread_stop(array[count]); + id = min(id, count - 1); + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->task) + kthread_stop(tinfo->task); + } } if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { kfree(ipvs->ms); ipvs->ms = NULL; } mutex_unlock(&ipvs->sync_mutex); - if (tinfo) { - if (tinfo->sock) - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); + + /* No more mutexes, release socks */ + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); } - kfree(array); return result; out_early: @@ -1944,15 +1935,18 @@ out_early: int stop_sync_thread(struct netns_ipvs *ipvs, int state) { - struct task_struct **array; + struct ip_vs_sync_thread_data *ti, *tinfo; int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + mutex_lock(&ipvs->sync_mutex); if (state == IP_VS_STATE_MASTER) { + retc = -ESRCH; if (!ipvs->ms) - return -ESRCH; + goto err; + ti = ipvs->master_tinfo; /* * The lock synchronizes with sb_queue_tail(), so that we don't @@ -1971,38 +1965,56 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state) struct ipvs_master_sync_state *ms = &ipvs->ms[id]; int ret; + tinfo = &ti[id]; pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ms->master_thread)); + task_pid_nr(tinfo->task)); cancel_delayed_work_sync(&ms->master_wakeup_work); - ret = kthread_stop(ms->master_thread); + ret = kthread_stop(tinfo->task); if (retc >= 0) retc = ret; } kfree(ipvs->ms); ipvs->ms = NULL; + ipvs->master_tinfo = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_threads) - return -ESRCH; + retc = -ESRCH; + if (!ipvs->backup_tinfo) + goto err; + ti = ipvs->backup_tinfo; ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - array = ipvs->backup_threads; retc = 0; for (id = ipvs->threads_mask; id >= 0; id--) { int ret; + tinfo = &ti[id]; pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(array[id])); - ret = kthread_stop(array[id]); + task_pid_nr(tinfo->task)); + ret = kthread_stop(tinfo->task); if (retc >= 0) retc = ret; } - kfree(array); - ipvs->backup_threads = NULL; + ipvs->backup_tinfo = NULL; + } else { + goto err; } + id = ipvs->threads_mask; + mutex_unlock(&ipvs->sync_mutex); + + /* No more mutexes, release socks */ + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); /* decrease the module use count */ ip_vs_use_count_dec(); + return retc; +err: + mutex_unlock(&ipvs->sync_mutex); return retc; } @@ -2021,7 +2033,6 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) { int retc; - 
mutex_lock(&ipvs->sync_mutex); retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); @@ -2029,5 +2040,4 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); - mutex_unlock(&ipvs->sync_mutex); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index e101eda05d55..9c464d24beec 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -29,6 +29,7 @@ #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/gue.h> +#include <net/gre.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -36,6 +37,7 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> @@ -275,7 +277,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return false; ipv6_hdr(skb)->hop_limit--; @@ -290,7 +292,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; /* Decrease ttl */ @@ -381,8 +383,19 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -536,8 +549,19 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -792,7 +816,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -881,7 +905,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if 
(!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1002,17 +1026,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); struct udphdr *udph; /* Our new UDP header */ struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } - skb_push(skb, sizeof(struct guehdr)); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); gueh = (struct guehdr *)skb->data; gueh->control = 0; gueh->version = 0; - gueh->hlen = 0; + gueh->hlen = optlen >> 2; gueh->flags = 0; gueh->proto_ctype = *next_protocol; + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); @@ -1029,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, return 0; } +static void +ipvs_gre_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 proto = *next_protocol == IPPROTO_IPIP ? + htons(ETH_P_IP) : htons(ETH_P_IPV6); + __be16 tflags = 0; + size_t hdrlen; + + if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + + hdrlen = gre_calc_hlen(tflags); + gre_build_header(skb, hdrlen, tflags, proto, 0, 0); + + *next_protocol = IPPROTO_GRE; +} + /* * IP Tunneling transmitter * @@ -1066,6 +1147,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1088,9 +1170,28 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; + } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? 
&df : NULL; @@ -1101,8 +1202,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1111,8 +1226,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1170,6 +1296,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1193,9 +1320,28 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + max_headroom += gre_hdrlen; + } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, @@ -1204,8 +1350,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + 
gso_type |= SKB_GSO_GRE; + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1214,8 +1374,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -1400,7 +1571,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1489,7 +1660,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index e52fcb9c9a96..921a7b95be68 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -37,12 +37,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (ifa->ifa_broadcast == iph->daddr) { mask = ifa->ifa_mask; break; } - } endfor_ifa(in_dev); + } } if (mask == 0) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f4f9b8344a32..bdfeacee0817 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -749,9 +749,6 @@ begin: continue; } - if (nf_ct_is_dying(ct)) - continue; - if (nf_ct_key_equal(h, tuple, zone, net)) return h; } @@ -777,20 +774,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct; rcu_read_lock(); -begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { + /* We have a candidate that matches the tuple we're interested + * in, try to obtain a reference and re-check tuple + */ ct = nf_ct_tuplehash_to_ctrack(h); - if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) - h = NULL; - else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { - nf_ct_put(ct); - goto begin; - } + if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(nf_ct_key_equal(h, tuple, zone, net))) + goto found; + + /* TYPESAFE_BY_RCU recycled the candidate */ + nf_ct_put(ct); } + + h = NULL; } +found: rcu_read_unlock(); return h; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index fac6986d37a8..6497e5fc0871 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik 
<kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> * * For more information, please see http://nath323.sourceforge.net/ */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7db79c1b8084..1b77444d5b52 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1256,7 +1256,6 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, struct nf_conntrack_tuple tuple; struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; struct nf_conntrack_zone zone; int err; @@ -1266,11 +1265,13 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - u3, &zone); + nfmsg->nfgen_family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - u3, &zone); + nfmsg->nfgen_family, &zone); else { + u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, nlmsg_report(nlh), u3); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 37bb530d848f..a0560d175a7f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_bridge.h> #include <net/netfilter/nf_log.h> #include <linux/ip.h> @@ -120,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -static unsigned int nf_confirm(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; @@ -154,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb, /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } +EXPORT_SYMBOL_GPL(nf_confirm); static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, @@ -442,12 +442,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) return 0; } +static struct nf_ct_bridge_info *nf_ct_bridge_info; + static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); - bool fixup_needed = false; + bool fixup_needed = false, retry = true; int err = 0; - +retry: mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { @@ -487,6 +489,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) fixup_needed = true; break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) { + if (!retry) { + err = -EPROTO; + goto out_unlock; + } + mutex_unlock(&nf_ct_proto_mutex); + request_module("nf_conntrack_bridge"); + retry = false; + goto retry; + } + if (!try_module_get(nf_ct_bridge_info->me)) { + err = -EPROTO; + goto out_unlock; + } + cnet->users_bridge++; + if (cnet->users_bridge > 1) + goto out_unlock; + + err = nf_register_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + if (err) + cnet->users_bridge = 0; + else + fixup_needed = true; + break; default: err = -EPROTO; break; @@ -519,47 +547,99 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) 
ARRAY_SIZE(ipv6_conntrack_ops)); break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) + break; + if (cnet->users_bridge && (--cnet->users_bridge == 0)) + nf_unregister_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + + module_put(nf_ct_bridge_info->me); + break; } - mutex_unlock(&nf_ct_proto_mutex); } -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_inet_get(struct net *net) { int err; - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + return err; err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: return err; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + switch (nfproto) { + case NFPROTO_INET: + err = nf_ct_netns_inet_get(net); + break; + case NFPROTO_BRIDGE: + err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE); + if (err < 0) + return err; + + err = nf_ct_netns_inet_get(net); + if (err < 0) { + nf_ct_netns_put(net, NFPROTO_BRIDGE); + return err; + } + break; + default: + err = nf_ct_netns_do_get(net, nfproto); + break; + } + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); void nf_ct_netns_put(struct net *net, uint8_t nfproto) { - if (nfproto == NFPROTO_INET) { + switch (nfproto) { + case NFPROTO_BRIDGE: + nf_ct_netns_do_put(net, NFPROTO_BRIDGE); + /* fall through */ + case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else { + break; + default: nf_ct_netns_do_put(net, nfproto); + break; } } EXPORT_SYMBOL_GPL(nf_ct_netns_put); +void nf_ct_bridge_register(struct nf_ct_bridge_info *info) +{ + WARN_ON(nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = info; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_register); + +void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info) +{ + WARN_ON(!nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = NULL; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister); + int nf_conntrack_proto_init(void) { int ret; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index a824367ed518..dd53e2b20f6b 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -218,7 +218,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, /* See ip_conntrack_proto_tcp.c */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, state->hook, dataoff, 0)) { + nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) { icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 522c08c23600..fce3d93f1541 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -336,7 +336,7 @@ static bool sctp_error(struct sk_buff *skb, if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; 
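/*
 * A minimal sketch of the conversion applied throughout this series:
 * skb_make_writable() returned true on success, while its replacement
 * skb_ensure_writable() follows the usual kernel convention of 0 on
 * success and a negative errno on failure, so every
 * "if (!skb_make_writable(skb, len))" test flips its polarity to
 * "if (skb_ensure_writable(skb, len))".  The helper below is a
 * hypothetical example (assumes <linux/skbuff.h> and <linux/tcp.h>),
 * not code from the tree.
 */
static int example_mangle_tcp(struct sk_buff *skb, unsigned int tcphoff)
{
	struct tcphdr *th;

	/* was: if (!skb_make_writable(skb, tcphoff + sizeof(*th))) */
	if (skb_ensure_writable(skb, tcphoff + sizeof(*th)))
		return 0;	/* failure path keeps its old meaning */

	th = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
	/* ... mangle the header and update th->check here ... */
	return 1;
}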
goto out_invalid; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1e2cc83ff5da..d5fdfa00d683 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dc21a43cd145..3066449f8bd8 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; @@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 948b4ebbe3fb..e3d797252a98 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->dst_port = ctt->dst.u.tcp.port; ft->iifidx = other_dst->dev->ifindex; - ft->oifidx = dst->dev->ifindex; ft->dst_cache = dst; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 3574a212bdc2..bb25d4c794c7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -374,7 +374,7 @@ static int seq_show(struct seq_file *s, void *v) continue; logger = nft_log_dereference(loggers[*pos][i]); - seq_printf(s, "%s", logger->name); + seq_puts(s, logger->name); if (i == 0 && loggers[*pos][i + 1] != NULL) seq_puts(s, ","); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 98bf543e9891..a263505455fc 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -95,7 +95,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct tcphdr *tcph; int oldlen, datalen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && @@ -145,7 +145,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, struct udphdr *udph; int datalen, oldlen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 07da07788f6b..7ac733ebd060 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -70,7 +70,7 @@ static bool udp_manip_pkt(struct sk_buff *skb, struct udphdr *hdr; bool do_csum; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -88,7 +88,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct 
udphdr *)(skb->data + hdroff); @@ -114,7 +114,7 @@ sctp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -155,7 +155,7 @@ tcp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); @@ -195,7 +195,7 @@ dccp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); @@ -229,7 +229,7 @@ icmp_manip_pkt(struct sk_buff *skb, { struct icmphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); @@ -247,7 +247,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, { struct icmp6hdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); @@ -275,7 +275,7 @@ gre_manip_pkt(struct sk_buff *skb, /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; @@ -347,7 +347,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, struct iphdr *iph; unsigned int hdroff; - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; @@ -378,7 +378,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, int hdroff; u8 nexthdr; - if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; @@ -562,9 +562,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; - if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) + if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; @@ -784,7 +784,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 4ffe5e5e65ba..f91579c821e9 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -44,15 +44,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, if (hooknum == NF_INET_LOCAL_OUT) { newdst = htonl(0x7F000001); } else { - struct in_device *indev; - struct in_ifaddr *ifa; + const struct in_device *indev; newdst = 0; indev = __in_dev_get_rcu(skb->dev); - if (indev && indev->ifa_list) { - 
ifa = indev->ifa_list; - newdst = ifa->ifa_local; + if (indev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(indev->ifa_list); + if (ifa) + newdst = ifa->ifa_local; } if (!newdst) diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 7de28fa0f14a..e338d91980d8 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -282,7 +282,7 @@ next: if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { struct udphdr *uh; - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b5b2be55ca82..a2b58de82600 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -156,7 +156,6 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, unsigned int queuenum) { int status = -ENOENT; @@ -190,6 +189,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } + if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + status = -ENETDOWN; + goto err; + } + *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, @@ -198,7 +202,6 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, }; nf_queue_entry_get_refs(entry); - skb_dst_force(skb); switch (entry->state.pf) { case AF_INET: @@ -225,12 +228,11 @@ err: /* Packets leaving via this function must come back through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict) + unsigned int index, unsigned int verdict) { int ret; - ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS); + ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -336,7 +338,7 @@ next_hook: local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, &entry->state, hooks, i, verdict); + err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8ce74ed985c0..b101f187eda8 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -10,16 +10,16 @@ #include <net/netns/generic.h> #include <linux/proc_fs.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_tcpudp.h> -#include <linux/netfilter/xt_SYNPROXY.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nf_synproxy.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_synproxy.h> unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -57,7 +57,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { opts->mss = get_unaligned_be16(ptr); - opts->options |= XT_SYNPROXY_OPT_MSS; + opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: @@ -65,19 +65,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, 
opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); - opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) - opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } @@ -89,36 +89,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, } EXPORT_SYMBOL_GPL(synproxy_parse_options); -unsigned int synproxy_options_size(const struct synproxy_options *opts) +static unsigned int +synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; - if (opts->options & XT_SYNPROXY_OPT_MSS) + if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; - else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) + if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } -EXPORT_SYMBOL_GPL(synproxy_options_size); -void +static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; - if (options & XT_SYNPROXY_OPT_MSS) + if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); - if (options & XT_SYNPROXY_OPT_TIMESTAMP) { - if (options & XT_SYNPROXY_OPT_SACK_PERM) + if (options & NF_SYNPROXY_OPT_TIMESTAMP) { + if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -131,58 +131,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); - } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (options & XT_SYNPROXY_OPT_WSCALE) + if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } -EXPORT_SYMBOL_GPL(synproxy_build_options); -void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_time_stamp_raw() & ~0x3f; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; - if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; - if (opts->options & XT_SYNPROXY_OPT_ECN) + if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); -void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +static void +synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) - 
opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; - opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; - opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; + opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0; } -EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); -unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *th, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_conn_synproxy *synproxy) +static unsigned int +synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, + struct tcphdr *th, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; @@ -193,7 +191,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { @@ -232,7 +230,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, } return 1; } -EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { .len = sizeof(struct nf_conn_synproxy), @@ -413,5 +410,830 @@ static void __exit synproxy_core_exit(void) module_init(synproxy_core_init); module_exit(synproxy_core_exit); +static struct iphdr * +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + 
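	/*
	 * The initial sequence number of this SYN/ACK is a SYN cookie built by
	 * __cookie_v4_init_sequence() from the flow tuple and the client's MSS,
	 * so the proxy keeps no per-connection state at this point.  When the
	 * client's ACK returns, synproxy_recv_client_ack() recovers the MSS
	 * with __cookie_v4_check() before opening the connection to the real
	 * server; until then the zero window advertised below is meant to keep
	 * the client from sending data.
	 */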
nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack); + +static void +synproxy_send_server_syn(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(struct net *net, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + 
nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} + +bool +synproxy_recv_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); + +unsigned int +ipv4_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. 
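The sequence numbers in this exchange line up as follows: the cookie ISN is handed to the client in the SYN/ACK, validated here against ack_seq - 1, restored as recv_seq - 1 in the spoofed server SYN, and relayed to the hook through that SYN's ack_seq so the later seqadj offset can be computed against the real server ISN. A worked sketch with made-up ISNs, relying only on the arithmetic visible above (unsigned 32-bit wraparound included):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint32_t client_isn = 1000;          /* from the client's SYN */
    uint32_t cookie_isn = 0xdeadbeef;    /* ISN encoded in the SYN/ACK cookie */

    /* 1. proxy answers the SYN: seq = cookie_isn, ack = client_isn + 1 */

    /* 2. client ACK: seq = client_isn + 1, ack = cookie_isn + 1 */
    uint32_t recv_seq = client_isn + 1;
    uint32_t ack_seq = cookie_isn + 1;

    /* the cookie is validated against ack_seq - 1 == cookie_isn */
    printf("cookie check on %#" PRIx32 "\n", ack_seq - 1);

    /* 3. spoofed server SYN: seq = recv_seq - 1 restores the client ISN,
     *    ack_seq = ack_seq - 1 relays the cookie ISN to the hook */
    uint32_t srv_syn_seq = recv_seq - 1;
    uint32_t relayed_isn = ack_seq - 1;
    printf("server SYN: seq=%" PRIu32 ", relayed isn=%#" PRIx32 "\n",
           srv_syn_seq, relayed_isn);

    /* 4. real server SYN/ACK arrives with its own ISN; every later
     *    server->client sequence number is shifted by this offset */
    uint32_t server_isn = 555000;
    uint32_t seq_off = relayed_isn - server_isn;
    printf("seqadj offset=%" PRIu32 ", %" PRIu32 " maps to %#" PRIx32 "\n",
           seq_off, server_isn + 1, server_isn + 1 + seq_off);
    return 0;
}

The last line shows the server's first in-sequence byte being renumbered to cookie_isn + 1, which is what the client has been expecting since the SYN/ACK.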
+ */ + if (synproxy_recv_client_ack(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); + +static const struct nf_hook_ops ipv4_synproxy_ops[] = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref4++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); + +void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); + +#if IS_ENABLED(CONFIG_IPV6) +static struct ipv6hdr * +synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp_ipv6(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct dst_entry *dst; + struct flowi6 fl6; + int err; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, + flowi6_to_flowi(&fl6)); + err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (err) { + goto free_nskb; + } + + dst = xfrm_lookup(net, dst, 
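nf_synproxy_ipv4_init() and nf_synproxy_ipv4_fini() above install the LOCAL_IN/POST_ROUTING hooks only for the first user per namespace and remove them when the last user goes away, with hook_ref4 as the counter. A standalone sketch of that register-once pattern; the names are invented here, and the real code relies on the netlink commit path for serialization, which the sketch ignores.

#include <stdio.h>

/* hook_ref counts users; the hooks are installed only on the 0 -> 1
 * transition and removed on 1 -> 0.  register_hooks()/unregister_hooks()
 * stand in for nf_register_net_hooks()/nf_unregister_net_hooks(). */
struct proxy_state {
    int hook_ref;
    int hooks_installed;
};

static int register_hooks(struct proxy_state *s)
{
    s->hooks_installed = 1;
    return 0;
}

static void unregister_hooks(struct proxy_state *s)
{
    s->hooks_installed = 0;
}

static int proxy_init(struct proxy_state *s)
{
    int err;

    if (s->hook_ref == 0) {
        err = register_hooks(s);
        if (err)
            return err;
    }
    s->hook_ref++;
    return 0;
}

static void proxy_fini(struct proxy_state *s)
{
    s->hook_ref--;
    if (s->hook_ref == 0)
        unregister_hooks(s);
}

int main(void)
{
    struct proxy_state s = { 0, 0 };

    proxy_init(&s);    /* first user installs the hooks */
    proxy_init(&s);    /* second user only bumps the count */
    proxy_fini(&s);
    printf("ref=%d installed=%d\n", s.hook_ref, s.hooks_installed);
    proxy_fini(&s);    /* last user removes the hooks */
    printf("ref=%d installed=%d\n", s.hook_ref, s.hooks_installed);
    return 0;
}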
flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip6_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); + +static void +synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. 
+ */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, + IP_CT_NEW, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, + tcp_hdr_size); +} + +static void +synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} + +bool +synproxy_recv_client_ack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); + +unsigned int +ipv6_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + 
struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0 || nexthdr != IPPROTO_TCP) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack_ipv6(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); + +static const struct nf_hook_ops ipv6_synproxy_ops[] = { + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref6 == 0) { + err = nf_register_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref6++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); + +void 
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref6--; + if (snet->hook_ref6 == 0) + nf_unregister_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); +#endif /* CONFIG_IPV6 */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bcf17fb46d96..ed17a7c29b86 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -97,6 +98,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->nla = nla; ctx->portid = NETLINK_CB(skb).portid; ctx->report = nlmsg_report(nlh); + ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; } @@ -1169,6 +1171,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, + [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1446,25 +1449,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct net *net, - struct nft_base_chain *chain, - struct nft_stats __percpu *newstats) +static void nft_chain_stats_replace(struct nft_trans *trans) { - struct nft_stats __percpu *oldstats; + struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); - if (newstats == NULL) + if (!nft_trans_chain_stats(trans)) return; - if (rcu_access_pointer(chain->stats)) { - oldstats = rcu_dereference_protected(chain->stats, - lockdep_commit_lock_is_held(net)); - rcu_assign_pointer(chain->stats, newstats); - synchronize_rcu(); - free_percpu(oldstats); - } else { - rcu_assign_pointer(chain->stats, newstats); + rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); + + if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); - } } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) @@ -1610,7 +1606,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha } static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy) + u8 policy, u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1664,8 +1660,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, ops->hook = hook.type->hooks[ops->hooknum]; ops->dev = hook.dev; - chain->flags |= NFT_BASE_CHAIN; + chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; + INIT_LIST_HEAD(&basechain->cb_list); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1725,7 +1722,8 @@ err1: return err; } -static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) +static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, + u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1737,6 +1735,9 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) struct nft_trans *trans; int err; + if (chain->flags ^ flags) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_HOOK]) { if 
(!nft_is_base_chain(chain)) return -EBUSY; @@ -1842,6 +1843,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; + u32 flags = 0; lockdep_assert_held(&net->nft.commit_mutex); @@ -1896,6 +1898,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } } + if (nla[NFTA_CHAIN_FLAGS]) + flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS])); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { @@ -1906,10 +1911,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updchain(&ctx, genmask, policy); + return nf_tables_updchain(&ctx, genmask, policy, flags); } - return nf_tables_addchain(&ctx, family, genmask, policy); + return nf_tables_addchain(&ctx, family, genmask, policy, flags); } static int nf_tables_delchain(struct net *net, struct sock *nlsk, @@ -2016,15 +2021,31 @@ EXPORT_SYMBOL_GPL(nft_unregister_expr); static const struct nft_expr_type *__nft_expr_type_get(u8 family, struct nlattr *nla) { - const struct nft_expr_type *type; + const struct nft_expr_type *type, *candidate = NULL; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name) && - (!type->family || type->family == family)) - return type; + if (!nla_strcmp(nla, type->name)) { + if (!type->family && !candidate) + candidate = type; + else if (type->family == family) + candidate = type; + } } - return NULL; + return candidate; +} + +#ifdef CONFIG_MODULES +static int nft_expr_type_request_module(struct net *net, u8 family, + struct nlattr *nla) +{ + nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + if (__nft_expr_type_get(family, nla)) + return -EAGAIN; + + return 0; } +#endif static const struct nft_expr_type *nft_expr_type_get(struct net *net, u8 family, @@ -2042,9 +2063,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); nft_request_module(net, "nft-expr-%.*s", @@ -2137,6 +2156,12 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); +#ifdef CONFIG_MODULES + if (err == -EAGAIN) + nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]); +#endif goto err1; } } else @@ -2645,6 +2670,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; int family = nfmsg->nfgen_family; + struct nft_flow_rule *flow; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2791,7 +2817,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_tail_rcu(&rule->list, &old_rule->list); } else { - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); + if (!trans) { err = -ENOMEM; goto err2; } @@ -2814,6 +2841,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (net->nft.validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = 
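__nft_expr_type_get() above no longer returns the first name match: it keeps scanning and prefers an expression type registered for the exact family over a family-agnostic one, which lets a family-specific module take precedence over the generic implementation (see the bridge meta handling further down, where nft_meta_select_ops returns -EAGAIN for NFPROTO_BRIDGE). A small sketch of that candidate selection; struct expr_type and the family value are stand-ins.

#include <stdio.h>
#include <string.h>

struct expr_type {
    const char *name;
    int family;        /* 0 means "any family" */
};

/* Pick the best matching expression type for a family: an exact
 * family-specific entry wins over a generic (family == 0) one,
 * mirroring the candidate logic in __nft_expr_type_get(). */
static const struct expr_type *
expr_type_get(const struct expr_type *types, int n, const char *name, int family)
{
    const struct expr_type *candidate = NULL;
    int i;

    for (i = 0; i < n; i++) {
        if (strcmp(types[i].name, name))
            continue;
        if (!types[i].family && !candidate)
            candidate = &types[i];    /* remember the generic fallback */
        else if (types[i].family == family)
            candidate = &types[i];    /* exact family match preferred */
    }
    return candidate;
}

int main(void)
{
    static const struct expr_type types[] = {
        { "meta", 0 },    /* generic registration */
        { "meta", 7 },    /* family-specific registration */
    };
    const struct expr_type *t = expr_type_get(types, 2, "meta", 7);

    printf("picked family %d\n", t ? t->family : -1);    /* 7 */
    return 0;
}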
nft_flow_rule_create(rule); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + nft_trans_flow_rule(trans) = flow; + } + return 0; err2: nf_tables_rule_release(&ctx, rule); @@ -3877,6 +3912,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, @@ -4330,7 +4366,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, - u64 timeout, gfp_t gfp) + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4345,9 +4381,11 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) - *nft_set_ext_expiration(ext) = - get_jiffies_64() + timeout; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (expiration == 0) + *nft_set_ext_expiration(ext) += timeout; + } if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -4412,6 +4450,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_trans *trans; u32 flags = 0; u64 timeout; + u64 expiration; u8 ulen; int err; @@ -4455,6 +4494,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } + expiration = 0; + if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], + &expiration); + if (err) + return err; + } + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -4537,7 +4586,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, - timeout, GFP_KERNEL); + timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4739,7 +4788,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - GFP_KERNEL); + 0, GFP_KERNEL); if (elem.priv == NULL) goto err2; @@ -6359,9 +6408,9 @@ static void nft_chain_commit_update(struct nft_trans *trans) if (!nft_is_base_chain(trans->ctx.chain)) return; + nft_chain_stats_replace(trans); + basechain = nft_base_chain(trans->ctx.chain); - nft_chain_stats_replace(trans->ctx.net, basechain, - nft_trans_chain_stats(trans)); switch (nft_trans_chain_policy(trans)) { case NF_DROP: @@ -6378,6 +6427,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: + free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: @@ -6596,6 +6646,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + int err; if (list_empty(&net->nft.commit_list)) { mutex_unlock(&net->nft.commit_mutex); @@ -6606,6 +6657,10 @@ 
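nft_set_elem_init() above now takes both a timeout and an explicit expiration: when userspace supplies NFTA_SET_ELEM_EXPIRATION the element gets exactly that much lifetime left, presumably so a dumped ruleset can be restored without resetting element timers, and only when it is absent does the default timeout apply. A sketch of the resulting expiry computation, with "now" standing in for get_jiffies_64():

#include <inttypes.h>
#include <stdio.h>

/* Absolute expiry for a new set element: an explicit remaining
 * lifetime takes precedence, otherwise the element's timeout is
 * added.  Both values are relative intervals. */
static uint64_t elem_expires(uint64_t now, uint64_t timeout, uint64_t expiration)
{
    uint64_t expires = now + expiration;

    if (expiration == 0)
        expires += timeout;
    return expires;
}

int main(void)
{
    uint64_t now = 100000;

    /* fresh element: full timeout from now */
    printf("%" PRIu64 "\n", elem_expires(now, 3000, 0));
    /* restored element with 1200 units left to live */
    printf("%" PRIu64 "\n", elem_expires(now, 3000, 1200));
    return 0;
}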
static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nf_tables_validate(net) < 0) return -EAGAIN; + err = nft_flow_rule_offload_commit(net); + if (err < 0) + return err; + /* 1. Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { int ret; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index b950cd31348b..96c74c4c7176 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -19,6 +19,7 @@ #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> +#include <net/netfilter/nft_meta.h> static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c new file mode 100644 index 000000000000..2c3302845f67 --- /dev/null +++ b/net/netfilter/nf_tables_offload.c @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> +#include <net/pkt_cls.h> + +static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) +{ + struct nft_flow_rule *flow; + + flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); + if (!flow) + return NULL; + + flow->rule = flow_rule_alloc(num_actions); + if (!flow->rule) { + kfree(flow); + return NULL; + } + + flow->rule->match.dissector = &flow->match.dissector; + flow->rule->match.mask = &flow->match.mask; + flow->rule->match.key = &flow->match.key; + + return flow; +} + +struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) +{ + struct nft_offload_ctx ctx = { + .dep = { + .type = NFT_OFFLOAD_DEP_UNSPEC, + }, + }; + struct nft_flow_rule *flow; + int num_actions = 0, err; + struct nft_expr *expr; + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + num_actions++; + + expr = nft_expr_next(expr); + } + + flow = nft_flow_rule_alloc(num_actions); + if (!flow) + return ERR_PTR(-ENOMEM); + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (!expr->ops->offload) { + err = -EOPNOTSUPP; + goto err_out; + } + err = expr->ops->offload(&ctx, flow, expr); + if (err < 0) + goto err_out; + + expr = nft_expr_next(expr); + } + flow->proto = ctx.dep.l3num; + + return flow; +err_out: + nft_flow_rule_destroy(flow); + + return ERR_PTR(err); +} + +void nft_flow_rule_destroy(struct nft_flow_rule *flow) +{ + kfree(flow->rule); + kfree(flow); +} + +void nft_offload_set_dependency(struct nft_offload_ctx *ctx, + enum nft_offload_dep_type type) +{ + ctx->dep.type = type; +} + +void nft_offload_update_dependency(struct nft_offload_ctx *ctx, + const void *data, u32 len) +{ + switch (ctx->dep.type) { + case NFT_OFFLOAD_DEP_NETWORK: + WARN_ON(len != sizeof(__u16)); + memcpy(&ctx->dep.l3num, data, sizeof(__u16)); + break; + case NFT_OFFLOAD_DEP_TRANSPORT: + WARN_ON(len != sizeof(__u8)); + memcpy(&ctx->dep.protonum, data, sizeof(__u8)); + break; + default: + break; + } + ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; +} + +static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, + __be16 proto, + struct netlink_ext_ack *extack) +{ + common->protocol = proto; + common->extack = extack; +} + +static int nft_setup_cb_call(struct nft_base_chain 
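nft_offload_set_dependency()/nft_offload_update_dependency() above implement a tiny one-slot protocol memory: a meta or payload expression that loads a protocol field announces a pending network- or transport-layer dependency, and the cmp that follows fills in the concrete protocol number, so later payload offsets can be interpreted against the right header. A self-contained sketch of that context; the names and the little-endian ETH_P_IP constant in main() are local assumptions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum dep_type { DEP_UNSPEC, DEP_NETWORK, DEP_TRANSPORT };

struct offload_ctx {
    enum dep_type dep;
    uint16_t l3num;      /* e.g. ETH_P_IP / ETH_P_IPV6, network byte order */
    uint8_t protonum;    /* e.g. IPPROTO_TCP */
};

/* A protocol-selecting match announces which layer the next constant
 * describes ... */
static void set_dependency(struct offload_ctx *ctx, enum dep_type type)
{
    ctx->dep = type;
}

/* ... and the comparison against that register records the value and
 * consumes the pending dependency, as nft_offload_update_dependency()
 * does (the kernel also sanity checks the length). */
static void update_dependency(struct offload_ctx *ctx, const void *data, size_t len)
{
    (void)len;
    switch (ctx->dep) {
    case DEP_NETWORK:
        memcpy(&ctx->l3num, data, sizeof(ctx->l3num));
        break;
    case DEP_TRANSPORT:
        memcpy(&ctx->protonum, data, sizeof(ctx->protonum));
        break;
    default:
        break;
    }
    ctx->dep = DEP_UNSPEC;
}

int main(void)
{
    struct offload_ctx ctx = { DEP_UNSPEC, 0, 0 };
    uint16_t ip = 0x0008;    /* htons(ETH_P_IP) on a little-endian machine */
    uint8_t tcp = 6;

    set_dependency(&ctx, DEP_NETWORK);          /* meta nfproto match */
    update_dependency(&ctx, &ip, sizeof(ip));   /* cmp against the value */

    set_dependency(&ctx, DEP_TRANSPORT);
    update_dependency(&ctx, &tcp, sizeof(tcp));

    printf("l3num=%#x protonum=%u\n", (unsigned)ctx.l3num, (unsigned)ctx.protonum);
    return 0;
}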
*basechain, + enum tc_setup_type type, void *type_data) +{ + struct flow_block_cb *block_cb; + int err; + + list_for_each_entry(block_cb, &basechain->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err < 0) + return err; + } + return 0; +} + +static int nft_flow_offload_rule(struct nft_trans *trans, + enum flow_cls_command command) +{ + struct nft_flow_rule *flow = nft_trans_flow_rule(trans); + struct nft_rule *rule = nft_trans_rule(trans); + struct flow_cls_offload cls_flow = {}; + struct nft_base_chain *basechain; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + if (!nft_is_base_chain(trans->ctx.chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(trans->ctx.chain); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow.common, proto, &extack); + cls_flow.command = command; + cls_flow.cookie = (unsigned long) rule; + if (flow) + cls_flow.rule = flow->rule; + + return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); +} + +static int nft_flow_offload_bind(struct flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + list_splice(&bo->cb_list, &basechain->cb_list); + return 0; +} + +static int nft_flow_offload_unbind(struct flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + struct flow_block_cb *block_cb, *next; + + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + + return 0; +} + +#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK + +static int nft_flow_offload_chain(struct nft_trans *trans, + enum flow_block_command cmd) +{ + struct nft_chain *chain = trans->ctx.chain; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo = {}; + struct nft_base_chain *basechain; + struct net_device *dev; + int err; + + if (!nft_is_base_chain(chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(chain); + dev = basechain->ops.dev; + if (!dev || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + /* Only default policy to accept is supported for now. 
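nft_flow_offload_bind() splices the callbacks the driver handed back into the base chain's cb_list, and nft_setup_cb_call() above then fans every classifier command out to each of them, stopping at the first error. A toy version of that bind-then-broadcast pattern with plain function pointers; the array-backed list stands in for the kernel's flow_block_cb list.

#include <stdio.h>

typedef int (*setup_cb)(const char *cmd, const void *priv);

struct block {
    setup_cb cbs[4];
    const void *privs[4];
    int n;
};

/* Binding collects a driver callback (FLOW_BLOCK_BIND). */
static int block_bind(struct block *b, setup_cb cb, const void *priv)
{
    if (b->n == 4)
        return -1;
    b->cbs[b->n] = cb;
    b->privs[b->n] = priv;
    b->n++;
    return 0;
}

/* Every later command is delivered to each collected callback. */
static int block_call(struct block *b, const char *cmd)
{
    int i, err;

    for (i = 0; i < b->n; i++) {
        err = b->cbs[i](cmd, b->privs[i]);
        if (err < 0)
            return err;    /* stop at the first driver error */
    }
    return 0;
}

static int demo_cb(const char *cmd, const void *priv)
{
    printf("%s: %s\n", (const char *)priv, cmd);
    return 0;
}

int main(void)
{
    struct block b = { .n = 0 };

    block_bind(&b, demo_cb, "port0");       /* chain bound to one device */
    return block_call(&b, "replace rule");  /* e.g. TC_SETUP_CLSFLOWER */
}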
*/ + if (cmd == FLOW_BLOCK_BIND && + nft_trans_chain_policy(trans) != -1 && + nft_trans_chain_policy(trans) != NF_ACCEPT) + return -EOPNOTSUPP; + + bo.command = cmd; + bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo.extack = &extack; + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + switch (cmd) { + case FLOW_BLOCK_BIND: + err = nft_flow_offload_bind(&bo, basechain); + break; + case FLOW_BLOCK_UNBIND: + err = nft_flow_offload_unbind(&bo, basechain); + break; + } + + return err; +} + +int nft_flow_rule_offload_commit(struct net *net) +{ + struct nft_trans *trans; + int err = 0; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + if (trans->ctx.flags & NLM_F_REPLACE || + !(trans->ctx.flags & NLM_F_APPEND)) + return -EOPNOTSUPP; + + err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE); + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY); + break; + } + + if (err) + return err; + } + + return err; +} diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f42326b40d6f..9f5dea0064ea 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); + const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) @@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, else if (ip->ttl <= f_ttl) return 1; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } - endfor_ifa(in_dev); - return ret; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 89750f74e3a2..b6a7ce622c72 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -859,7 +859,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) } skb_put(e->skb, diff); } - if (!skb_make_writable(e->skb, data_len)) + if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 411c0cf741e3..bd173b1824c6 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -12,6 +12,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_tables.h> struct nft_cmp_expr { @@ -107,12 +108,44 @@ nla_put_failure: return -1; } +static int __nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_cmp_expr *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + u8 *mask = 
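nft_flow_rule_offload_commit() above walks the transaction log at commit time and mirrors it to the driver: creating or deleting an offloaded base chain binds or unbinds the flow block, and rules in such chains are pushed as FLOW_CLS_REPLACE/FLOW_CLS_DESTROY, with plain append the only supported insertion mode for now. A stripped-down sketch of that dispatch; struct trans and drv() stand in for the nft_trans and flow-block machinery.

#include <stdio.h>

enum msg { NEWCHAIN, DELCHAIN, NEWRULE, DELRULE };

struct trans {
    enum msg msg;
    int offload_chain;    /* chain was created with NFT_CHAIN_HW_OFFLOAD */
};

/* Stands in for the driver round trip (ndo_setup_tc() for the block
 * bind/unbind, the flow block callback for rule replace/destroy). */
static int drv(const char *cmd)
{
    printf("%s\n", cmd);
    return 0;
}

/* Commit-time walk over the transaction log: only offloaded chains
 * are touched, and the walk stops at the first driver error. */
static int offload_commit(const struct trans *t, int n)
{
    int i, err = 0;

    for (i = 0; i < n; i++) {
        if (!t[i].offload_chain)
            continue;    /* software-only chain */

        switch (t[i].msg) {
        case NEWCHAIN:
            err = drv("bind flow block");
            break;
        case DELCHAIN:
            err = drv("unbind flow block");
            break;
        case NEWRULE:
            err = drv("FLOW_CLS_REPLACE");
            break;
        case DELRULE:
            err = drv("FLOW_CLS_DESTROY");
            break;
        }
        if (err)
            return err;
    }
    return 0;
}

int main(void)
{
    struct trans log[] = {
        { NEWCHAIN, 1 },
        { NEWRULE, 1 },
        { NEWRULE, 0 },    /* regular chain, skipped */
    };

    return offload_commit(log, 3);
}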
(u8 *)&flow->match.mask; + u8 *key = (u8 *)&flow->match.key; + + if (priv->op != NFT_CMP_EQ) + return -EOPNOTSUPP; + + memcpy(key + reg->offset, &priv->data, priv->len); + memcpy(mask + reg->offset, ®->mask, priv->len); + + flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.offset[reg->key] = reg->base_offset; + + nft_offload_update_dependency(ctx, &priv->data, priv->len); + + return 0; +} + +static int nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + + return __nft_cmp_offload(ctx, flow, priv); +} + static const struct nft_expr_ops nft_cmp_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, + .offload = nft_cmp_offload, }; static int nft_cmp_fast_init(const struct nft_ctx *ctx, @@ -143,6 +176,25 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, return 0; } +static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_cmp_expr cmp = { + .data = { + .data = { + [0] = priv->data, + }, + }, + .sreg = priv->sreg, + .len = priv->len / BITS_PER_BYTE, + .op = NFT_CMP_EQ, + }; + + return __nft_cmp_offload(ctx, flow, &cmp); +} + static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); @@ -169,6 +221,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, + .offload = nft_cmp_fast_offload, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index dfcdea6619f1..827ab6196df9 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -21,6 +21,7 @@ #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_expect.h> struct nft_ct { enum nft_ct_keys key:8; @@ -1153,6 +1154,135 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .owner = THIS_MODULE, }; +struct nft_ct_expect_obj { + u16 l3num; + __be16 dport; + u8 l4proto; + u8 size; + u32 timeout; +}; + +static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (!tb[NFTA_CT_EXPECT_L4PROTO] || + !tb[NFTA_CT_EXPECT_DPORT] || + !tb[NFTA_CT_EXPECT_TIMEOUT] || + !tb[NFTA_CT_EXPECT_SIZE]) + return -EINVAL; + + priv->l3num = ctx->family; + if (tb[NFTA_CT_EXPECT_L3PROTO]) + priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); + priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); + priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_ct_expect_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || + nla_put_u8(skb, 
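The cmp offload above turns an equality test into a dissector match: the source register was annotated earlier with the dissector key and buffer offset of the field it holds, and the comparison copies its constant into the key area and a mask of the field width into the mask area, then marks the key as used. A sketch of that key/mask population; here an all-ones mask is written directly, whereas the kernel copies the mask recorded in the register annotation.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct offload_reg {
    unsigned int key;       /* dissector key id */
    unsigned int offset;    /* byte offset inside the match buffer */
};

struct match {
    uint8_t key[64];
    uint8_t mask[64];
    unsigned long used_keys;
};

enum { CMP_EQ, CMP_NEQ };

/* Only an equality comparison has a key/mask representation; other
 * relations cannot be offloaded. */
static int cmp_offload(struct match *m, const struct offload_reg *reg,
                       int op, const void *data, unsigned int len)
{
    if (op != CMP_EQ)
        return -1;    /* -EOPNOTSUPP in the kernel */

    memcpy(m->key + reg->offset, data, len);
    memset(m->mask + reg->offset, 0xff, len);   /* reg->mask in the kernel */
    m->used_keys |= 1ul << reg->key;
    return 0;
}

int main(void)
{
    struct match m = { { 0 }, { 0 }, 0 };
    struct offload_reg ip_proto = { .key = 1, .offset = 16 };
    uint8_t tcp = 6;

    cmp_offload(&m, &ip_proto, CMP_EQ, &tcp, sizeof(tcp));
    printf("key[16]=%u mask[16]=%#x used=%#lx\n",
           (unsigned)m.key[16], (unsigned)m.mask[16], m.used_keys);
    return 0;
}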
NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || + nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || + nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || + nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) + return -1; + + return 0; +} + +static void nft_ct_expect_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + struct nf_conntrack_expect *exp; + enum ip_conntrack_info ctinfo; + struct nf_conn_help *help; + enum ip_conntrack_dir dir; + u16 l3num = priv->l3num; + struct nf_conn *ct; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct || ctinfo == IP_CT_UNTRACKED) { + regs->verdict.code = NFT_BREAK; + return; + } + dir = CTINFO2DIR(ctinfo); + + help = nfct_help(ct); + if (!help) + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (!help) { + regs->verdict.code = NF_DROP; + return; + } + + if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { + regs->verdict.code = NFT_BREAK; + return; + } + if (l3num == NFPROTO_INET) + l3num = nf_ct_l3num(ct); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + regs->verdict.code = NF_DROP; + return; + } + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + priv->l4proto, NULL, &priv->dport); + exp->timeout.expires = jiffies + priv->timeout * HZ; + + if (nf_ct_expect_related(exp) != 0) + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { + [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, + [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_expect_obj_type; + +static const struct nft_object_ops nft_ct_expect_obj_ops = { + .type = &nft_ct_expect_obj_type, + .size = sizeof(struct nft_ct_expect_obj), + .eval = nft_ct_expect_obj_eval, + .init = nft_ct_expect_obj_init, + .destroy = nft_ct_expect_obj_destroy, + .dump = nft_ct_expect_obj_dump, +}; + +static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_EXPECT, + .ops = &nft_ct_expect_obj_ops, + .maxattr = NFTA_CT_EXPECT_MAX, + .policy = nft_ct_expect_policy, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -1170,17 +1300,23 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; + + err = nft_register_obj(&nft_ct_expect_obj_type); + if (err < 0) + goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) - goto err3; + goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err4: + nft_unregister_obj(&nft_ct_expect_obj_type); +#endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); -#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -1193,6 +1329,7 @@ static void __exit nft_ct_module_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif + nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -1207,3 +1344,4 @@ MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); 
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 505bdfc66801..33833a0cb989 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -56,7 +56,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, elem = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], - timeout, GFP_ATOMIC); + timeout, 0, GFP_ATOMIC); if (elem == NULL) goto err1; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a7aa6c5250a4..a5e8469859e3 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,6 +59,103 @@ err: regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. + */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, + unsigned int *offset, int target) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + bool found = false; + __be32 info; + int optlen; + + iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!iph) + return -EBADMSG; + start = sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies + * the options. + */ + if (skb_copy_bits(skb, start, opt->__data, optlen)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (!opt->srr) + break; + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; + if (found) + *offset = opt->srr + start; + break; + case IPOPT_RR: + if (!opt->rr) + break; + *offset = opt->rr + start; + found = true; + break; + case IPOPT_RA: + if (!opt->router_alert) + break; + *offset = opt->router_alert + start; + found = true; + break; + default: + return -EOPNOTSUPP; + } + return found ? 
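ipv4_find_option() above copies the option bytes out of the packet, lets __ip_options_compile() validate them, and then maps the requested option to an offset. For illustration, here is a self-contained TLV walk over the same option area that returns the offset of a requested option relative to the start of the IP header; this is not how the kernel code does it (it reuses the compiled struct ip_options), and the bounds handling is only as strict as a sketch needs to be.

#include <stdint.h>
#include <stdio.h>

#define IPOPT_EOL 0
#define IPOPT_NOP 1
#define IPOPT_RR  7
#define IPOPT_RA  148

/* Walk the IPv4 option TLVs looking for "target".  "opts" points to
 * the option area right after the 20 byte fixed header, "optlen" is
 * ihl*4 - 20.  EOL ends the list, NOP is a single byte, every other
 * option carries a length byte covering itself. */
static int find_option(const uint8_t *opts, int optlen, int target)
{
    int i = 0;

    while (i < optlen) {
        uint8_t type = opts[i];
        uint8_t len;

        if (type == IPOPT_EOL)
            break;
        if (type == IPOPT_NOP) {
            i++;
            continue;
        }
        if (i + 1 >= optlen)
            return -1;    /* truncated option */
        len = opts[i + 1];
        if (len < 2 || i + len > optlen)
            return -1;    /* malformed length */
        if (type == target)
            return 20 + i;    /* offset from the start of the IP header */
        i += len;
    }
    return -1;
}

int main(void)
{
    /* NOP, NOP, Router Alert (type 148, len 4, value 0), EOL padding */
    uint8_t opts[] = { 1, 1, 148, 4, 0, 0, 0, 0 };

    printf("RA at %d\n", find_option(opts, sizeof(opts), IPOPT_RA));
    printf("RR at %d\n", find_option(opts, sizeof(opts), IPOPT_RR));
    return 0;
}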
target : -ENOENT; +} + +static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + unsigned int offset; + int err; + + if (skb->protocol != htons(ETH_P_IP)) + goto err; + + err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { + goto err; + } + offset += priv->offset; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + goto err; + return; +err: + regs->verdict.code = NFT_BREAK; +} + static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) @@ -153,7 +250,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) return; - if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + if (skb_ensure_writable(pkt->skb, + pkt->xt.thoff + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -311,6 +409,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg, priv->len); } +static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + switch (priv->type) { + case IPOPT_SSRR: + case IPOPT_LSRR: + case IPOPT_RR: + case IPOPT_RA: + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -357,6 +477,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_ipv4_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv4_eval, + .init = nft_exthdr_ipv4_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -397,6 +525,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; + case NFT_EXTHDR_OP_IPV4: + if (ctx->family != NFPROTO_IPV6) { + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv4_ops; + } + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index cb8547f97220..ca2ae4b95a8d 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -13,6 +13,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -124,6 +125,34 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, return 0; } +static int nft_immediate_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct flow_action_entry *entry; + const struct nft_data *data; + + if (priv->dreg != NFT_REG_VERDICT) + return -EOPNOTSUPP; + + entry = 
&flow->rule->action.entries[ctx->num_actions++]; + + data = &priv->data; + switch (data->verdict.code) { + case NF_ACCEPT: + entry->id = FLOW_ACTION_ACCEPT; + break; + case NF_DROP: + entry->id = FLOW_ACTION_DROP; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -133,6 +162,8 @@ static const struct nft_expr_ops nft_imm_ops = { .deactivate = nft_immediate_deactivate, .dump = nft_immediate_dump, .validate = nft_immediate_validate, + .offload = nft_immediate_offload, + .offload_flags = NFT_OFFLOAD_F_ACTION, }; struct nft_expr_type nft_imm_type __read_mostly = { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a54329b8634a..76866f77e343 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -21,23 +21,13 @@ #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nft_meta.h> +#include <net/netfilter/nf_tables_offload.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -#ifdef CONFIG_NF_TABLES_BRIDGE -#include "../bridge/br_private.h" -#endif - void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -47,9 +37,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; -#ifdef CONFIG_NF_TABLES_BRIDGE - const struct net_bridge_port *p; -#endif switch (priv->key) { case NFT_META_LEN: @@ -229,18 +216,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, secpath_exists(skb)); break; #endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -#endif case NFT_META_IIFKIND: if (in == NULL || in->rtnl_link_ops == NULL) goto err; @@ -260,10 +235,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -300,16 +276,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } +EXPORT_SYMBOL_GPL(nft_meta_set_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) 
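nft_immediate_offload() above only knows how to express an accept or drop verdict as a flow action; anything else (queue, jump, goto) makes the rule non-offloadable. A minimal sketch of that mapping, with locally defined verdict and action enums rather than the kernel's NF_* and FLOW_ACTION_* constants; note the kernel reserves the action slot up front via a first counting pass over the rule's expressions.

#include <stdio.h>

enum verdict { VERDICT_ACCEPT, VERDICT_DROP, VERDICT_QUEUE };
enum flow_action { ACTION_ACCEPT, ACTION_DROP };

struct action_entry {
    enum flow_action id;
};

/* Mirror of the verdict switch: only accept and drop translate to
 * flow actions, everything else means the rule stays in software
 * (and, for an offload chain, fails to load). */
static int immediate_offload(enum verdict v, struct action_entry *entries,
                             unsigned int *num_actions)
{
    enum flow_action id;

    switch (v) {
    case VERDICT_ACCEPT:
        id = ACTION_ACCEPT;
        break;
    case VERDICT_DROP:
        id = ACTION_DROP;
        break;
    default:
        return -1;    /* -EOPNOTSUPP in the kernel */
    }
    entries[(*num_actions)++].id = id;
    return 0;
}

int main(void)
{
    struct action_entry acts[4];
    unsigned int n = 0;

    immediate_offload(VERDICT_ACCEPT, acts, &n);
    printf("actions=%u first=%d\n", n, acts[0].id);
    printf("queue -> %d\n", immediate_offload(VERDICT_QUEUE, acts, &n));
    return 0;
}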
{ struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -360,14 +338,6 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - if (ctx->family != NFPROTO_BRIDGE) - return -EOPNOTSUPP; - len = IFNAMSIZ; - break; -#endif default: return -EOPNOTSUPP; } @@ -376,6 +346,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } +EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -409,9 +380,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -static int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -437,10 +408,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); -static int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -475,9 +447,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_meta_set_init); -static int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -490,8 +463,9 @@ static int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_get_dump); -static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -505,15 +479,42 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); -static void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } +EXPORT_SYMBOL_GPL(nft_meta_set_destroy); + +static int nft_meta_get_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case NFT_META_L4PROTO: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, @@ -522,6 +523,7 @@ static const struct 
nft_expr_ops nft_meta_get_ops = { .init = nft_meta_get_init, .dump = nft_meta_get_dump, .validate = nft_meta_get_validate, + .offload = nft_meta_get_offload, }; static const struct nft_expr_ops nft_meta_set_ops = { @@ -544,6 +546,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); +#ifdef CONFIG_NF_TABLES_BRIDGE + if (ctx->family == NFPROTO_BRIDGE) + return ERR_PTR(-EAGAIN); +#endif if (tb[NFTA_META_DREG]) return &nft_meta_get_ops; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 680bd9f38a81..22a80eb60222 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -15,10 +15,13 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> /* For layer 4 checksum field offset. */ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmpv6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> /* add vlan header into the user buffer for if tag was removed by offloads */ static bool @@ -150,12 +153,195 @@ nla_put_failure: return -1; } +static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ethhdr, h_source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + src, ETH_ALEN, reg); + break; + case offsetof(struct ethhdr, h_dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + dst, ETH_ALEN, reg); + break; + } + + return 0; +} + +static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct iphdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, protocol): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ipv6hdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, nexthdr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_nh(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.l3num) { + case htons(ETH_P_IP): + err = nft_payload_offload_ip(ctx, flow, priv); + break; + case htons(ETH_P_IPV6): + err = nft_payload_offload_ip6(ctx, flow, priv); + break; + default: + return 
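The payload offload handlers recognize a small set of (header base, byte offset) pairs by comparing against offsetof() into the respective header structs and translate them into named dissector fields; any other offset cannot be expressed for hardware. A sketch of that lookup using local header layouts that mirror struct iphdr and the TCP/UDP source/destination port pair; the string names stand in for the FLOW_DISSECTOR_KEY_* selections.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal header layouts, only to demonstrate the offsetof() matching. */
struct ipv4hdr {
    uint8_t  ver_ihl, tos;
    uint16_t tot_len, id, frag_off;
    uint8_t  ttl, protocol;
    uint16_t check;
    uint32_t saddr, daddr;
};

struct l4_ports {
    uint16_t source, dest;
};

enum base { NETWORK_HDR, TRANSPORT_HDR };

/* Map a (base, offset) pair to a dissector field the way
 * nft_payload_offload_ip()/_tcp() do, or return NULL if the field
 * has no hardware representation. */
static const char *payload_to_dissector(enum base base, unsigned int offset)
{
    switch (base) {
    case NETWORK_HDR:
        if (offset == offsetof(struct ipv4hdr, saddr))
            return "ipv4.src";
        if (offset == offsetof(struct ipv4hdr, daddr))
            return "ipv4.dst";
        if (offset == offsetof(struct ipv4hdr, protocol))
            return "basic.ip_proto";
        break;
    case TRANSPORT_HDR:
        if (offset == offsetof(struct l4_ports, source))
            return "ports.src";
        if (offset == offsetof(struct l4_ports, dest))
            return "ports.dst";
        break;
    }
    return NULL;    /* arbitrary offsets fall back to software */
}

int main(void)
{
    printf("%s\n", payload_to_dissector(NETWORK_HDR, 12));      /* ipv4.src */
    printf("%s\n", payload_to_dissector(TRANSPORT_HDR, 2));     /* ports.dst */
    printf("%p\n", (void *)payload_to_dissector(NETWORK_HDR, 6)); /* NULL */
    return 0;
}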
-EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct tcphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct tcphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct udphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct udphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_th(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.protonum) { + case IPPROTO_TCP: + err = nft_payload_offload_tcp(ctx, flow, priv); + break; + case IPPROTO_UDP: + err = nft_payload_offload_udp(ctx, flow, priv); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + int err; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + err = nft_payload_offload_ll(ctx, flow, priv); + break; + case NFT_PAYLOAD_NETWORK_HEADER: + err = nft_payload_offload_nh(ctx, flow, priv); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + err = nft_payload_offload_th(ctx, flow, priv); + break; + default: + err = -EOPNOTSUPP; + break; + } + return err; +} + static const struct nft_expr_ops nft_payload_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; const struct nft_expr_ops nft_payload_fast_ops = { @@ -164,6 +350,7 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) @@ -240,7 +427,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, tsum)); } - if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) || skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -256,7 +443,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return -1; nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -309,7 +496,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, goto err; } - if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + if (skb_ensure_writable(skb, max(offset + priv->len, 0)) || skb_store_bits(skb, offset, src, priv->len) < 0) goto err; diff --git a/net/netfilter/nft_synproxy.c 
b/net/netfilter/nft_synproxy.c new file mode 100644 index 000000000000..80060ade8a5b --- /dev/null +++ b/net/netfilter/nft_synproxy.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/netlink.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_synproxy.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_synproxy.h> + +struct nft_synproxy { + struct nf_synproxy_info info; +}; + +static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { + [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, + [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, + [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, +}; + +static void nft_synproxy_tcp_options(struct synproxy_options *opts, + const struct tcphdr *tcp, + struct synproxy_net *snet, + struct nf_synproxy_info *info, + struct nft_synproxy *priv) +{ + this_cpu_inc(snet->stats->syn_received); + if (tcp->ece && tcp->cwr) + opts->options |= NF_SYNPROXY_OPT_ECN; + + opts->options &= priv->info.options; + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, opts); + else + opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM | + NF_SYNPROXY_OPT_ECN); +} + +static void nft_synproxy_eval_v4(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} + +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +static void nft_synproxy_eval_v6(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack_ipv6(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} +#endif /* CONFIG_NF_TABLES_IPV6*/ + +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct synproxy_options opts = {}; + struct sk_buff *skb = pkt->skb; + int thoff = pkt->xt.thoff; + const struct tcphdr *tcp; + struct tcphdr _tcph; + + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + 
return; + } + + if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { + regs->verdict.code = NF_DROP; + return; + } + + tcp = skb_header_pointer(skb, pkt->xt.thoff, + sizeof(struct tcphdr), + &_tcph); + if (!tcp) { + regs->verdict.code = NF_DROP; + return; + } + + if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { + regs->verdict.code = NF_DROP; + return; + } + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case htons(ETH_P_IPV6): + nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#endif + } + regs->verdict.code = NFT_BREAK; +} + +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + struct nft_synproxy *priv = nft_expr_priv(expr); + u32 flags; + int err; + + if (tb[NFTA_SYNPROXY_MSS]) + priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); + if (tb[NFTA_SYNPROXY_WSCALE]) + priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); + if (tb[NFTA_SYNPROXY_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); + if (flags & ~NF_SYNPROXY_OPT_MASK) + return -EOPNOTSUPP; + priv->info.options = flags; + } + + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err) + return err; + + switch (ctx->family) { + case NFPROTO_IPV4: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; + } + + return 0; + +nf_ct_failure: + nf_ct_netns_put(ctx->net, ctx->family); + return err; +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + + switch (ctx->family) { + case NFPROTO_IPV4: + nf_synproxy_ipv4_fini(snet, ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; + } + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || + nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || + nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); +} + +static struct nft_expr_type nft_synproxy_type; +static const struct nft_expr_ops nft_synproxy_ops = { + .eval = nft_synproxy_eval, + .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), + .init = nft_synproxy_init, + .destroy = nft_synproxy_destroy, + .dump = nft_synproxy_dump, + .type = &nft_synproxy_type, + .validate = nft_synproxy_validate, +}; + +static 
struct nft_expr_type nft_synproxy_type __read_mostly = { + .ops = &nft_synproxy_ops, + .name = "synproxy", + .owner = THIS_MODULE, + .policy = nft_synproxy_policy, + .maxattr = NFTA_SYNPROXY_MAX, +}; + +static int __init nft_synproxy_module_init(void) +{ + return nft_register_expr(&nft_synproxy_type); +} + +static void __exit nft_synproxy_module_exit(void) +{ + return nft_unregister_expr(&nft_synproxy_type); +} + +module_init(nft_synproxy_module_init); +module_exit(nft_synproxy_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); +MODULE_ALIAS_NFT_EXPR("synproxy"); diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 06dc55590441..51b454d8fa9c 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -17,7 +17,8 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; - if ((protocol == 0 && !csum_fold(skb->csum)) || + if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP && + !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len - dataoff, protocol, skb->csum)) { @@ -26,7 +27,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } /* fall through */ case CHECKSUM_NONE: - if (protocol == 0) + if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; else skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index b1054a3d18c5..eababc354ff1 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -31,7 +31,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), @@ -49,7 +49,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), @@ -79,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 8221a5ce44bf..7873b834c300 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -29,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ipt_TTL_info *info = par->targinfo; int new_ttl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); @@ -69,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) const struct ip6t_HL_info *info = par->targinfo; int new_hl; - if (!skb_make_writable(skb, skb->len)) + if 
(skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 0b3a1b291c91..122db9fbb9f4 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (par->fragoff != 0) return 0; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 666f4ca9b15f..30e99464171b 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -28,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, - unsigned int tcphoff, unsigned int minlen) + unsigned int tcphoff) { const struct xt_tcpoptstrip_target_info *info = par->targinfo; + struct tcphdr *tcph, _th; unsigned int optl, i, j; - struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len, tcp_hdrlen; + int tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return XT_CONTINUE; - if (!skb_make_writable(skb, skb->len)) + tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th); + if (!tcph) return NF_DROP; - len = skb->len - tcphoff; - if (len < (int)sizeof(struct tcphdr)) - return NF_DROP; - - tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; + if (tcp_hdrlen < sizeof(struct tcphdr)) + return NF_DROP; - if (len < tcp_hdrlen) + if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen)) return NF_DROP; - opt = (u_int8_t *)tcph; + /* must reload tcph, might have been moved */ + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u8 *)tcph; /* * Walk through all TCP options - if we find some option to remove, @@ -88,8 +88,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), - sizeof(struct iphdr) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } #if IS_ENABLED(CONFIG_IP6_NF_MANGLE) @@ -106,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par, tcphoff, - sizeof(*ipv6h) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, tcphoff); } #endif diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index 140ce6be639a..0c9e014e30b4 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -2,7 +2,7 @@ /* * xt_iprange - Netfilter module to match IP address ranges * - * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) CC Computer Consultants GmbH, 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -130,7 +130,7 @@ static void __exit iprange_mt_exit(void) module_init(iprange_mt_init); module_exit(iprange_mt_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); MODULE_ALIAS("ipt_iprange"); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 95f64a99e425..e85ce69924ae 100644 --- 
a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -22,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par) struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; + if (info->match & ~XT_OWNER_MASK) + return -EINVAL; + /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. */ @@ -88,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index f099228cb9c4..ecbfa291fb70 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -2,7 +2,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module which implements the set match and SET target @@ -18,7 +18,7 @@ #include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("Xtables: IP set match and target module"); MODULE_ALIAS("xt_SET"); MODULE_ALIAS("ipt_set"); @@ -436,6 +436,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; ip_set_id_t index; + int ret = 0; if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, @@ -453,17 +454,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find del_set index %u as target\n", info->del_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_add; } } if (info->map_set.index != IPSET_INVALID_ID) { if (strncmp(par->table, "mangle", 7)) { pr_info_ratelimited("--map-set only usable from mangle table\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && @@ -471,20 +471,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find map_set index %u as target\n", info->map_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - 
info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->del_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_del; } } @@ -492,16 +488,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { pr_info_ratelimited("SET target dimension over the limit!\n"); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->del_set.index); - if (info->map_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->map_set.index); - return -ERANGE; + ret = -ERANGE; + goto cleanup_mark; } return 0; +cleanup_mark: + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +cleanup_del: + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +cleanup_add: + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return ret; } static void diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e9ddfd782d16..90b2ab9dd449 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -241,13 +241,8 @@ static __net_init int netlink_tap_init_net(struct net *net) return 0; } -static void __net_exit netlink_tap_exit_net(struct net *net) -{ -} - static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, - .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; @@ -2544,12 +2539,10 @@ struct nl_seq_iter { int link; }; -static int netlink_walk_start(struct nl_seq_iter *iter) +static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); - - return 0; } static void netlink_walk_stop(struct nl_seq_iter *iter) @@ -2565,8 +2558,6 @@ static void *__netlink_seq_next(struct seq_file *seq) do { for (;;) { - int err; - nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { @@ -2583,9 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq) if (++iter->link >= MAX_LINKS) return NULL; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); @@ -2597,13 +2586,10 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; - int err; iter->link = 0; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 86b87925ef34..96740d389377 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -869,7 +869,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) unsigned short frametype, flags, window, timeout; int ret; - skb->sk = NULL; /* Initially we don't know who it's for */ + skb_orphan(skb); /* * skb->data points to the netrom frame start @@ -968,6 +968,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) window = skb->data[20]; skb->sk = make; + skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; /* Fill in his circuit details */ diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 0a0c265baaa4..ce3382be937f 100644 --- 
a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -107,7 +107,7 @@ static int nci_queue_tx_data_frags(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { rc = -EPROTO; - goto free_exit; + goto exit; } __skb_queue_head_init(&frags_q); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 151518dbabad..3572e11b6f21 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -160,50 +160,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len); -static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, - __be16 ethertype) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be16 diff[] = { ~(hdr->h_proto), ethertype }; - - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); - } - - hdr->h_proto = ethertype; -} - static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { - struct mpls_shim_hdr *new_mpls_lse; - - /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ - if (skb->encapsulation) - return -ENOTSUPP; - - if (skb_cow_head(skb, MPLS_HLEN) < 0) - return -ENOMEM; - - if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, skb->mac_len); - skb_set_inner_protocol(skb, skb->protocol); - } - - skb_push(skb, MPLS_HLEN); - memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); - - new_mpls_lse = mpls_hdr(skb); - new_mpls_lse->label_stack_entry = mpls->mpls_lse; - - skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); + int err; - if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) - update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); - skb->protocol = mpls->mpls_ethertype; + err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype); + if (err) + return err; invalidate_flow_key(key); return 0; @@ -214,31 +178,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); - if (unlikely(err)) + err = skb_mpls_pop(skb, ethertype); + if (err) return err; - skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); - - memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); - - __skb_pull(skb, MPLS_HLEN); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); - - if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) { - struct ethhdr *hdr; - - /* mpls_hdr() is used to locate the ethertype field correctly in the - * presence of VLAN tags. 
- */ - hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); - update_ethertype(skb, hdr, ethertype); - } - if (eth_p_mpls(skb->protocol)) - skb->protocol = ethertype; - invalidate_flow_key(key); return 0; } @@ -250,20 +193,12 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, __be32 lse; int err; - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); - if (unlikely(err)) - return err; - stack = mpls_hdr(skb); lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(stack->label_stack_entry), lse }; - - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); - } + err = skb_mpls_update_lse(skb, lse); + if (err) + return err; - stack->label_stack_entry = lse; flow_key->mpls.top_lse = lse; return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6747bc57b6fa..33b388103741 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1334,7 +1334,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { - if (likely(!IS_ERR(reply))) { + if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 53cf07d141b4..7af0cde8b293 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -48,7 +48,7 @@ void ovs_dp_notify_wq(struct work_struct *work) if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; - if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 52a1ed9633ec..57d6436e6f6a 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -156,7 +156,7 @@ void ovs_netdev_detach_dev(struct vport *vport) static void netdev_destroy(struct vport *vport) { rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); @@ -166,7 +166,7 @@ static void netdev_destroy(struct vport *vport) void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and @@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. 
*/ struct vport *ovs_netdev_get_vport(struct net_device *dev) { - if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f927de9bda0a..3fc38d16c456 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -248,8 +248,6 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) */ void ovs_vport_del(struct vport *vport) { - ASSERT_OVSL(); - hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5f78df080573..8d54f3047768 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) smp_wmb(); } -static int __packet_get_status(struct packet_sock *po, void *frame) +static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; @@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, return ts_status; } -static void *packet_lookup_frame(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int position, - int status) +static void *packet_lookup_frame(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int position, + int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; @@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; - if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -1003,7 +1003,6 @@ static void prb_fill_curr_block(char *curr, /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, - int status, unsigned int len ) { @@ -1075,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po, po->rx_ring.head, status); return curr; case TPACKET_V3: - return __packet_lookup_frame_in_block(po, skb, status, len); + return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); @@ -1083,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po, } } -static void *prb_lookup_block(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int idx, - int status) +static void *prb_lookup_block(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int idx, + int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); @@ -1199,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po) #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 -static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.frame_max + 1; - idx = po->rx_ring.head; + len = READ_ONCE(po->rx_ring.frame_max) + 1; + idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1212,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off) return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) 
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.prb_bdqc.knum_blocks; - idx = po->rx_ring.prb_bdqc.kactive_blk_num; + len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); + idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1225,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(const struct packet_sock *po, + const struct sk_buff *skb) { - struct sock *sk = &po->sk; + const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { - int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - (skb ? skb->truesize : 0); - if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + + if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; @@ -1258,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int ret; - bool has_room; + int pressure, ret; - spin_lock_bh(&po->sk.sk_receive_queue.lock); ret = __packet_rcv_has_room(po, skb); - has_room = ret == ROOM_NORMAL; - if (po->pressure == has_room) - po->pressure = !has_room; - spin_unlock_bh(&po->sk.sk_receive_queue.lock); + pressure = ret != ROOM_NORMAL; + + if (READ_ONCE(po->pressure) != pressure) + WRITE_ONCE(po->pressure, pressure); return ret; } +static void packet_rcv_try_clear_pressure(struct packet_sock *po) +{ + if (READ_ONCE(po->pressure) && + __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); +} + static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); @@ -1351,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(f->arr[i]); - if (po_next != po_skip && !po_next->pressure && + if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -2126,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2193,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!res) goto drop_n_restore; + /* If we are flooded, just give up */ + if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { + atomic_inc(&po->tp_drops); + goto drop_n_restore; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && @@ -2263,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. 
*/ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } @@ -2379,9 +2390,9 @@ drop: return 0; drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -3318,8 +3329,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; - if (pkt_sk(sk)->pressure) - packet_rcv_has_room(pkt_sk(sk), NULL); + packet_rcv_try_clear_pressure(pkt_sk(sk)); if (pkt_sk(sk)->has_vnet_hdr) { err = packet_rcv_vnet(msg, skb, &len); @@ -3891,6 +3901,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3907,14 +3918,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; } @@ -4133,8 +4147,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - po->pressure = 0; + packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { diff --git a/net/packet/internal.h b/net/packet/internal.h index c70a2794456f..82fb2b10f790 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -132,6 +132,7 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; + atomic_t tp_drops ____cacheline_aligned_in_smp; }; static struct packet_sock *pkt_sk(struct sock *sk) diff --git a/net/rds/ib.c b/net/rds/ib.c index b8d581b779b2..ec05d91aa9a2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); + iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } @@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); + iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f9f4721cdfa7..d09eaf153544 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -545,6 +545,7 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) switch (rx->sk.sk_state) { case RXRPC_UNBOUND: + case RXRPC_CLIENT_UNBOUND: rx->srx.srx_family = AF_RXRPC; rx->srx.srx_service = 0; rx->srx.transport_type = SOCK_DGRAM; @@ -569,10 +570,9 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } 
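A number of the hunks above (nft_payload.c, xt_DSCP.c, xt_HL.c, xt_TCPMSS.c, xt_TCPOPTSTRIP.c) replace skb_make_writable() with skb_ensure_writable(). The two helpers have opposite calling conventions: the old one returned true on success, while skb_ensure_writable() returns 0 on success and a negative errno on failure, which is why every converted call site also drops the leading '!'. A minimal sketch of a converted call site, assuming the xt_DSCP-style context shown above (the function name and DSCP value are illustrative only):

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>
#include <net/dsfield.h>

/* Make the first sizeof(struct iphdr) bytes private and writable before
 * editing the header in place.  skb_ensure_writable() returns 0 on success
 * and a negative errno on failure, unlike the old bool skb_make_writable().
 */
static unsigned int example_dscp_mangle(struct sk_buff *skb)
{
	struct iphdr *iph;

	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		return NF_DROP;			/* could not unshare/pull the header */

	iph = ip_hdr(skb);			/* reload: header data may have moved */
	ipv4_change_dsfield(iph, 0, 0x2e << 2);	/* illustrative: set DSCP EF */
	return XT_CONTINUE;
}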
rx->local = local; - rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + rx->sk.sk_state = RXRPC_CLIENT_BOUND; /* Fall through */ - case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: if (!m->msg_name && test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a0b6abfbd277..948e3fe249ec 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -519,6 +519,9 @@ send_fragmentable: } break; #endif + + default: + BUG(); } if (ret < 0) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2c72d95c3050..dd55b9ac3a66 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -842,6 +842,17 @@ config NET_ACT_CSUM To compile this code as a module, choose M here: the module will be called act_csum. +config NET_ACT_MPLS + tristate "MPLS manipulation" + depends on NET_CLS_ACT + help + Say Y here to push or pop MPLS headers. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_mpls. + config NET_ACT_VLAN tristate "Vlan manipulation" depends on NET_CLS_ACT @@ -877,6 +888,23 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_CTINFO + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help + Say Y here to allow transfer of a connmark stored information. + Current actions transfer connmark stored DSCP into + ipv4/v6 diffserv and/or to transfer connmark to packet + mark. Both are useful for restoring egress based marks + back onto ingress connections for qdisc priority mapping + purposes. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ctinfo. + config NET_ACT_SKBMOD tristate "skb data modification action" depends on NET_CLS_ACT @@ -912,6 +940,17 @@ config NET_ACT_TUNNEL_KEY To compile this code as a module, choose M here: the module will be called act_tunnel_key. +config NET_ACT_CT + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK + help + Say Y here to allow sending the packets to conntrack module. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ct. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE @@ -924,14 +963,6 @@ config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE -config NET_CLS_IND - bool "Incoming device classification" - depends on NET_CLS_U32 || NET_CLS_FW - ---help--- - Say Y here to extend the u32 and fw classifier to support - classification based on the incoming device. This option is - likely to disappear in favour of the metadata ematch. 
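The af_packet.c hunks above move drop accounting out of the field inside po->stats, which had to be updated under sk_receive_queue.lock, and into a dedicated atomic_t (po->tp_drops), while the ring "pressure" flag becomes plain READ_ONCE()/WRITE_ONCE() accesses. A compressed sketch of that lockless counter pattern, using a hypothetical stats struct in place of struct packet_sock:

#include <linux/atomic.h>

struct example_stats {
	unsigned int tp_packets;
	atomic_t     tp_drops;		/* bumped locklessly on the drop path */
};

static void example_note_drop(struct example_stats *st)
{
	atomic_inc(&st->tp_drops);	/* no receive-queue lock needed */
}

/* Reader side: drain the counter exactly once, the way packet_getsockopt()
 * above uses atomic_xchg(&po->tp_drops, 0) when reporting statistics. */
static unsigned int example_read_and_reset_drops(struct example_stats *st)
{
	return atomic_xchg(&st->tp_drops, 0);
}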
- endif # NET_SCHED config NET_SCH_FIFO diff --git a/net/sched/Makefile b/net/sched/Makefile index 8a40431d7b5c..415d1e1f237e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -18,15 +18,18 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o +obj-$(CONFIG_NET_ACT_CT) += act_ct.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4e5d2e9ace5d..339712296164 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -221,12 +221,13 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; + unsigned long tmp; mutex_lock(&idrinfo->lock); s_i = cb->args[0]; - idr_for_each_entry_ul(idr, p, id) { + idr_for_each_entry_ul(idr, p, tmp, id) { index++; if (index < s_i) continue; @@ -292,6 +293,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; + unsigned long tmp; nest = nla_nest_start_noflag(skb, 0); if (nest == NULL) @@ -300,7 +302,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, goto nla_put_failure; mutex_lock(&idrinfo->lock); - idr_for_each_entry_ul(idr, p, id) { + idr_for_each_entry_ul(idr, p, tmp, id) { ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); @@ -533,8 +535,9 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tc_action *p; int ret; unsigned long id = 1; + unsigned long tmp; - idr_for_each_entry_ul(idr, p, id) { + idr_for_each_entry_ul(idr, p, tmp, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c new file mode 100644 index 000000000000..b501ce0cf116 --- /dev/null +++ b/net/sched/act_ct.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* - + * net/sched/act_ct.c Connection Tracking action + * + * Authors: Paul Blakey <paulb@mellanox.com> + * Yossi Kuperman <yossiku@mellanox.com> + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/pkt_cls.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <net/act_api.h> +#include <net/ip.h> +#include <net/ipv6_frag.h> +#include <uapi/linux/tc_act/tc_ct.h> +#include <net/tc_act/tc_ct.h> + +#include <linux/netfilter/nf_nat.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include 
<net/netfilter/ipv6/nf_defrag_ipv6.h> + +static struct tc_action_ops act_ct_ops; +static unsigned int ct_net_id; + +struct tc_ct_action_net { + struct tc_action_net tn; /* Must be first */ + bool labels; +}; + +/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ +static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, + u16 zone_id, bool force) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (nf_ct_zone(ct)->id != zone_id) + return false; + + /* Force conntrack entry direction. */ + if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + if (nf_ct_is_confirmed(ct)) + nf_ct_kill(ct); + + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return false; + } + + return true; +} + +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. + */ +static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) +{ + unsigned int len; + int err; + + switch (family) { + case NFPROTO_IPV4: + len = ntohs(ip_hdr(skb)->tot_len); + break; + case NFPROTO_IPV6: + len = sizeof(struct ipv6hdr) + + ntohs(ipv6_hdr(skb)->payload_len); + break; + default: + len = skb->len; + } + + err = pskb_trim_rcsum(skb, len); + + return err; +} + +static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) +{ + u8 family = NFPROTO_UNSPEC; + + switch (skb->protocol) { + case htons(ETH_P_IP): + family = NFPROTO_IPV4; + break; + case htons(ETH_P_IPV6): + family = NFPROTO_IPV6; + break; + default: + break; + } + + return family; +} + +static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag) +{ + unsigned int len; + + len = skb_network_offset(skb) + sizeof(struct iphdr); + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + *frag = ip_is_fragment(ip_hdr(skb)); + return 0; +} + +static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag) +{ + unsigned int flags = 0, len, payload_ofs = 0; + unsigned short frag_off; + int nexthdr; + + len = skb_network_offset(skb) + sizeof(struct ipv6hdr); + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); + if (unlikely(nexthdr < 0)) + return -EPROTO; + + *frag = flags & IP6_FH_F_FRAG; + return 0; +} + +static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u8 family, u16 zone) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int err = 0; + bool frag; + + /* Previously seen (loopback)? Ignore. 
*/ + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) + return 0; + + if (family == NFPROTO_IPV4) + err = tcf_ct_ipv4_is_fragment(skb, &frag); + else + err = tcf_ct_ipv6_is_fragment(skb, &frag); + if (err || !frag) + return err; + + skb_get(skb); + + if (family == NFPROTO_IPV4) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(net, skb, user); + local_bh_enable(); + if (err && err != -EINPROGRESS) + goto out_free; + } else { /* NFPROTO_IPV6 */ +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err && err != -EINPROGRESS) + goto out_free; +#else + err = -EOPNOTSUPP; + goto out_free; +#endif + } + + skb_clear_hash(skb); + skb->ignore_df = 1; + return err; + +out_free: + kfree_skb(skb); + return err; +} + +static void tcf_ct_params_free(struct rcu_head *head) +{ + struct tcf_ct_params *params = container_of(head, + struct tcf_ct_params, rcu); + + if (params->tmpl) + nf_conntrack_put(¶ms->tmpl->ct_general); + kfree(params); +} + +#if IS_ENABLED(CONFIG_NF_NAT) +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (skb->protocol == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + /* fall through */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? 
nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +out: + return err; +} +#endif /* CONFIG_NF_NAT */ + +static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + u32 new_mark; + + if (!mask) + return; + + new_mark = mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + if (nf_ct_is_confirmed(ct)) + nf_conntrack_event_cache(IPCT_MARK, ct); + } +#endif +} + +static void tcf_ct_act_set_labels(struct nf_conn *ct, + u32 *labels, + u32 *labels_m) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) + size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels); + + if (!memchr_inv(labels_m, 0, labels_sz)) + return; + + nf_connlabels_replace(ct, labels, labels_m, 4); +#endif +} + +static int tcf_ct_act_nat(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + int ct_action, + struct nf_nat_range2 *range, + bool commit) +{ +#if IS_ENABLED(CONFIG_NF_NAT) + enum nf_nat_manip_type maniptype; + + if (!(ct_action & TCA_CT_ACT_NAT)) + return NF_ACCEPT; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. */ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & TCA_CT_ACT_NAT_SRC) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & TCA_CT_ACT_NAT_DST) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + return ct_nat_execute(skb, ct, ctinfo, range, maniptype); +#else + return NF_ACCEPT; +#endif +} + +static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct net *net = dev_net(skb->dev); + bool cached, commit, clear, force; + enum ip_conntrack_info ctinfo; + struct tcf_ct *c = to_ct(a); + struct nf_conn *tmpl = NULL; + struct nf_hook_state state; + int nh_ofs, err, retval; + struct tcf_ct_params *p; + struct nf_conn *ct; + u8 family; + + p = rcu_dereference_bh(c->params); + + retval = READ_ONCE(c->tcf_action); + commit = p->ct_action & TCA_CT_ACT_COMMIT; + clear = p->ct_action & TCA_CT_ACT_CLEAR; + force = p->ct_action & TCA_CT_ACT_FORCE; + tmpl = p->tmpl; + + if (clear) { + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + } + + goto out; + } + + family = tcf_ct_skb_nf_family(skb); + if (family == NFPROTO_UNSPEC) + goto drop; + + /* The conntrack module expects to be working at L3. 
+ * We also try to pull the IPv4/6 header to linear area + */ + nh_ofs = skb_network_offset(skb); + skb_pull_rcsum(skb, nh_ofs); + err = tcf_ct_handle_fragments(net, skb, family, p->zone); + if (err == -EINPROGRESS) { + retval = TC_ACT_STOLEN; + goto out; + } + if (err) + goto drop; + + err = tcf_ct_skb_network_trim(skb, family); + if (err) + goto drop; + + /* If we are recirculating packets to match on ct fields and + * committing with a separate ct action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); + if (!cached) { + /* Associate skb with specified zone. */ + if (tmpl) { + ct = nf_ct_get(skb, &ctinfo); + if (skb_nfct(skb)) + nf_conntrack_put(skb_nfct(skb)); + nf_conntrack_get(&tmpl->ct_general); + nf_ct_set(skb, tmpl, IP_CT_NEW); + } + + state.hook = NF_INET_PRE_ROUTING; + state.net = net; + state.pf = family; + err = nf_conntrack_in(skb, &state); + if (err != NF_ACCEPT) + goto out_push; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + goto out_push; + nf_ct_deliver_cached_events(ct); + + err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit); + if (err != NF_ACCEPT) + goto drop; + + if (commit) { + tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); + tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); + + /* This will take care of sending queued events + * even if the connection is already confirmed. + */ + nf_conntrack_confirm(skb); + } + +out_push: + skb_push_rcsum(skb, nh_ofs); + +out: + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + return retval; + +drop: + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + return TC_ACT_SHOT; +} + +static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { + [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 }, + [TCA_CT_ACTION] = { .type = NLA_U16 }, + [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, + [TCA_CT_ZONE] = { .type = NLA_U16 }, + [TCA_CT_MARK] = { .type = NLA_U32 }, + [TCA_CT_MARK_MASK] = { .type = NLA_U32 }, + [TCA_CT_LABELS] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 }, + [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 }, + [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr) }, + [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr) }, + [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, + [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, + struct tc_ct *parm, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct nf_nat_range2 *range; + + if (!(p->ct_action & TCA_CT_ACT_NAT)) + return 0; + + if (!IS_ENABLED(CONFIG_NF_NAT)) { + NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel"); + return -EOPNOTSUPP; + } + + if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) + return 0; + + if ((p->ct_action & TCA_CT_ACT_NAT_SRC) && + (p->ct_action & TCA_CT_ACT_NAT_DST)) { + NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time"); + return -EOPNOTSUPP; + } + + range = &p->range; + if (tb[TCA_CT_NAT_IPV4_MIN]) { + struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX]; + + p->ipv4_range = true; + range->flags |= NF_NAT_RANGE_MAP_IPS; + range->min_addr.ip = + nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]); + + range->max_addr.ip = max_attr ? 
+ nla_get_in_addr(max_attr) : + range->min_addr.ip; + } else if (tb[TCA_CT_NAT_IPV6_MIN]) { + struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX]; + + p->ipv4_range = false; + range->flags |= NF_NAT_RANGE_MAP_IPS; + range->min_addr.in6 = + nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]); + + range->max_addr.in6 = max_attr ? + nla_get_in6_addr(max_attr) : + range->min_addr.in6; + } + + if (tb[TCA_CT_NAT_PORT_MIN]) { + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]); + + range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ? + nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) : + range->min_proto.all; + } + + return 0; +} + +static void tcf_ct_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, + int len) +{ + if (!tb[val_type]) + return; + nla_memcpy(val, tb[val_type], len); + + if (!mask) + return; + + if (mask_type == TCA_CT_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + nla_memcpy(mask, tb[mask_type], len); +} + +static int tcf_ct_fill_params(struct net *net, + struct tcf_ct_params *p, + struct tc_ct *parm, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + struct nf_conntrack_zone zone; + struct nf_conn *tmpl; + int err; + + p->zone = NF_CT_DEFAULT_ZONE_ID; + + tcf_ct_set_key_val(tb, + &p->ct_action, TCA_CT_ACTION, + NULL, TCA_CT_UNSPEC, + sizeof(p->ct_action)); + + if (p->ct_action & TCA_CT_ACT_CLEAR) + return 0; + + err = tcf_ct_fill_params_nat(p, parm, tb, extack); + if (err) + return err; + + if (tb[TCA_CT_MARK]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled."); + return -EOPNOTSUPP; + } + tcf_ct_set_key_val(tb, + &p->mark, TCA_CT_MARK, + &p->mark_mask, TCA_CT_MARK_MASK, + sizeof(p->mark)); + } + + if (tb[TCA_CT_LABELS]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled."); + return -EOPNOTSUPP; + } + + if (!tn->labels) { + NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length"); + return -EOPNOTSUPP; + } + tcf_ct_set_key_val(tb, + p->labels, TCA_CT_LABELS, + p->labels_mask, TCA_CT_LABELS_MASK, + sizeof(p->labels)); + } + + if (tb[TCA_CT_ZONE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled."); + return -EOPNOTSUPP; + } + + tcf_ct_set_key_val(tb, + &p->zone, TCA_CT_ZONE, + NULL, TCA_CT_UNSPEC, + sizeof(p->zone)); + } + + if (p->zone == NF_CT_DEFAULT_ZONE_ID) + return 0; + + nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0); + tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL); + if (!tmpl) { + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); + return -ENOMEM; + } + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + p->tmpl = tmpl; + + return 0; +} + +static int tcf_ct_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int replace, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + struct tcf_ct_params *params = NULL; + struct nlattr *tb[TCA_CT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tc_ct *parm; + struct tcf_ct *c; + int err, res = 0; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); + if (err < 0) + return err; 
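For reference, the manip-type selection in tcf_ct_act_nat() above reduces to a small decision table: reply-direction packets of a connection that already carries NAT state get the inverse manipulation of the one applied in the original direction. A minimal stand-alone sketch of that rule (user-space mock-up with hypothetical enum values, not part of the patch):

#include <stdio.h>

/* Sketch only: mirrors the direction logic in tcf_ct_act_nat(). */
enum manip { MANIP_SRC, MANIP_DST };

static enum manip pick_manip(int reply_dir, int did_src_nat)
{
	if (reply_dir)
		/* Undo the original-direction translation. */
		return did_src_nat ? MANIP_DST : MANIP_SRC;
	return did_src_nat ? MANIP_SRC : MANIP_DST;
}

int main(void)
{
	/* SNAT was applied on the original direction; the reply needs DNAT. */
	printf("%s\n", pick_manip(1, 1) == MANIP_DST ? "dst" : "src");
	return 0;
}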
+ + if (!tb[TCA_CT_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters"); + return -EINVAL; + } + parm = nla_data(tb[TCA_CT_PARMS]); + + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + + if (!err) { + err = tcf_idr_create(tn, parm->index, est, a, + &act_ct_ops, bind, true); + if (err) { + tcf_idr_cleanup(tn, parm->index); + return err; + } + res = ACT_P_CREATED; + } else { + if (bind) + return 0; + + if (!replace) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto cleanup; + + c = to_ct(*a); + + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (unlikely(!params)) { + err = -ENOMEM; + goto cleanup; + } + + err = tcf_ct_fill_params(net, params, parm, tb, extack); + if (err) + goto cleanup; + + spin_lock_bh(&c->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock)); + spin_unlock_bh(&c->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (params) + kfree_rcu(params, rcu); + if (res == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return res; + +cleanup: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + kfree(params); + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_ct_cleanup(struct tc_action *a) +{ + struct tcf_ct_params *params; + struct tcf_ct *c = to_ct(a); + + params = rcu_dereference_protected(c->params, 1); + if (params) + call_rcu(¶ms->rcu, tcf_ct_params_free); +} + +static int tcf_ct_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, + int len) +{ + int err; + + if (mask && !memchr_inv(mask, 0, len)) + return 0; + + err = nla_put(skb, val_type, len, val); + if (err) + return err; + + if (mask_type != TCA_CT_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + + return 0; +} + +static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +{ + struct nf_nat_range2 *range = &p->range; + + if (!(p->ct_action & TCA_CT_ACT_NAT)) + return 0; + + if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) + return 0; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) { + if (p->ipv4_range) { + if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN, + range->min_addr.ip)) + return -1; + if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX, + range->max_addr.ip)) + return -1; + } else { + if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN, + &range->min_addr.in6)) + return -1; + if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX, + &range->max_addr.in6)) + return -1; + } + } + + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN, + range->min_proto.all)) + return -1; + if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX, + range->max_proto.all)) + return -1; + } + + return 0; +} + +static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ct *c = to_ct(a); + struct tcf_ct_params *p; + + struct tc_ct opt = { + .index = c->tcf_index, + .refcnt = refcount_read(&c->tcf_refcnt) - ref, + .bindcnt = atomic_read(&c->tcf_bindcnt) - bind, + }; + struct tcf_t t; + + spin_lock_bh(&c->tcf_lock); + p = rcu_dereference_protected(c->params, + lockdep_is_held(&c->tcf_lock)); + opt.action = c->tcf_action; + + if (tcf_ct_dump_key_val(skb, + &p->ct_action, TCA_CT_ACTION, + NULL, TCA_CT_UNSPEC, + sizeof(p->ct_action))) + goto nla_put_failure; + + if (p->ct_action 
& TCA_CT_ACT_CLEAR) + goto skip_dump; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + tcf_ct_dump_key_val(skb, + &p->mark, TCA_CT_MARK, + &p->mark_mask, TCA_CT_MARK_MASK, + sizeof(p->mark))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + tcf_ct_dump_key_val(skb, + p->labels, TCA_CT_LABELS, + p->labels_mask, TCA_CT_LABELS_MASK, + sizeof(p->labels))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + tcf_ct_dump_key_val(skb, + &p->zone, TCA_CT_ZONE, + NULL, TCA_CT_UNSPEC, + sizeof(p->zone))) + goto nla_put_failure; + + if (tcf_ct_dump_nat(skb, p)) + goto nla_put_failure; + +skip_dump: + if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + tcf_tm_dump(&t, &c->tcf_tm); + if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) + goto nla_put_failure; + spin_unlock_bh(&c->tcf_lock); + + return skb->len; +nla_put_failure: + spin_unlock_bh(&c->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ct_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + + return tcf_idr_search(tn, a, index); +} + +static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_ct *c = to_ct(a); + + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); +} + +static struct tc_action_ops act_ct_ops = { + .kind = "ct", + .id = TCA_ID_CT, + .owner = THIS_MODULE, + .act = tcf_ct_act, + .dump = tcf_ct_dump, + .init = tcf_ct_init, + .cleanup = tcf_ct_cleanup, + .walk = tcf_ct_walker, + .lookup = tcf_ct_search, + .stats_update = tcf_stats_update, + .size = sizeof(struct tcf_ct), +}; + +static __net_init int ct_init_net(struct net *net) +{ + unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8; + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + + if (nf_connlabels_get(net, n_bits - 1)) { + tn->labels = false; + pr_err("act_ct: Failed to set connlabels length"); + } else { + tn->labels = true; + } + + return tc_action_net_init(&tn->tn, &act_ct_ops); +} + +static void __net_exit ct_exit_net(struct list_head *net_list) +{ + struct net *net; + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + + if (tn->labels) + nf_connlabels_put(net); + } + rtnl_unlock(); + + tc_action_net_exit(net_list, ct_net_id); +} + +static struct pernet_operations ct_net_ops = { + .init = ct_init_net, + .exit_batch = ct_exit_net, + .id = &ct_net_id, + .size = sizeof(struct tc_ct_action_net), +}; + +static int __init ct_init_module(void) +{ + return tcf_register_action(&act_ct_ops, &ct_net_ops); +} + +static void __exit ct_cleanup_module(void) +{ + tcf_unregister_action(&act_ct_ops, &ct_net_ops); +} + +module_init(ct_init_module); +module_exit(ct_cleanup_module); +MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>"); +MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>"); +MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>"); +MODULE_DESCRIPTION("Connection tracking action"); 
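The mark update in tcf_ct_act_set_mark() above writes the new mark on top of the old one while preserving the old bits outside the mask (a zero mask is a no-op). A minimal user-space sketch of the same arithmetic, with made-up example values (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same rule as tcf_ct_act_set_mark(): new = mark | (old & ~mask). */
static uint32_t ct_mark_apply(uint32_t old_mark, uint32_t mark, uint32_t mask)
{
	if (!mask)
		return old_mark;	/* zero mask: leave the conntrack mark untouched */
	return mark | (old_mark & ~mask);
}

int main(void)
{
	/* hypothetical values: rewrite the low byte, keep the upper bits */
	printf("0x%08x\n", ct_mark_apply(0xabcd0011, 0x00000022, 0x000000ff));
	return 0;	/* prints 0xabcd0022 */
}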
+MODULE_LICENSE("GPL v2"); + diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c new file mode 100644 index 000000000000..10eb2bb99861 --- /dev/null +++ b/net/sched/act_ctinfo.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions + * + * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/pkt_cls.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> +#include <uapi/linux/tc_act/tc_ctinfo.h> +#include <net/tc_act/tc_ctinfo.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> + +static struct tc_action_ops act_ctinfo_ops; +static unsigned int ctinfo_net_id; + +static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb, int wlen, int proto) +{ + u8 dscp, newdscp; + + newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + ~INET_ECN_MASK; + + switch (proto) { + case NFPROTO_IPV4: + dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv4_change_dsfield(ip_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + case NFPROTO_IPV6: + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv6_change_dsfield(ipv6_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + default: + break; + } +} + +static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb) +{ + ca->stats_cpmark_set++; + skb->mark = ct->mark & cp->cpmarkmask; +} + +static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + const struct nf_conntrack_tuple_hash *thash = NULL; + struct tcf_ctinfo *ca = to_ctinfo(a); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; + enum ip_conntrack_info ctinfo; + struct tcf_ctinfo_params *cp; + struct nf_conn *ct; + int proto, wlen; + int action; + + cp = rcu_dereference_bh(ca->params); + + tcf_lastuse_update(&ca->tcf_tm); + bstats_update(&ca->tcf_bstats, skb); + action = READ_ONCE(ca->tcf_action); + + wlen = skb_network_offset(skb); + if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV4; + } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV6; + } else { + goto out; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { /* look harder, usually ingress */ + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + proto, cp->net, &tuple)) + goto out; + zone.id = cp->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(cp->net, &zone, &tuple); + if (!thash) + goto out; + + ct = nf_ct_tuplehash_to_ctrack(thash); + } + + if (cp->mode & CTINFO_MODE_DSCP) + if (!cp->dscpstatemask || (ct->mark & 
cp->dscpstatemask)) + tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); + + if (cp->mode & CTINFO_MODE_CPMARK) + tcf_ctinfo_cpmark_set(ct, ca, cp, skb); + + if (thash) + nf_ct_put(ct); +out: + return action; +} + +static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { + [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct + tc_ctinfo) }, + [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, + [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, +}; + +static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct nlattr *tb[TCA_CTINFO_MAX + 1]; + struct tcf_ctinfo_params *cp_new; + struct tcf_chain *goto_ch = NULL; + u32 dscpmask = 0, dscpstatemask; + struct tc_ctinfo *actparm; + struct tcf_ctinfo *ci; + u8 dscpmaskshift; + int ret = 0, err; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_CTINFO_ACT]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing required TCA_CTINFO_ACT attribute"); + return -EINVAL; + } + actparm = nla_data(tb[TCA_CTINFO_ACT]); + + /* do some basic validation here before dynamically allocating things */ + /* that we would otherwise have to clean up. */ + if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { + dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); + /* need contiguous 6 bit mask */ + dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; + if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_MASK], + "dscp mask must be 6 contiguous bits"); + return -EINVAL; + } + dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? + nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + /* mask & statemask must not overlap */ + if (dscpmask & dscpstatemask) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], + "dscp statemask must not overlap dscp mask"); + return -EINVAL; + } + } + + /* done the validation:now to the actual action allocation */ + err = tcf_idr_check_alloc(tn, &actparm->index, a, bind); + if (!err) { + ret = tcf_idr_create(tn, actparm->index, est, a, + &act_ctinfo_ops, bind, false); + if (ret) { + tcf_idr_cleanup(tn, actparm->index); + return ret; + } + ret = ACT_P_CREATED; + } else if (err > 0) { + if (bind) /* don't override defaults */ + return 0; + if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } else { + return err; + } + + err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + ci = to_ctinfo(*a); + + cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); + if (unlikely(!cp_new)) { + err = -ENOMEM; + goto put_chain; + } + + cp_new->net = net; + cp_new->zone = tb[TCA_CTINFO_ZONE] ? 
+ nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + if (dscpmask) { + cp_new->dscpmask = dscpmask; + cp_new->dscpmaskshift = dscpmaskshift; + cp_new->dscpstatemask = dscpstatemask; + cp_new->mode |= CTINFO_MODE_DSCP; + } + + if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { + cp_new->cpmarkmask = + nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); + cp_new->mode |= CTINFO_MODE_CPMARK; + } + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); + rcu_swap_protected(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (cp_new) + kfree_rcu(cp_new, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tc_ctinfo opt = { + .index = ci->tcf_index, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, + }; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ctinfo_params *cp; + struct tcf_t t; + + spin_lock_bh(&ci->tcf_lock); + cp = rcu_dereference_protected(ci->params, + lockdep_is_held(&ci->tcf_lock)); + + tcf_tm_dump(&t, &ci->tcf_tm); + if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) + goto nla_put_failure; + + opt.action = ci->tcf_action; + if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) + goto nla_put_failure; + + if (cp->mode & CTINFO_MODE_DSCP) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, + cp->dscpmask)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, + cp->dscpstatemask)) + goto nla_put_failure; + } + + if (cp->mode & CTINFO_MODE_CPMARK) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, + cp->cpmarkmask)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, + ci->stats_dscp_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, + ci->stats_dscp_error, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, + ci->stats_cpmark_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + spin_unlock_bh(&ci->tcf_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(&ci->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_ctinfo_ops = { + .kind = "ctinfo", + .id = TCA_ID_CTINFO, + .owner = THIS_MODULE, + .act = tcf_ctinfo_act, + .dump = tcf_ctinfo_dump, + .init = tcf_ctinfo_init, + .walk = tcf_ctinfo_walker, + .lookup = tcf_ctinfo_search, + .size = sizeof(struct tcf_ctinfo), +}; + +static __net_init int ctinfo_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tc_action_net_init(tn, &act_ctinfo_ops); +} + 
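tcf_ctinfo_dscp_set() above recovers the DSCP value stashed in the conntrack mark: the masked mark bits are shifted down to bit 0 and then left by two to line up with the DS field, and the packet's ECN bits are preserved. A small stand-alone sketch of that arithmetic, using an assumed mask layout purely for illustration (not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03	/* low two bits of the DS field carry ECN */

/* Same derivation as tcf_ctinfo_dscp_set(): ((mark & mask) >> shift) << 2. */
static uint8_t dscp_from_mark(uint32_t mark, uint32_t mask, unsigned int shift)
{
	return (uint8_t)((((mark & mask) >> shift) << 2) & ~ECN_MASK);
}

int main(void)
{
	/* assumed layout: DSCP stored in the top six bits of the mark */
	uint32_t mask = 0xfc000000;
	unsigned int shift = 26;	/* __ffs(0xfc000000) */

	printf("0x%02x\n", dscp_from_mark(0xb8000000, mask, shift));
	return 0;	/* prints 0xb8: DSCP 46 (EF) in the upper six bits of the DS byte */
}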
+static void __net_exit ctinfo_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, ctinfo_net_id); +} + +static struct pernet_operations ctinfo_net_ops = { + .init = ctinfo_init_net, + .exit_batch = ctinfo_exit_net, + .id = &ctinfo_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ctinfo_init_module(void) +{ + return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +static void __exit ctinfo_cleanup_module(void) +{ + tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +module_init(ctinfo_init_module); +module_exit(ctinfo_cleanup_module); +MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); +MODULE_DESCRIPTION("Connection tracking mark actions"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 58e7573dded4..055faa298c8e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -27,6 +27,9 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); +#define MIRRED_RECURSION_LIMIT 4 +static DEFINE_PER_CPU(unsigned int, mirred_rec_level); + static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; @@ -210,6 +213,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; + unsigned int rec_level; int retval, err = 0; bool use_reinsert; bool want_ingress; @@ -217,6 +221,14 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, int m_eaction; int mac_len; + rec_level = __this_cpu_inc_return(mirred_rec_level); + if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { + net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", + netdev_name(skb->dev)); + __this_cpu_dec(mirred_rec_level); + return TC_ACT_SHOT; + } + tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -277,7 +289,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (use_reinsert) { res->ingress = want_ingress; res->qstats = this_cpu_ptr(m->common.cpu_qstats); - return TC_ACT_REINSERT; + skb_tc_reinsert(skb, res); + __this_cpu_dec(mirred_rec_level); + return TC_ACT_CONSUMED; } } @@ -292,6 +306,7 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } + __this_cpu_dec(mirred_rec_level); return retval; } @@ -411,6 +426,11 @@ static void tcf_mirred_put_dev(struct net_device *dev) dev_put(dev); } +static size_t tcf_mirred_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_mirred)); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .id = TCA_ID_MIRRED, @@ -422,6 +442,7 @@ static struct tc_action_ops act_mirred_ops = { .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, + .get_fill_size = tcf_mirred_get_fill_size, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, .put_dev = tcf_mirred_put_dev, diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c new file mode 100644 index 000000000000..ca2597ce4ac9 --- /dev/null +++ b/net/sched/act_mpls.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. 
*/ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mpls.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/tc_act/tc_mpls.h> +#include <net/mpls.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <net/tc_act/tc_mpls.h> + +static unsigned int mpls_net_id; +static struct tc_action_ops act_mpls_ops; + +#define ACT_MPLS_TTL_DEFAULT 255 + +static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, + struct tcf_mpls_params *p, bool set_bos) +{ + u32 new_lse = 0; + + if (lse) + new_lse = be32_to_cpu(lse->label_stack_entry); + + if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) { + new_lse &= ~MPLS_LS_LABEL_MASK; + new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT; + } + if (p->tcfm_ttl) { + new_lse &= ~MPLS_LS_TTL_MASK; + new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT; + } + if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) { + new_lse &= ~MPLS_LS_TC_MASK; + new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT; + } + if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) { + new_lse &= ~MPLS_LS_S_MASK; + new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT; + } else if (set_bos) { + new_lse |= 1 << MPLS_LS_S_SHIFT; + } + + return cpu_to_be32(new_lse); +} + +static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_mpls *m = to_mpls(a); + struct tcf_mpls_params *p; + __be32 new_lse; + int ret; + + tcf_lastuse_update(&m->tcf_tm); + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + + /* Ensure 'data' points at mac_header prior calling mpls manipulating + * functions. + */ + if (skb_at_tc_ingress(skb)) + skb_push_rcsum(skb, skb->mac_len); + + ret = READ_ONCE(m->tcf_action); + + p = rcu_dereference_bh(m->mpls_p); + + switch (p->tcfm_action) { + case TCA_MPLS_ACT_POP: + if (skb_mpls_pop(skb, p->tcfm_proto)) + goto drop; + break; + case TCA_MPLS_ACT_PUSH: + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); + if (skb_mpls_push(skb, new_lse, p->tcfm_proto)) + goto drop; + break; + case TCA_MPLS_ACT_MODIFY: + new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); + if (skb_mpls_update_lse(skb, new_lse)) + goto drop; + break; + case TCA_MPLS_ACT_DEC_TTL: + if (skb_mpls_dec_ttl(skb)) + goto drop; + break; + } + + if (skb_at_tc_ingress(skb)) + skb_pull_rcsum(skb, skb->mac_len); + + return ret; + +drop: + qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); + return TC_ACT_SHOT; +} + +static int valid_label(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u32 *label = nla_data(attr); + + if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { + NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { + [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, + [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), + [TCA_MPLS_PROTO] = { .type = NLA_U16 }, + [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), + [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), + [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), +}; + +static int tcf_mpls_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + struct nlattr *tb[TCA_MPLS_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct 
tcf_mpls_params *p; + struct tc_mpls *parm; + bool exists = false; + struct tcf_mpls *m; + int ret = 0, err; + u8 mpls_ttl = 0; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_MPLS_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "No MPLS params"); + return -EINVAL; + } + parm = nla_data(tb[TCA_MPLS_PARMS]); + + /* Verify parameters against action type. */ + switch (parm->m_action) { + case TCA_MPLS_ACT_POP: + if (!tb[TCA_MPLS_PROTO]) { + NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop"); + return -EINVAL; + } + if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop"); + return -EINVAL; + } + if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || + tb[TCA_MPLS_BOS]) { + NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop"); + return -EINVAL; + } + break; + case TCA_MPLS_ACT_DEC_TTL: + if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] || + tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { + NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl"); + return -EINVAL; + } + break; + case TCA_MPLS_ACT_PUSH: + if (!tb[TCA_MPLS_LABEL]) { + NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); + return -EINVAL; + } + if (tb[TCA_MPLS_PROTO] && + !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) { + NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push"); + return -EPROTONOSUPPORT; + } + /* Push needs a TTL - if not specified, set a default value. */ + if (!tb[TCA_MPLS_TTL]) { +#if IS_ENABLED(CONFIG_MPLS) + mpls_ttl = net->mpls.default_ttl ? + net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT; +#else + mpls_ttl = ACT_MPLS_TTL_DEFAULT; +#endif + } + break; + case TCA_MPLS_ACT_MODIFY: + if (tb[TCA_MPLS_PROTO]) { + NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify"); + return -EINVAL; + } + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action"); + return -EINVAL; + } + + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_mpls_ops, bind, true); + if (ret) { + tcf_idr_cleanup(tn, parm->index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + m = to_mpls(*a); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto put_chain; + } + + p->tcfm_action = parm->m_action; + p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) : + ACT_MPLS_LABEL_NOT_SET; + p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) : + ACT_MPLS_TC_NOT_SET; + p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) : + mpls_ttl; + p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) : + ACT_MPLS_BOS_NOT_SET; + p->tcfm_proto = tb[TCA_MPLS_PROTO] ? 
nla_get_be16(tb[TCA_MPLS_PROTO]) : + htons(ETH_P_MPLS_UC); + + spin_lock_bh(&m->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); + spin_unlock_bh(&m->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (p) + kfree_rcu(p, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_mpls_cleanup(struct tc_action *a) +{ + struct tcf_mpls *m = to_mpls(a); + struct tcf_mpls_params *p; + + p = rcu_dereference_protected(m->mpls_p, 1); + if (p) + kfree_rcu(p, rcu); +} + +static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_mpls *m = to_mpls(a); + struct tcf_mpls_params *p; + struct tc_mpls opt = { + .index = m->tcf_index, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, + }; + struct tcf_t t; + + spin_lock_bh(&m->tcf_lock); + opt.action = m->tcf_action; + p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + opt.m_action = p->tcfm_action; + + if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET && + nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label)) + goto nla_put_failure; + + if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET && + nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc)) + goto nla_put_failure; + + if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl)) + goto nla_put_failure; + + if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET && + nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos)) + goto nla_put_failure; + + if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto)) + goto nla_put_failure; + + tcf_tm_dump(&t, &m->tcf_tm); + + if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) + goto nla_put_failure; + + spin_unlock_bh(&m->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&m->tcf_lock); + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int tcf_mpls_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_mpls_ops = { + .kind = "mpls", + .id = TCA_ID_MPLS, + .owner = THIS_MODULE, + .act = tcf_mpls_act, + .dump = tcf_mpls_dump, + .init = tcf_mpls_init, + .cleanup = tcf_mpls_cleanup, + .walk = tcf_mpls_walker, + .lookup = tcf_mpls_search, + .size = sizeof(struct tcf_mpls), +}; + +static __net_init int mpls_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tc_action_net_init(tn, &act_mpls_ops); +} + +static void __net_exit mpls_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, mpls_net_id); +} + +static struct pernet_operations mpls_net_ops = { + .init = mpls_init_net, + .exit_batch = mpls_exit_net, + .id = &mpls_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init mpls_init_module(void) +{ + return tcf_register_action(&act_mpls_ops, &mpls_net_ops); +} + +static void 
__exit mpls_cleanup_module(void) +{ + tcf_unregister_action(&act_mpls_ops, &mpls_net_ops); +} + +module_init(mpls_init_module); +module_exit(mpls_cleanup_module); + +MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPLS manipulation actions"); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ad36bbcc583e..638c1bc1ea1b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -35,6 +35,7 @@ #include <net/tc_act/tc_police.h> #include <net/tc_act/tc_sample.h> #include <net/tc_act/tc_skbedit.h> +#include <net/tc_act/tc_ct.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -672,21 +673,27 @@ static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb) kfree(indr_block_cb); } +static int tcf_block_setup(struct tcf_block *block, + struct flow_block_offload *bo); + static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, struct tc_indr_block_cb *indr_block_cb, - enum tc_block_command command) + enum flow_block_command command) { - struct tc_block_offload bo = { + struct flow_block_offload bo = { .command = command, - .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, - .block = indr_dev->block, + .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + .net = dev_net(indr_dev->dev), + .block_shared = tcf_block_shared(indr_dev->block), }; + INIT_LIST_HEAD(&bo.cb_list); if (!indr_dev->block) return; indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, &bo); + tcf_block_setup(indr_dev->block, &bo); } int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, @@ -705,7 +712,7 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, if (err) goto err_dev_put; - tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND); + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND); return 0; err_dev_put: @@ -742,7 +749,7 @@ void __tc_indr_block_cb_unregister(struct net_device *dev, return; /* Send unbind message if required to free any block cbs. */ - tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND); + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND); tc_indr_block_cb_del(indr_block_cb); tc_indr_block_dev_put(indr_dev); } @@ -759,27 +766,31 @@ EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister); static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command, + enum flow_block_command command, struct netlink_ext_ack *extack) { struct tc_indr_block_cb *indr_block_cb; struct tc_indr_block_dev *indr_dev; - struct tc_block_offload bo = { + struct flow_block_offload bo = { .command = command, .binder_type = ei->binder_type, - .block = block, + .net = dev_net(dev), + .block_shared = tcf_block_shared(block), .extack = extack, }; + INIT_LIST_HEAD(&bo.cb_list); indr_dev = tc_indr_block_dev_lookup(dev); if (!indr_dev) return; - indr_dev->block = command == TC_BLOCK_BIND ? block : NULL; + indr_dev->block = command == FLOW_BLOCK_BIND ? 
block : NULL; list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, &bo); + + tcf_block_setup(block, &bo); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -790,16 +801,24 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command, + enum flow_block_command command, struct netlink_ext_ack *extack) { - struct tc_block_offload bo = {}; + struct flow_block_offload bo = {}; + int err; + bo.net = dev_net(dev); bo.command = command; bo.binder_type = ei->binder_type; - bo.block = block; + bo.block_shared = tcf_block_shared(block); bo.extack = extack; - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + return tcf_block_setup(block, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, @@ -820,20 +839,20 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, return -EOPNOTSUPP; } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); + err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) return err; - tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); + tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) return -EOPNOTSUPP; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); + tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); return 0; } @@ -843,11 +862,11 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; - tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL); + tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); + err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -1340,17 +1359,17 @@ static void tcf_block_release(struct Qdisc *q, struct tcf_block *block, struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; - enum tcf_block_binder_type binder_type; + enum flow_block_binder_type binder_type; }; static void tcf_block_owner_netif_keep_dst(struct tcf_block *block, struct Qdisc *q, - enum tcf_block_binder_type binder_type) + enum flow_block_binder_type binder_type) { if (block->keep_dst && - binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS && - binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS && + binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) netif_keep_dst(qdisc_dev(q)); } @@ -1367,7 +1386,7 @@ EXPORT_SYMBOL(tcf_block_netif_keep_dst); static int tcf_block_owner_add(struct tcf_block *block, struct Qdisc *q, - enum tcf_block_binder_type binder_type) + enum flow_block_binder_type binder_type) { struct tcf_block_owner_item *item; @@ -1382,7 +1401,7 @@ static int tcf_block_owner_add(struct tcf_block *block, static void tcf_block_owner_del(struct tcf_block *block, struct Qdisc *q, - enum tcf_block_binder_type binder_type) + enum flow_block_binder_type binder_type) { struct 
tcf_block_owner_item *item; @@ -1494,43 +1513,6 @@ void tcf_block_put(struct tcf_block *block) EXPORT_SYMBOL(tcf_block_put); -struct tcf_block_cb { - struct list_head list; - tc_setup_cb_t *cb; - void *cb_ident; - void *cb_priv; - unsigned int refcnt; -}; - -void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) -{ - return block_cb->cb_priv; -} -EXPORT_SYMBOL(tcf_block_cb_priv); - -struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, - tc_setup_cb_t *cb, void *cb_ident) -{ struct tcf_block_cb *block_cb; - - list_for_each_entry(block_cb, &block->cb_list, list) - if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) - return block_cb; - return NULL; -} -EXPORT_SYMBOL(tcf_block_cb_lookup); - -void tcf_block_cb_incref(struct tcf_block_cb *block_cb) -{ - block_cb->refcnt++; -} -EXPORT_SYMBOL(tcf_block_cb_incref); - -unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) -{ - return --block_cb->refcnt; -} -EXPORT_SYMBOL(tcf_block_cb_decref); - static int tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_priv, bool add, bool offload_in_use, @@ -1572,66 +1554,76 @@ err_playback_remove: return err; } -struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, - tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv, - struct netlink_ext_ack *extack) +static int tcf_block_bind(struct tcf_block *block, + struct flow_block_offload *bo) { - struct tcf_block_cb *block_cb; - int err; + struct flow_block_cb *block_cb, *next; + int err, i = 0; - /* Replay any already present rules */ - err = tcf_block_playback_offloads(block, cb, cb_priv, true, - tcf_block_offload_in_use(block), - extack); - if (err) - return ERR_PTR(err); + list_for_each_entry(block_cb, &bo->cb_list, list) { + err = tcf_block_playback_offloads(block, block_cb->cb, + block_cb->cb_priv, true, + tcf_block_offload_in_use(block), + bo->extack); + if (err) + goto err_unroll; - block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); - if (!block_cb) - return ERR_PTR(-ENOMEM); - block_cb->cb = cb; - block_cb->cb_ident = cb_ident; - block_cb->cb_priv = cb_priv; - list_add(&block_cb->list, &block->cb_list); - return block_cb; -} -EXPORT_SYMBOL(__tcf_block_cb_register); + i++; + } + list_splice(&bo->cb_list, &block->cb_list); -int tcf_block_cb_register(struct tcf_block *block, - tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv, struct netlink_ext_ack *extack) -{ - struct tcf_block_cb *block_cb; + return 0; + +err_unroll: + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + if (i-- > 0) { + list_del(&block_cb->list); + tcf_block_playback_offloads(block, block_cb->cb, + block_cb->cb_priv, false, + tcf_block_offload_in_use(block), + NULL); + } + flow_block_cb_free(block_cb); + } - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, - extack); - return PTR_ERR_OR_ZERO(block_cb); + return err; } -EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block *block, - struct tcf_block_cb *block_cb) +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo) { - tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, - false, tcf_block_offload_in_use(block), - NULL); - list_del(&block_cb->list); - kfree(block_cb); + struct flow_block_cb *block_cb, *next; + + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + tcf_block_playback_offloads(block, block_cb->cb, + block_cb->cb_priv, false, + tcf_block_offload_in_use(block), + NULL); + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } 
} -EXPORT_SYMBOL(__tcf_block_cb_unregister); -void tcf_block_cb_unregister(struct tcf_block *block, - tc_setup_cb_t *cb, void *cb_ident) +static int tcf_block_setup(struct tcf_block *block, + struct flow_block_offload *bo) { - struct tcf_block_cb *block_cb; + int err; - block_cb = tcf_block_cb_lookup(block, cb, cb_ident); - if (!block_cb) - return; - __tcf_block_cb_unregister(block, block_cb); + switch (bo->command) { + case FLOW_BLOCK_BIND: + err = tcf_block_bind(block, bo); + break; + case FLOW_BLOCK_UNBIND: + err = 0; + tcf_block_unbind(block, bo); + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; } -EXPORT_SYMBOL(tcf_block_cb_unregister); /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks @@ -3155,7 +3147,7 @@ EXPORT_SYMBOL(tcf_exts_dump_stats); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop) { - struct tcf_block_cb *block_cb; + struct flow_block_cb *block_cb; int ok_count = 0; int err; @@ -3266,6 +3258,10 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->police.burst = tcf_police_tcfp_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + } else if (is_tcf_ct(act)) { + entry->id = FLOW_ACTION_CT; + entry->ct.action = tcf_ct_action(act); + entry->ct.zone = tcf_ct_zone(act); } else { goto err_out; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index eedd5786c084..38d6e85693fc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -26,8 +26,10 @@ #include <net/dst.h> #include <net/dst_metadata.h> +#include <uapi/linux/netfilter/nf_conntrack_common.h> + struct fl_flow_key { - int indev_ifindex; + struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; @@ -54,6 +56,7 @@ struct fl_flow_key { struct flow_dissector_key_enc_opts enc_opts; struct flow_dissector_key_ports tp_min; struct flow_dissector_key_ports tp_max; + struct flow_dissector_key_ct ct; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -272,24 +275,40 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, return __fl_lookup(mask, mkey); } +static u16 fl_ct_info_to_flower_map[] = { + [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_RELATED, + [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_RELATED, + [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_NEW, +}; + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct cls_fl_filter *f; - struct fl_flow_mask *mask; - struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct fl_flow_key skb_key; + struct fl_flow_mask *mask; + struct cls_fl_filter *f; list_for_each_entry_rcu(mask, &head->masks, list) { fl_clear_masked_range(&skb_key, mask); - skb_key.indev_ifindex = skb->skb_iif; + skb_flow_dissect_meta(skb, &mask->dissector, &skb_key); /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. 
*/ skb_key.basic.n_proto = skb->protocol; skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); + skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, + fl_ct_info_to_flower_map, + ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, mask); @@ -390,14 +409,14 @@ static void fl_destroy_filter_work(struct work_struct *work) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool rtnl_held, struct netlink_ext_ack *extack) { - struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + struct flow_cls_offload cls_flower = {}; if (!rtnl_held) rtnl_lock(); tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); - cls_flower.command = TC_CLSFLOWER_DESTROY; + cls_flower.command = FLOW_CLS_DESTROY; cls_flower.cookie = (unsigned long) f; tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); @@ -415,8 +434,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); - struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + struct flow_cls_offload cls_flower = {}; bool skip_sw = tc_skip_sw(f->flags); int err = 0; @@ -430,7 +449,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, } tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); - cls_flower.command = TC_CLSFLOWER_REPLACE; + cls_flower.command = FLOW_CLS_REPLACE; cls_flower.cookie = (unsigned long) f; cls_flower.rule->match.dissector = &f->mask->dissector; cls_flower.rule->match.mask = &f->mask->key; @@ -479,14 +498,14 @@ errout: static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, bool rtnl_held) { - struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + struct flow_cls_offload cls_flower = {}; if (!rtnl_held) rtnl_lock(); tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL); - cls_flower.command = TC_CLSFLOWER_STATS; + cls_flower.command = FLOW_CLS_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.classid = f->res.classid; @@ -524,24 +543,6 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle) return f; } -static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp, - unsigned long *handle) -{ - struct cls_fl_head *head = fl_head_dereference(tp); - struct cls_fl_filter *f; - - rcu_read_lock(); - while ((f = idr_get_next_ul(&head->handle_idr, handle))) { - /* don't return filters that are being deleted */ - if (refcount_inc_not_zero(&f->refcnt)) - break; - ++(*handle); - } - rcu_read_unlock(); - - return f; -} - static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) @@ -704,6 +705,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_FLOWER_KEY_CT_LABELS_MASK] = { 
.type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, }; static const struct nla_policy @@ -725,11 +736,11 @@ static void fl_set_key_val(struct nlattr **tb, { if (!tb[val_type]) return; - memcpy(val, nla_data(tb[val_type]), len); + nla_memcpy(val, tb[val_type], len); if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) memset(mask, 0xff, len); else - memcpy(mask, nla_data(tb[mask_type]), len); + nla_memcpy(mask, tb[mask_type], len); } static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, @@ -1015,21 +1026,65 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_ct(struct nlattr **tb, + struct flow_dissector_key_ct *key, + struct flow_dissector_key_ct *mask, + struct netlink_ext_ack *extack) +{ + if (tb[TCA_FLOWER_KEY_CT_STATE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { + NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, + &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, + sizeof(key->ct_state)); + } + if (tb[TCA_FLOWER_KEY_CT_ZONE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { + NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE, + &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK, + sizeof(key->ct_zone)); + } + if (tb[TCA_FLOWER_KEY_CT_MARK]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) { + NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK, + &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK, + sizeof(key->ct_mark)); + } + if (tb[TCA_FLOWER_KEY_CT_LABELS]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { + NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS, + mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK, + sizeof(key->ct_labels)); + } + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) { __be16 ethertype; int ret = 0; -#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) return err; - key->indev_ifindex = err; - mask->indev_ifindex = 0xffffffff; + key->meta.ingress_ifindex = err; + mask->meta.ingress_ifindex = 0xffffffff; } -#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, @@ -1225,6 +1280,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } + ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack); + if (ret) + return ret; + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -1282,6 +1341,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_META, meta); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1323,6 +1384,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_CT, ct); 
skb_flow_dissector_init(dissector, keys, cnt); } @@ -1691,20 +1754,25 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { + struct cls_fl_head *head = fl_head_dereference(tp); + unsigned long id = arg->cookie, tmp; struct cls_fl_filter *f; arg->count = arg->skip; - while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) { + idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) { + /* don't return filters that are being deleted */ + if (!refcount_inc_not_zero(&f->refcnt)) + continue; if (arg->fn(tp, f, arg) < 0) { __fl_put(f); arg->stop = 1; break; } __fl_put(f); - arg->cookie++; arg->count++; } + arg->cookie = id; } static struct cls_fl_filter * @@ -1735,8 +1803,8 @@ fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add) static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { - struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + struct flow_cls_offload cls_flower = {}; struct cls_fl_filter *f = NULL; int err; @@ -1757,7 +1825,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = add ? - TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + FLOW_CLS_REPLACE : FLOW_CLS_DESTROY; cls_flower.cookie = (unsigned long)f; cls_flower.rule->match.dissector = &f->mask->dissector; cls_flower.rule->match.mask = &f->mask->key; @@ -1801,7 +1869,7 @@ next_flow: static int fl_hw_create_tmplt(struct tcf_chain *chain, struct fl_flow_tmplt *tmplt) { - struct tc_cls_flower_offload cls_flower = {}; + struct flow_cls_offload cls_flower = {}; struct tcf_block *block = chain->block; cls_flower.rule = flow_rule_alloc(0); @@ -1809,7 +1877,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain, return -ENOMEM; cls_flower.common.chain_index = chain->index; - cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; + cls_flower.command = FLOW_CLS_TMPLT_CREATE; cls_flower.cookie = (unsigned long) tmplt; cls_flower.rule->match.dissector = &tmplt->dissector; cls_flower.rule->match.mask = &tmplt->mask; @@ -1827,11 +1895,11 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain, static void fl_hw_destroy_tmplt(struct tcf_chain *chain, struct fl_flow_tmplt *tmplt) { - struct tc_cls_flower_offload cls_flower = {}; + struct flow_cls_offload cls_flower = {}; struct tcf_block *block = chain->block; cls_flower.common.chain_index = chain->index; - cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; + cls_flower.command = FLOW_CLS_TMPLT_DESTROY; cls_flower.cookie = (unsigned long) tmplt; tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); @@ -2077,6 +2145,40 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_ct(struct sk_buff *skb, + struct flow_dissector_key_ct *key, + struct flow_dissector_key_ct *mask) +{ + if (IS_ENABLED(CONFIG_NF_CONNTRACK) && + fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, + &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, + sizeof(key->ct_state))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE, + &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK, + sizeof(key->ct_zone))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK, + &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK, 
+ sizeof(key->ct_mark))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS, + &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK, + sizeof(key->ct_labels))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, struct flow_dissector_key_enc_opts *enc_opts) { @@ -2123,10 +2225,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb, static int fl_dump_key(struct sk_buff *skb, struct net *net, struct fl_flow_key *key, struct fl_flow_key *mask) { - if (mask->indev_ifindex) { + if (mask->meta.ingress_ifindex) { struct net_device *dev; - dev = __dev_get_by_index(net, key->indev_ifindex); + dev = __dev_get_by_index(net, key->meta.ingress_ifindex); if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) goto nla_put_failure; } @@ -2310,6 +2412,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts)) goto nla_put_failure; + if (fl_dump_key_ct(skb, &key->ct, &mask->ct)) + goto nla_put_failure; + if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4dab833f66cb..c9496c920d6f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -8,9 +8,6 @@ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension - * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available */ #include <linux/module.h> @@ -37,9 +34,7 @@ struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; @@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, f->ifindex)) continue; -#endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; @@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, return ret; f->ifindex = ret; } -#endif /* CONFIG_NET_CLS_IND */ err = -EINVAL; if (tb[TCA_FW_MASK]) { @@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, fnew->id = f->id; fnew->res = f->res; -#ifdef CONFIG_NET_CLS_IND fnew->ifindex = f->ifindex; -#endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, @@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; } -#endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto 
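fl_dump_key_ct() above gates each attribute with IS_ENABLED() rather than preprocessor conditionals, so every branch is always parsed and type-checked while the compiler discards the ones whose Kconfig symbol is off; the CONFIG_NET_CLS_IND removals in cls_fw and cls_u32 simply make the indev match unconditional. A small sketch of the IS_ENABLED() idiom, using a made-up CONFIG_FOO symbol and attribute numbers:

    #include <linux/kconfig.h>
    #include <net/netlink.h>

    #define MY_ATTR_FOO       1 /* hypothetical netlink attribute ids */
    #define MY_ATTR_FOO_MASK  2

    static int dump_optional_key(struct sk_buff *skb, u32 key, u32 mask)
    {
            /* Always compiled; dead-code eliminated when CONFIG_FOO=n. */
            if (IS_ENABLED(CONFIG_FOO) &&
                (nla_put_u32(skb, MY_ATTR_FOO, key) ||
                 nla_put_u32(skb, MY_ATTR_FOO_MASK, mask)))
                    return -EMSGSIZE;

            return 0;
    }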
nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 38c0a9f0f296..a30d2f8feb32 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { unsigned int in_hw_count; struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; + bool deleting; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -258,7 +259,11 @@ err_exts_init: static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { - return -EOPNOTSUPP; + struct cls_mall_head *head = rtnl_dereference(tp->root); + + head->deleting = true; + *last = true; + return 0; } static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, @@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, if (arg->count < arg->skip) goto skip; - if (!head) + if (!head || head->deleting) return; if (arg->fn(tp, head, arg) < 0) arg->stop = 1; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7727de5e073..be9e46c77e8b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -20,9 +20,6 @@ * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available - * * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ @@ -48,9 +45,7 @@ struct tc_u_knode { u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; @@ -176,12 +171,10 @@ check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } -#endif #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif @@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &n->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); @@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; n->ifindex = ret; } -#endif return 0; } @@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); -#ifdef CONFIG_NET_CLS_IND new->ifindex = n->ifindex; -#endif new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; @@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } -#endif #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(sizeof(struct tc_u32_pcnt) + n->sel.nkeys * sizeof(u64), @@ -1422,9 +1409,7 @@ static int __init init_u32(void) #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif -#ifdef CONFIG_NET_CLS_IND pr_info(" input device check on\n"); -#endif #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 243fd22f2248..9fff6480acc6 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -21,6 +21,7 @@ struct em_ipt_match 
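The mall_delete() change above stops returning -EOPNOTSUPP: since matchall holds exactly one filter, deletion just flags the head as deleting and reports *last so the core can tear the classifier down, while mall_walk() ignores a head in that state. A rough sketch of that soft-delete handshake, with invented names:

    /* Hypothetical single-instance classifier state. */
    struct single_head {
            bool deleting;
    };

    static int single_delete(struct single_head *head, bool *last)
    {
            head->deleting = true;  /* walkers must now ignore this instance */
            *last = true;           /* tell the caller the classifier is empty */
            return 0;
    }

    static void single_walk(struct single_head *head,
                            void (*fn)(struct single_head *head))
    {
            if (!head || head->deleting)
                    return;
            fn(head);
    }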
{ const struct xt_match *match; u32 hook; + u8 nfproto; u8 match_data[0] __aligned(8); }; @@ -71,11 +72,25 @@ static int policy_validate_match_data(struct nlattr **tb, u8 mrev) return 0; } +static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev) +{ + if (mrev != 1) { + pr_err("only addrtype match revision 1 supported"); + return -EINVAL; + } + + return 0; +} + static const struct em_ipt_xt_match em_ipt_xt_matches[] = { { .match_name = "policy", .validate_match_data = policy_validate_match_data }, + { + .match_name = "addrtype", + .validate_match_data = addrtype_validate_match_data + }, {} }; @@ -115,6 +130,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len, struct em_ipt_match *im = NULL; struct xt_match *match; int mdata_len, ret; + u8 nfproto; ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, NULL); @@ -125,6 +141,15 @@ static int em_ipt_change(struct net *net, void *data, int data_len, !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO]) return -EINVAL; + nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]); + switch (nfproto) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + break; + default: + return -EINVAL; + } + match = get_xt_match(tb); if (IS_ERR(match)) { pr_err("unable to load match\n"); @@ -140,6 +165,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len, im->match = match; im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]); + im->nfproto = nfproto; nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len); ret = check_match(net, im, mdata_len); @@ -182,15 +208,33 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, const struct em_ipt_match *im = (const void *)em->data; struct xt_action_param acpar = {}; struct net_device *indev = NULL; + u8 nfproto = im->match->family; struct nf_hook_state state; int ret; + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + if (nfproto == NFPROTO_UNSPEC) + nfproto = NFPROTO_IPV4; + break; + case htons(ETH_P_IPV6): + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + if (nfproto == NFPROTO_UNSPEC) + nfproto = NFPROTO_IPV6; + break; + default: + return 0; + } + rcu_read_lock(); if (skb->skb_iif) indev = dev_get_by_index_rcu(em->net, skb->skb_iif); - nf_hook_state_init(&state, im->hook, im->match->family, + nf_hook_state_init(&state, im->hook, nfproto, indev ?: skb->dev, skb->dev, NULL, em->net, NULL); acpar.match = im->match; @@ -213,7 +257,7 @@ static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em) return -EMSGSIZE; if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0) return -EMSGSIZE; - if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0) + if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0) return -EMSGSIZE; if (nla_put(skb, TCA_EM_IPT_MATCH_DATA, im->match->usersize ?: im->match->matchsize, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index db0c2ba1d156..cebfb65d8556 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -22,10 +22,12 @@ #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) +#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; + bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ @@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) struct sock *sk = nskb->sk; ktime_t now; + if 
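The em_ipt hunks above record the NFPROTO requested by userspace and, for matches registered as NFPROTO_UNSPEC, choose the hook family per packet from the L3 protocol before building the nf_hook_state. A hedged sketch of that per-packet selection (pick_nfproto() is an invented helper, not part of em_ipt):

    #include <linux/if_ether.h>
    #include <linux/ip.h>
    #include <linux/ipv6.h>
    #include <linux/netfilter.h>
    #include <net/pkt_sched.h>

    /* Return the nfproto to run the match under, or NFPROTO_UNSPEC when the
     * packet is neither IPv4 nor IPv6 (caller should treat that as no match).
     */
    static u8 pick_nfproto(struct sk_buff *skb, u8 configured)
    {
            switch (tc_skb_protocol(skb)) {
            case htons(ETH_P_IP):
                    if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
                            return NFPROTO_UNSPEC;
                    return configured == NFPROTO_UNSPEC ? NFPROTO_IPV4 : configured;
            case htons(ETH_P_IPV6):
                    if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
                            return NFPROTO_UNSPEC;
                    return configured == NFPROTO_UNSPEC ? NFPROTO_IPV6 : configured;
            default:
                    return NFPROTO_UNSPEC;
            }
    }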
(q->skip_sock_check) + goto skip; + if (!sk) return false; @@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; +skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; @@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: @@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; + if (q->skip_sock_check) + opt.flags |= TC_ETF_SKIP_SOCK_CHECK; + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 0f65f617756b..bf56aa519797 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -83,7 +83,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); - q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; @@ -114,6 +114,7 @@ nla_put_failure: } static const struct Qdisc_class_ops ingress_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = ingress_find, .walk = ingress_walk, @@ -216,7 +217,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); - q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; @@ -227,7 +228,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); - q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; @@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch) } static const struct Qdisc_class_ops clsact_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = clsact_find, .walk = ingress_walk, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ecfb8f5902a..388750ddc57a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -21,12 +21,17 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> +#include <net/sock.h> +#include <net/tcp.h> static LIST_HEAD(taprio_list); static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 +#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) +#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) + struct sched_entry { struct list_head list; @@ -35,6 +40,7 @@ struct sched_entry { * packet leaves after this time. 
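taprio's new FLAGS_VALID()/TXTIME_ASSIST_IS_ENABLED() macros above follow the usual netlink flags discipline: reject unknown bits, and refuse to change the flags of an already-configured instance. A self-contained sketch of that validation, with invented flag names:

    #include <linux/bits.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    #define MY_FLAG_TXTIME_ASSIST  BIT(0) /* hypothetical flag bits */
    #define MY_FLAG_RESERVED       BIT(1)
    #define MY_KNOWN_FLAGS         (MY_FLAG_TXTIME_ASSIST | MY_FLAG_RESERVED)

    static int set_flags(u32 requested, u32 *current_flags)
    {
            if (requested & ~MY_KNOWN_FLAGS)
                    return -EINVAL;         /* unknown bit requested */
            if (*current_flags && *current_flags != requested)
                    return -EOPNOTSUPP;     /* no changes on a live instance */

            *current_flags = requested;
            return 0;
    }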
*/ ktime_t close_time; + ktime_t next_txtime; atomic_t budget; int index; u32 gate_mask; @@ -55,6 +61,8 @@ struct sched_gate_list { struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; + u32 flags; + enum tk_offsets tk_offset; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte @@ -65,9 +73,9 @@ struct taprio_sched { struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; - ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; + int txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -78,6 +86,20 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } +static ktime_t taprio_get_time(struct taprio_sched *q) +{ + ktime_t mono = ktime_get(); + + switch (q->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, q->tk_offset); + } + + return KTIME_MAX; +} + static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); @@ -108,20 +130,263 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } -static ktime_t get_cycle_time(struct sched_gate_list *sched) +/* Get how much time has been already elapsed in the current cycle. */ +static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) +{ + ktime_t time_since_sched_start; + s32 time_elapsed; + + time_since_sched_start = ktime_sub(time, sched->base_time); + div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); + + return time_elapsed; +} + +static ktime_t get_interval_end_time(struct sched_gate_list *sched, + struct sched_gate_list *admin, + struct sched_entry *entry, + ktime_t intv_start) +{ + s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); + ktime_t intv_end, cycle_ext_end, cycle_end; + + cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); + intv_end = ktime_add_ns(intv_start, entry->interval); + cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); + + if (ktime_before(intv_end, cycle_end)) + return intv_end; + else if (admin && admin != sched && + ktime_after(admin->base_time, cycle_end) && + ktime_before(admin->base_time, cycle_ext_end)) + return admin->base_time; + else + return cycle_end; +} + +static int length_to_duration(struct taprio_sched *q, int len) +{ + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +/* Returns the entry corresponding to next available interval. If + * validate_interval is set, it only validates whether the timestamp occurs + * when the gate corresponding to the skb's traffic class is open. 
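length_to_duration() above converts a packet length into wire time from the cached picoseconds-per-byte figure: duration_ns = len * picos_per_byte / 1000. At 1 Gb/s one byte takes 8 ns (8000 ps), so a 1500-byte frame occupies its gate for roughly 12 us. The same arithmetic as a stand-alone, compilable sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Wire time in ns for len bytes at a link speed expressed as
     * picoseconds per byte (8000 ps/byte at 1 Gb/s).
     */
    static uint64_t length_to_duration_ns(uint64_t len, uint64_t picos_per_byte)
    {
            return len * picos_per_byte / 1000;
    }

    int main(void)
    {
            /* 1500 * 8000 / 1000 = 12000 ns */
            printf("%llu\n",
                   (unsigned long long)length_to_duration_ns(1500, 8000));
            return 0;
    }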
+ */ +static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, + struct Qdisc *sch, + struct sched_gate_list *sched, + struct sched_gate_list *admin, + ktime_t time, + ktime_t *interval_start, + ktime_t *interval_end, + bool validate_interval) +{ + ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; + ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; + struct sched_entry *entry = NULL, *entry_found = NULL; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool entry_available = false; + s32 cycle_elapsed; + int tc, n; + + tc = netdev_get_prio_tc_map(dev, skb->priority); + packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); + + *interval_start = 0; + *interval_end = 0; + + if (!sched) + return NULL; + + cycle = sched->cycle_time; + cycle_elapsed = get_cycle_time_elapsed(sched, time); + curr_intv_end = ktime_sub_ns(time, cycle_elapsed); + cycle_end = ktime_add_ns(curr_intv_end, cycle); + + list_for_each_entry(entry, &sched->entries, list) { + curr_intv_start = curr_intv_end; + curr_intv_end = get_interval_end_time(sched, admin, entry, + curr_intv_start); + + if (ktime_after(curr_intv_start, cycle_end)) + break; + + if (!(entry->gate_mask & BIT(tc)) || + packet_transmit_time > entry->interval) + continue; + + txtime = entry->next_txtime; + + if (ktime_before(txtime, time) || validate_interval) { + transmit_end_time = ktime_add_ns(time, packet_transmit_time); + if ((ktime_before(curr_intv_start, time) && + ktime_before(transmit_end_time, curr_intv_end)) || + (ktime_after(curr_intv_start, time) && !validate_interval)) { + entry_found = entry; + *interval_start = curr_intv_start; + *interval_end = curr_intv_end; + break; + } else if (!entry_available && !validate_interval) { + /* Here, we are just trying to find out the + * first available interval in the next cycle. + */ + entry_available = 1; + entry_found = entry; + *interval_start = ktime_add_ns(curr_intv_start, cycle); + *interval_end = ktime_add_ns(curr_intv_end, cycle); + } + } else if (ktime_before(txtime, earliest_txtime) && + !entry_available) { + earliest_txtime = txtime; + entry_found = entry; + n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); + *interval_start = ktime_add(curr_intv_start, n * cycle); + *interval_end = ktime_add(curr_intv_end, n * cycle); + } + } + + return entry_found; +} + +static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t interval_start, interval_end; struct sched_entry *entry; - ktime_t cycle = 0; - if (sched->cycle_time != 0) - return sched->cycle_time; + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + + entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, + &interval_start, &interval_end, true); + rcu_read_unlock(); - list_for_each_entry(entry, &sched->entries, list) - cycle = ktime_add_ns(cycle, entry->interval); + return entry; +} - sched->cycle_time = cycle; +/* This returns the tstamp value set by TCP in terms of the set clock. 
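When an entry's cached next_txtime already lies several cycles ahead, find_entry_to_transmit() above shifts the entry's interval forward by a whole number of cycles in one step (the div_s64() branch) instead of iterating cycle by cycle. A hedged sketch of that projection (helper name invented):

    #include <linux/ktime.h>
    #include <linux/math64.h>

    /* Shift [*intv_start, *intv_end) forward by whole cycles so that the
     * window is the occurrence that txtime falls into (or follows).
     */
    static void project_interval(ktime_t txtime, s64 cycle_ns,
                                 ktime_t *intv_start, ktime_t *intv_end)
    {
            s64 n = div64_s64(ktime_sub(txtime, *intv_start), cycle_ns);

            *intv_start = ktime_add(*intv_start, n * cycle_ns);
            *intv_end = ktime_add(*intv_end, n * cycle_ns);
    }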
*/ +static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; - return cycle; + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + if (!ipv6h) + return 0; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return 0; + } else if (iph->protocol != IPPROTO_TCP) { + return 0; + } + } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { + return 0; + } + + return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); +} + +/* There are a few scenarios where we will have to modify the txtime from + * what is read from next_txtime in sched_entry. They are: + * 1. If txtime is in the past, + * a. The gate for the traffic class is currently open and packet can be + * transmitted before it closes, schedule the packet right away. + * b. If the gate corresponding to the traffic class is going to open later + * in the cycle, set the txtime of packet to the interval start. + * 2. If txtime is in the future, there are packets corresponding to the + * current traffic class waiting to be transmitted. So, the following + * possibilities exist: + * a. We can transmit the packet before the window containing the txtime + * closes. + * b. The window might close before the transmission can be completed + * successfully. So, schedule the packet in the next open window. + */ +static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) +{ + ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t minimum_time, now, txtime; + int len, packet_transmit_time; + struct sched_entry *entry; + bool sched_changed; + + now = taprio_get_time(q); + minimum_time = ktime_add_ns(now, q->txtime_delay); + + tcp_tstamp = get_tcp_tstamp(q, skb); + minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); + + rcu_read_lock(); + admin = rcu_dereference(q->admin_sched); + sched = rcu_dereference(q->oper_sched); + if (admin && ktime_after(minimum_time, admin->base_time)) + switch_schedules(q, &admin, &sched); + + /* Until the schedule starts, all the queues are open */ + if (!sched || ktime_before(minimum_time, sched->base_time)) { + txtime = minimum_time; + goto done; + } + + len = qdisc_pkt_len(skb); + packet_transmit_time = length_to_duration(q, len); + + do { + sched_changed = 0; + + entry = find_entry_to_transmit(skb, sch, sched, admin, + minimum_time, + &interval_start, &interval_end, + false); + if (!entry) { + txtime = 0; + goto done; + } + + txtime = entry->next_txtime; + txtime = max_t(ktime_t, txtime, minimum_time); + txtime = max_t(ktime_t, txtime, interval_start); + + if (admin && admin != sched && + ktime_after(txtime, admin->base_time)) { + sched = admin; + sched_changed = 1; + continue; + } + + transmit_end_time = ktime_add(txtime, packet_transmit_time); + minimum_time = transmit_end_time; + + /* Update the txtime of current entry to the next time it's + * interval starts. 
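get_tcp_tstamp() above must confirm the payload is TCP before trusting skb->skb_mstamp_ns, and it branches on the IP version by reading one ipv6hdr-sized header: the IPv6 version field overlays the IPv4 version nibble, so a single skb_header_pointer() read serves both families. A simplified sketch (no 6in4 handling), under the assumption that enough header bytes are available to skb_header_pointer():

    #include <linux/ip.h>
    #include <linux/ipv6.h>
    #include <linux/skbuff.h>

    /* Rough check: is the L4 protocol TCP? Simplified, no tunnel handling. */
    static bool l4_is_tcp(const struct sk_buff *skb)
    {
            unsigned int offset = skb_network_offset(skb);
            const struct ipv6hdr *ip6h;
            const struct iphdr *iph;
            struct ipv6hdr _ip6h;

            ip6h = skb_header_pointer(skb, offset, sizeof(_ip6h), &_ip6h);
            if (!ip6h)
                    return false;

            if (ip6h->version == 6)
                    return ip6h->nexthdr == IPPROTO_TCP;

            iph = (const struct iphdr *)ip6h;       /* version nibble overlays */
            return iph->version == 4 && iph->protocol == IPPROTO_TCP;
    }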
+ */ + if (ktime_after(transmit_end_time, interval_end)) + entry->next_txtime = ktime_add(interval_start, sched->cycle_time); + } while (sched_changed || ktime_after(transmit_end_time, interval_end)); + + entry->next_txtime = transmit_end_time; + +done: + rcu_read_unlock(); + return txtime; } static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -137,6 +402,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); + if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { + if (!is_valid_interval(skb, sch)) + return qdisc_drop(skb, sch, to_free); + } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb->tstamp = get_packet_txtime(skb, sch); + if (!skb->tstamp) + return qdisc_drop(skb, sch, to_free); + } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -172,6 +446,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) if (!skb) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + return skb; + prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); @@ -184,11 +461,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } -static inline int length_to_duration(struct taprio_sched *q, int len) -{ - return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -232,6 +504,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!child)) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb = child->ops->dequeue(child); + if (!skb) + continue; + goto skb_found; + } + skb = child->ops->peek(child); if (!skb) continue; @@ -243,7 +522,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) continue; len = qdisc_pkt_len(skb); - guard = ktime_add_ns(q->get_time(), + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no @@ -262,6 +541,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto done; +skb_found: qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -524,12 +804,22 @@ static int parse_taprio_schedule(struct nlattr **tb, if (err < 0) return err; + if (!new->cycle_time) { + struct sched_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &new->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + new->cycle_time = cycle; + } + return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + u32 taprio_flags) { int i, j; @@ -577,6 +867,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return -EINVAL; } + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) + continue; + /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (last > qopt->offset[j]) { @@ -598,14 +891,14 @@ static int taprio_get_start_time(struct Qdisc *sch, s64 n; base = sched_base_time(sched); - now = q->get_time(); + now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. 
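parse_taprio_schedule() above now fills in a missing cycle time once at configuration time: it is simply the sum of all entry intervals, which the removed get_cycle_time() used to recompute on demand. A trivial sketch of that default:

    #include <linux/types.h>

    /* Default cycle time (ns) when the user supplied none: the schedule
     * repeats once every gate interval has elapsed.
     */
    static s64 default_cycle_time(const u32 *intervals, int n_entries)
    {
            s64 cycle = 0;
            int i;

            for (i = 0; i < n_entries; i++)
                    cycle += intervals[i];

            return cycle;
    }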
Thus if the cycle time is zero, @@ -632,7 +925,7 @@ static void setup_first_close_time(struct taprio_sched *q, first = list_first_entry(&sched->entries, struct sched_entry, list); - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_close_time = ktime_add_ns(base, cycle); @@ -707,6 +1000,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; } +static void setup_txtime(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) +{ + struct sched_entry *entry; + u32 interval = 0; + + list_for_each_entry(entry, &sched->entries, list) { + entry->next_txtime = ktime_add_ns(base, interval); + interval += entry->interval; + } +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -715,6 +1020,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; + u32 taprio_flags = 0; int i, err, clockid; unsigned long flags; ktime_t start; @@ -727,7 +1033,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (tb[TCA_TAPRIO_ATTR_FLAGS]) { + taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); + + if (q->flags != 0 && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } else if (!FLAGS_VALID(taprio_flags)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + q->flags = taprio_flags; + } + + err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); if (err < 0) return err; @@ -786,7 +1106,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); - if (!hrtimer_active(&q->advance_timer)) { + if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { + if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { + NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); + err = -EINVAL; + goto unlock; + } + + q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + } + + if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } @@ -806,16 +1137,16 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, switch (q->clockid) { case CLOCK_REALTIME: - q->get_time = ktime_get_real; + q->tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->get_time = ktime_get; + q->tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->get_time = ktime_get_boottime; + q->tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->get_time = ktime_get_clocktai; + q->tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); @@ -829,20 +1160,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - setup_first_close_time(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { + setup_txtime(q, new_admin, start); - /* Protects against advance_sched() */ - spin_lock_irqsave(&q->current_entry_lock, flags); + if (!oper) { + rcu_assign_pointer(q->oper_sched, new_admin); + err = 0; + new_admin = NULL; + goto unlock; + } - 
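Because txtime-assist mode compares schedule times against skb->tstamp values produced on other clock bases, taprio above replaces the cached ktime_get_*() function pointer with a tk_offsets value and converts from CLOCK_MONOTONIC on demand (TK_OFFS_MAX marks "no conversion needed"). A minimal sketch of that mapping:

    #include <linux/errno.h>
    #include <linux/ktime.h>
    #include <linux/timekeeping.h>

    static int clockid_to_tk_offset(clockid_t clockid, enum tk_offsets *offs)
    {
            switch (clockid) {
            case CLOCK_REALTIME:    *offs = TK_OFFS_REAL;   return 0;
            case CLOCK_MONOTONIC:   *offs = TK_OFFS_MAX;    return 0;
            case CLOCK_BOOTTIME:    *offs = TK_OFFS_BOOT;   return 0;
            case CLOCK_TAI:         *offs = TK_OFFS_TAI;    return 0;
            default:                return -EINVAL;
            }
    }

    static ktime_t read_sched_clock(enum tk_offsets offs)
    {
            ktime_t mono = ktime_get();

            return offs == TK_OFFS_MAX ? mono : ktime_mono_to_any(mono, offs);
    }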
taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + } else { + setup_first_close_time(q, new_admin, start); - rcu_assign_pointer(q->admin_sched, new_admin); - if (admin) - call_rcu(&admin->rcu, taprio_free_sched_cb); - new_admin = NULL; + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); - spin_unlock_irqrestore(&q->current_entry_lock, flags); + taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + } + + new_admin = NULL; err = 0; unlock: @@ -1080,6 +1426,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; + if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) + goto options_error; + + if (q->txtime_delay && + nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1999237ce481..5010cce52c93 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -261,8 +261,6 @@ static struct sctp_association *sctp_association_init( goto stream_free; asoc->active_key_id = ep->active_key_id; - asoc->prsctp_enable = ep->prsctp_enable; - asoc->reconf_enable = ep->reconf_enable; asoc->strreset_enable = ep->strreset_enable; /* Save the hmacs and chunks list into this association */ diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index f54333cbbe0f..53bc61537f44 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -393,24 +393,19 @@ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; - int state = -1; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) - return state; + return -1; - rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; - if (af->cmp_addr(&laddr->a, addr)) { - state = laddr->state; - break; - } + if (af->cmp_addr(&laddr->a, addr)) + return laddr->state; } - rcu_read_unlock(); - return state; + return -1; } /* Find the first address in the bind address list that is not present in diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 64e0a594a651..e5f2fc726a98 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -253,7 +253,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) goto out; fl6_sock_release(flowlabel); } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 2cae7440349c..74847d613835 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -94,11 +94,6 @@ static const struct net_offload sctp6_offload = { }, }; -static const struct skb_checksum_ops crc32c_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; - int __init sctp_offload_init(void) { int ret; @@ -111,7 +106,7 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &crc32c_csum_ops; + crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: diff --git a/net/sctp/output.c b/net/sctp/output.c index e0c27477788d..dbda7e7927fd 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -282,6 +282,9 @@ static enum 
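The one-line sctp_v6_get_dst() fix above tracks an API change in this cycle: fl6_sock_lookup() reports failure with an ERR_PTR-encoded error rather than NULL, so callers must test with IS_ERR(). A hedged sketch of the calling convention (helper name invented):

    #include <linux/err.h>
    #include <net/ipv6.h>

    /* Does the socket already hold the given flow label? */
    static bool flowlabel_in_use(struct sock *sk, __be32 flowlabel)
    {
            struct ip6_flowlabel *fl = fl6_sock_lookup(sk, flowlabel);

            if (IS_ERR(fl))         /* lookup failures are ERR_PTRs, not NULL */
                    return false;

            fl6_sock_release(fl);
            return true;
    }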
sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, sctp_chunk_free(sack); goto out; } + SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_MIB_OUTCTRLCHUNKS); + asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (del_timer(timer)) sctp_association_put(asoc); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 23af232c0a25..2d47adcb4cbe 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -81,7 +81,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, return; } - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9b0e5b0d701a..ed39396b9bba 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -247,7 +247,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); - if (asoc->prsctp_enable) + if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: @@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_ext += 2; } - if (asoc->reconf_enable) { + if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } @@ -269,7 +269,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); - if (sp->strm_interleave) { + if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } @@ -348,7 +348,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_addto_param(retval, num_ext, extensions); } - if (asoc->prsctp_enable) + if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { @@ -438,7 +438,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); - if (asoc->intl_enable) { + if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } @@ -2007,12 +2007,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: - if (asoc->reconf_enable && - !asoc->peer.reconf_capable) + if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: - if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) + if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: @@ -2028,8 +2027,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc, asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: - if (sctp_sk(asoc->base.sk)->strm_interleave) - asoc->intl_enable = 1; + if (asoc->ep->intl_enable) + asoc->peer.intl_capable = 1; break; default: break; @@ -2637,7 +2636,7 @@ do_addr_param: break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (asoc->prsctp_enable) { + if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 39ea0a37af09..aa80cda36581 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1913,7 +1913,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (err) goto err; - if (sp->strm_interleave) { + if (asoc->ep->intl_enable) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); if (err) { @@ -3581,7 
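The sctp_v4_copy_addrlist() hunk above (and the smc_clc hunks further down) convert open-coded walks of in_dev->ifa_list and the old for_ifa()/endfor_ifa() pair to the new in_dev_for_each_ifa_rcu() iterator, which makes the required RCU context explicit. A sketch of its use, assuming the caller already holds rcu_read_lock():

    #include <linux/inetdevice.h>
    #include <linux/netdevice.h>

    /* Count the IPv4 addresses on a device; caller holds rcu_read_lock(). */
    static int count_ipv4_addrs(struct net_device *dev)
    {
            struct in_device *in_dev = __in_dev_get_rcu(dev);
            const struct in_ifaddr *ifa;
            int n = 0;

            if (!in_dev)
                    return 0;

            in_dev_for_each_ifa_rcu(ifa, in_dev)
                    n++;

            return n;
    }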
+3581,7 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, sctp_sk(sk)->frag_interleave = !!val; if (!sctp_sk(sk)->frag_interleave) - sctp_sk(sk)->strm_interleave = 0; + sctp_sk(sk)->ep->intl_enable = 0; return 0; } @@ -4226,10 +4226,7 @@ static int sctp_setsockopt_reconfig_supported(struct sock *sk, sctp_style(sk, UDP)) goto out; - if (asoc) - asoc->reconf_enable = !!params.assoc_value; - else - sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value; + sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value; retval = 0; @@ -4487,7 +4484,7 @@ static int sctp_setsockopt_interleaving_supported(struct sock *sk, goto out; } - sp->strm_interleave = !!params.assoc_value; + sp->ep->intl_enable = !!params.assoc_value; retval = 0; @@ -4816,35 +4813,17 @@ out_nounlock: static int sctp_connect(struct sock *sk, struct sockaddr *addr, int addr_len, int flags) { - struct inet_sock *inet = inet_sk(sk); struct sctp_af *af; - int err = 0; + int err = -EINVAL; lock_sock(sk); - pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); - /* We may need to bind the socket. */ - if (!inet->inet_num) { - if (sk->sk_prot->get_port(sk, 0)) { - release_sock(sk); - return -EAGAIN; - } - inet->inet_sport = htons(inet->inet_num); - } - /* Validate addr_len before calling common connect/connectx routine. */ - af = addr_len < offsetofend(struct sockaddr, sa_family) ? NULL : - sctp_get_af_specific(addr->sa_family); - if (!af || addr_len < af->sockaddr_len) { - err = -EINVAL; - } else { - /* Pass correct addr len to common routine (so it knows there - * is only one address being passed. - */ + af = sctp_get_af_specific(addr->sa_family); + if (af && addr_len >= af->sockaddr_len) err = __sctp_connect(sk, addr, af->sockaddr_len, flags, NULL); - } release_sock(sk); return err; @@ -7346,7 +7325,7 @@ static int sctp_getsockopt_pr_supported(struct sock *sk, int len, goto out; } - params.assoc_value = asoc ? asoc->prsctp_enable + params.assoc_value = asoc ? asoc->peer.prsctp_capable : sctp_sk(sk)->ep->prsctp_enable; if (put_user(len, optlen)) @@ -7554,7 +7533,7 @@ static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len, goto out; } - params.assoc_value = asoc ? asoc->reconf_enable + params.assoc_value = asoc ? asoc->peer.reconf_capable : sctp_sk(sk)->ep->reconf_enable; if (put_user(len, optlen)) @@ -7713,8 +7692,8 @@ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, goto out; } - params.assoc_value = asoc ? asoc->intl_enable - : sctp_sk(sk)->strm_interleave; + params.assoc_value = asoc ? 
asoc->peer.intl_capable + : sctp_sk(sk)->ep->intl_enable; if (put_user(len, optlen)) goto out; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 93ed07877337..25946604af85 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -153,13 +153,20 @@ out: int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; + int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; - return sctp_sched_init_sid(stream, sid, GFP_KERNEL); + ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); + if (ret) { + kfree(SCTP_SO(stream, sid)->ext); + SCTP_SO(stream, sid)->ext = NULL; + } + + return ret; } void sctp_stream_free(struct sctp_stream *stream) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index afbf1223d91c..40c40be23fcb 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -1358,6 +1358,6 @@ void sctp_stream_interleave_init(struct sctp_stream *stream) struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); - stream->si = asoc->intl_enable ? &sctp_stream_interleave_1 - : &sctp_stream_interleave_0; + stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 + : &sctp_stream_interleave_0; } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index b8fa7ab3e394..99e5f69fbb74 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -228,7 +228,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && - !q->asoc->intl_enable) { + !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7621ec2f539c..302e355f2ebc 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -123,30 +123,11 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); -static int smc_release(struct socket *sock) +static int __smc_release(struct smc_sock *smc) { - struct sock *sk = sock->sk; - struct smc_sock *smc; + struct sock *sk = &smc->sk; int rc = 0; - if (!sk) - goto out; - - smc = smc_sk(sk); - - /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) - tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); - - if (sk->sk_state == SMC_LISTEN) - /* smc_close_non_accepted() is called and acquires - * sock lock for child sockets again - */ - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - else - lock_sock(sk); - if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); @@ -174,6 +155,35 @@ static int smc_release(struct socket *sock) smc_conn_free(&smc->conn); } + return rc; +} + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + flush_work(&smc->connect_work); + + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + rc = __smc_release(smc); + /* detach socket */ sock_orphan(sk); sock->sk = NULL; @@ -964,26 +974,7 @@ void smc_close_non_accepted(struct sock *sk) if (!sk->sk_lingertime) 
/* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) { - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } - sk->sk_prot->unhash(sk); - if (smc->clcsock) { - struct socket *tcp; - - tcp = smc->clcsock; - smc->clcsock = NULL; - sock_release(tcp); - } - if (smc->use_fallback) { - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - } else { - if (sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); - } + __smc_release(smc); release_sock(sk); sock_put(sk); /* final sock_put */ } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 745afd82f281..49bcebff6378 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } @@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } diff --git a/net/socket.c b/net/socket.c index 38eec1583f6d..16449d6daeca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -103,13 +103,6 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> -/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */ -#if IS_ENABLED(CONFIG_INET) -#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) -#else -#define INDIRECT_CALL_INET4(f, f1, ...) 
f(__VA_ARGS__) -#endif - #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -241,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; - struct socket_wq *wq; ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) { - kmem_cache_free(sock_inode_cachep, ei); - return NULL; - } - init_waitqueue_head(&wq->wait); - wq->fasync_list = NULL; - wq->flags = 0; - ei->socket.wq = wq; + init_waitqueue_head(&ei->socket.wq.wait); + ei->socket.wq.fasync_list = NULL; + ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -265,12 +251,11 @@ static struct inode *sock_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void sock_destroy_inode(struct inode *inode) +static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); - kfree_rcu(ei->socket.wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } @@ -295,7 +280,7 @@ static void init_inodecache(void) static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, - .destroy_inode = sock_destroy_inode, + .free_inode = sock_free_inode, .statfs = simple_statfs, }; @@ -429,7 +414,7 @@ static int sock_map_fd(struct socket *sock, int flags) } newfile = sock_alloc_file(sock, flags, NULL); - if (likely(!IS_ERR(newfile))) { + if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } @@ -606,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (sock->wq->fasync_list) + if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -641,10 +626,13 @@ EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); +INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, + size_t)); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock, - msg, msg_data_left(msg)); + int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + inet_sendmsg, sock, msg, + msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -870,12 +858,15 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, - size_t , int )); + size_t, int)); +INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, + size_t, int)); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg, - msg_data_left(msg), flags); + return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, msg_data_left(msg), + flags); } /** @@ -1289,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; - struct socket_wq *wq; + struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); - wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) @@ -2051,6 +2041,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static 
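sock_sendmsg_nosec()/sock_recvmsg_nosec() above drop the local INDIRECT_CALL_INET4 wrapper in favour of the generic variant that also covers the IPv6 handlers; these wrappers avoid retpoline-expensive indirect calls by comparing the function pointer against the most likely targets first. A simplified stand-in macro, illustrating the mechanism only (not the kernel's actual definition):

    /* If the pointer matches one of the expected targets, call it directly
     * (cheap even with retpolines); otherwise fall back to an indirect call.
     */
    #define CALL_LIKELY2(ptr, f2, f1, ...)                          \
            ((ptr) == (f2) ? f2(__VA_ARGS__) :                      \
             (ptr) == (f1) ? f1(__VA_ARGS__) : (ptr)(__VA_ARGS__))

    static int handler_a(int x) { return x + 1; }
    static int handler_b(int x) { return x + 2; }

    static int dispatch(int (*fn)(int), int x)
    {
            return CALL_LIKELY2(fn, handler_b, handler_a, x);
    }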
int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2055,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2079,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2106,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2114,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2124,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index f64f36a83063..b3815c1e8f2e 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -157,18 +157,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, return 0; } - skb = alloc_skb(0, GFP_ATOMIC); + skb = alloc_skb_for_msg(head); if (!skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *_strp_msg(skb) = *_strp_msg(head); + strp->skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; strp->skb_head = skb; head = skb; } else { diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6c997d4a6218..1336f3cdad38 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; @@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? 
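The __sys_setsockopt()/__sys_getsockopt() hunks above wire in the new cgroup BPF sockopt hooks: a negative return rejects the call, a positive return means a BPF program handled it entirely, and zero lets the normal handler run, possibly on a value the program rewrote into a kernel buffer (hence the temporary set_fs(KERNEL_DS)). A condensed sketch of the caller-side handling; maybe_run_setsockopt_progs() is an invented wrapper, and the macro call mirrors the hunk above:

    #include <linux/bpf-cgroup.h>
    #include <linux/uaccess.h>

    /* Returns <0 to reject, >0 when BPF consumed the call, 0 to continue.
     * On 0, *optvalp may point at a BPF-rewritten kernel buffer with the
     * address limit lifted; the caller restores *oldfs and frees
     * *kernel_optval afterwards.
     */
    static int maybe_run_setsockopt_progs(struct sock *sk, int *level,
                                          int *optname, char __user **optvalp,
                                          int *optlen, char **kernel_optval,
                                          mm_segment_t *oldfs)
    {
            int err;

            *oldfs = get_fs();
            err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sk, level, optname, *optvalp,
                                                 optlen, kernel_optval);
            if (err)
                    return err;

            if (*kernel_optval) {
                    set_fs(KERNEL_DS);
                    *optvalp = (char __user __force *)*kernel_optval;
            }
            return 0;
    }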
*/ diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2bed6589f41e..a809c0ec8d15 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -62,7 +62,7 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); - return rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + return rcu_dereference(tn->bearer_list[bearer_id]); } static void bearer_disable(struct net *net, struct tipc_bearer *b); @@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_add_dest(b->disc); rcu_read_unlock(); @@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_remove_dest(b->disc); rcu_read_unlock(); @@ -444,7 +444,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, struct net_device *dev; int delta; - dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); + dev = (struct net_device *)rcu_dereference(b->media_ptr); if (!dev) return 0; @@ -481,7 +481,7 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]); + b = rcu_dereference(tipc_net(net)->bearer_list[bearer_id]); if (b) mtu = b->mtu; rcu_read_unlock(); @@ -574,8 +574,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(dev->tipc_ptr) ?: - rcu_dereference_rtnl(orig_dev->tipc_ptr); + b = rcu_dereference(dev->tipc_ptr) ?: + rcu_dereference(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); diff --git a/net/tipc/link.c b/net/tipc/link.c index 2050fd386642..66d3a07bc571 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -107,7 +107,6 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @prev_from: sequence number of most previous retransmission request - * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. 
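The tipc_bearer hunks above narrow rcu_dereference_rtnl(), which is satisfied by holding either RTNL or an RCU read-side lock, down to plain rcu_dereference() on paths that always run inside rcu_read_lock(), so lockdep checks the condition that actually matters. A minimal sketch of the two accessors:

    #include <linux/rcupdate.h>
    #include <linux/rtnetlink.h>

    struct cfg { int value; };
    static struct cfg __rcu *active_cfg;

    /* Fast path: always called under rcu_read_lock(). */
    static int cfg_value_fast(void)
    {
            struct cfg *c = rcu_dereference(active_cfg);

            return c ? c->value : 0;
    }

    /* Control path: may hold only RTNL instead of an RCU read section. */
    static int cfg_value_ctl(void)
    {
            struct cfg *c = rcu_dereference_rtnl(active_cfg);

            return c ? c->value : 0;
    }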
@@ -167,7 +166,6 @@ struct tipc_link { u16 snd_nxt; u16 prev_from; u16 window; - u16 stale_cnt; unsigned long stale_limit; /* Reception */ @@ -209,7 +207,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ +#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) /* @@ -249,9 +247,9 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); /* * Simple non-static link routines (i.e. referenced outside this file) @@ -734,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l) if (msg_user(msg) == MSG_FRAGMENTER) { if (msg_type(msg) != FIRST_FRAGMENT) return; - length = msg_size(msg_get_wrapped(msg)); + length = msg_size(msg_inner_hdr(msg)); } l->stats.msg_lengths_total += length; l->stats.msg_length_counts++; @@ -910,7 +908,6 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -979,8 +976,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = - jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; @@ -1030,7 +1026,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; @@ -1044,32 +1040,68 @@ static void tipc_link_advance_backlog(struct tipc_link *l, l->snd_nxt = seqno; } -static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) +/** + * link_retransmit_failure() - Detect repeated retransmit failures + * @l: tipc link sender + * @r: tipc link receiver (= l in case of unicast) + * @from: seqno of the 1st packet in retransmit request + * @rc: returned code + * + * Return: true if the repeated retransmit failures happens, otherwise + * false + */ +static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, + u16 from, int *rc) { - struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + + if (!skb) + return false; + hdr = buf_msg(skb); + + /* Detect repeated retransmit failures on same packet */ + if (r->prev_from != from) { + r->prev_from = from; + r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + } else if (time_after(jiffies, r->stale_limit)) { + pr_warn("Retransmission failure on link <%s>\n", l->name); + link_print(l, "State of link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(hdr), msg_type(hdr), msg_size(hdr), + msg_errcode(hdr)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(hdr), msg_prevnode(hdr), 
msg_orignode(hdr)); + + trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); + + if (link_is_bc_sndlink(l)) + *rc = TIPC_LINK_DOWN_EVT; + + *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + return true; + } - pr_warn("Retransmission failure on link <%s>\n", l->name); - link_print(l, "State of link "); - pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", - msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); - pr_info("sqno %u, prev: %x, src: %x\n", - msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); + return false; } -/* tipc_link_retrans() - retransmit one or more packets +/* tipc_link_bc_retrans() - retransmit zero or more packets * @l: the link to transmit on * @r: the receiving link ordering the retransmit. Same as l if unicast * @from: retransmit from (inclusive) this sequence number * @to: retransmit to (inclusive) this sequence number * xmitq: queue for accumulating the retransmitted packets */ -static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; + int rc = 0; if (!skb) return 0; @@ -1077,20 +1109,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, return 0; trace_tipc_link_retrans(r, from, to, &l->transmq); - /* Detect repeated retransmit failures on same packet */ - if (r->prev_from != from) { - r->prev_from = from; - r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); - r->stale_cnt = 0; - } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { - link_retransmit_failure(l, skb); - trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); - trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); - trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); - if (link_is_bc_sndlink(l)) - return TIPC_LINK_DOWN_EVT; - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - } + + if (link_retransmit_failure(l, r, from, &rc)) + return rc; skb_queue_walk(&l->transmq, skb) { hdr = buf_msg(skb); @@ -1101,9 +1122,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; } - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1324,17 +1345,23 @@ exit: * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * + * In case of a repeated retransmit failures, the call will return shortly + * with a returned code (e.g. 
TIPC_LINK_DOWN_EVT) */ -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) { struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; - u16 seqno; - u16 n = 0; + u16 seqno, n = 0; + int rc = 0; + + if (gap && link_retransmit_failure(l, l, acked + 1, &rc)) + return rc; skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); @@ -1369,6 +1396,8 @@ next_gap_ack: goto next_gap_ack; } } + + return 0; } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1481,7 +1510,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1918,7 +1946,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); /* If NACK, retransmit will now start at right position */ if (gap) @@ -2035,7 +2063,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) @@ -2131,7 +2159,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); l->stats.recv_nacks++; return rc; } @@ -2550,7 +2578,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); i += scnprintf(buf + i, sz - i, " %u", l->prev_from); - i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt); + i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", l->acked); list = &l->transmq; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8de02ad6e352..da509f0eb9ca 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m) return ((unchar *)m) + msg_hdr_sz(m); } -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } @@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) - m = msg_get_wrapped(m); + m = msg_inner_hdr(m); return msg_word(m, 4); } diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 99bd166bccec..d6165ad384c0 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -261,7 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = { .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, .maxattr = TIPC_NLA_MAX, - .policy = tipc_nl_policy, + .policy = tipc_nl_policy, .netnsok = true, .module = THIS_MODULE, 
.ops = tipc_genl_v2_ops, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index cf155061c472..d86030ef1232 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -691,7 +691,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *media; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -699,10 +698,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, if (!media) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; @@ -723,7 +718,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *bearer; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -731,10 +725,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, if (!bearer) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; diff --git a/net/tipc/node.c b/net/tipc/node.c index 550581d47d51..324a1f91b394 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 1405ccc9101c..287df68721df 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -76,6 +76,7 @@ struct udp_media_addr { /* struct udp_replicast - container for UDP remote addresses */ struct udp_replicast { struct udp_media_addr addr; + struct dst_cache dst_cache; struct rcu_head rcu; struct list_head list; }; @@ -158,22 +159,27 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) /* tipc_send_msg - enqueue a send request */ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_bearer *ub, struct udp_media_addr *src, - struct udp_media_addr *dst) + struct udp_media_addr *dst, struct dst_cache *cache) { + struct dst_entry *ndst = dst_cache_get(cache); int ttl, err = 0; - struct rtable *rt; if (dst->proto == htons(ETH_P_IP)) { - struct flowi4 fl = { - .daddr = dst->ipv4.s_addr, - .saddr = src->ipv4.s_addr, - .flowi4_mark = skb->mark, - .flowi4_proto = IPPROTO_UDP - }; - rt = ip_route_output_key(net, &fl); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto tx_error; + struct rtable *rt = (struct rtable *)ndst; + + if (!rt) { + struct flowi4 fl = { + .daddr = dst->ipv4.s_addr, + .saddr = src->ipv4.s_addr, + .flowi4_mark = skb->mark, + .flowi4_proto = IPPROTO_UDP + }; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto tx_error; + } + dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } ttl = ip4_dst_hoplimit(&rt->dst); @@ -182,17 +188,19 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { - struct dst_entry *ndst; - struct flowi6 fl6 = { - .flowi6_oif = ub->ifindex, - .daddr = dst->ipv6, - .saddr = src->ipv6, - .flowi6_proto = IPPROTO_UDP - }; - err = 
ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst, - &fl6); - if (err) - goto tx_error; + if (!ndst) { + struct flowi6 fl6 = { + .flowi6_oif = ub->ifindex, + .daddr = dst->ipv6, + .saddr = src->ipv6, + .flowi6_proto = IPPROTO_UDP + }; + err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, + &ndst, &fl6); + if (err) + goto tx_error; + dst_cache_set_ip6(cache, ndst, &fl6.saddr); + } ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, @@ -223,14 +231,15 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rcu_dereference(b->media_ptr); if (!ub) { err = -ENODEV; goto out; } if (addr->broadcast != TIPC_REPLICAST_SUPPORT) - return tipc_udp_xmit(net, skb, ub, src, dst); + return tipc_udp_xmit(net, skb, ub, src, dst, + &ub->rcast.dst_cache); /* Replicast, send an skb to each configured IP address */ list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { @@ -242,7 +251,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, goto out; } - err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); + err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr, + &rcast->dst_cache); if (err) goto out; } @@ -286,6 +296,11 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b, if (!rcast) return -ENOMEM; + if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) { + kfree(rcast); + return -ENOMEM; + } + memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); if (ntohs(addr->proto) == ETH_P_IP) @@ -475,7 +490,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) } } - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { rtnl_unlock(); return -EINVAL; @@ -517,7 +532,7 @@ int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) struct udp_bearer *ub; struct nlattr *nest; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) return -ENODEV; @@ -742,6 +757,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); + err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC); + if (err) + goto free; + /** * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. 
@@ -752,12 +771,14 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, else err = tipc_udp_rcast_add(b, &remote); if (err) - goto err; + goto free; return 0; + +free: + dst_cache_destroy(&ub->rcast.dst_cache); + udp_tunnel_sock_release(ub->ubsock); err: - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); kfree(ub); return err; } @@ -769,12 +790,13 @@ static void cleanup_bearer(struct work_struct *work) struct udp_replicast *rcast, *tmp; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + dst_cache_destroy(&rcast->dst_cache); list_del_rcu(&rcast->list); kfree_rcu(rcast, rcu); } - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); + dst_cache_destroy(&ub->rcast.dst_cache); + udp_tunnel_sock_release(ub->ubsock); synchronize_net(); kfree(ub); } @@ -784,13 +806,12 @@ static void tipc_udp_disable(struct tipc_bearer *b) { struct udp_bearer *ub; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { pr_err("UDP bearer instance not found\n"); return; } - if (ub->ubsock) - sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1f9cf57d9754..7c0b2b778703 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -61,7 +61,7 @@ static void tls_device_free_ctx(struct tls_context *ctx) if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); - kfree(ctx); + tls_ctx_free(ctx); } static void tls_device_gc_task(struct work_struct *work) @@ -209,6 +209,33 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, + u32 seq) +{ + struct net_device *netdev; + struct sk_buff *skb; + int err = 0; + u8 *rcd_sn; + + skb = tcp_write_queue_tail(sk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + rcd_sn = tls_ctx->tx.rec_seq; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (netdev) + err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, + rcd_sn, + TLS_OFFLOAD_CTX_DIR_TX); + up_read(&device_offload_lock); + if (err) + return; + + clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); +} + static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) @@ -252,7 +279,7 @@ static int tls_push_record(struct sock *sk, skb_frag_address(frag), record->len - prot->prepend_size, record_type, - ctx->crypto_send.info.version); + prot->version); /* HW doesn't care about the data in the tag, because it fills it. 
*/ dummy_tag_frag.page = skb_frag_page(frag); @@ -264,7 +291,11 @@ static int tls_push_record(struct sock *sk, list_add_tail(&record->list, &offload_ctx->records_list); spin_unlock_irq(&offload_ctx->lock); offload_ctx->open_record = NULL; - tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version); + + if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) + tls_device_resync_tx(sk, ctx, tp->write_seq); + + tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; @@ -551,7 +582,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) } static void tls_device_resync_rx(struct tls_context *tls_ctx, - struct sock *sk, u32 seq, u64 rcd_sn) + struct sock *sk, u32 seq, u8 *rcd_sn) { struct net_device *netdev; @@ -559,14 +590,17 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, return; netdev = READ_ONCE(tls_ctx->netdev); if (netdev) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn); + netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, + TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); } -void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + struct tls_prot_info *prot; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -574,15 +608,83 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) if (tls_ctx->rx_conf != TLS_HW) return; + prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); - resync_req = atomic64_read(&rx_ctx->resync_req); - req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); - is_req_pending = resync_req; + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); - if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + switch (rx_ctx->resync_type) { + case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); + is_req_pending = resync_req; + + if (likely(!is_req_pending) || req_seq != seq || + !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + return; + break; + case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: + if (likely(!rx_ctx->resync_nh_do_now)) + return; + + /* head of next rec is already in, note that the sock_inq will + * include the currently parsed message when called from parser + */ + if (tcp_inq(sk) > rcd_len) + return; + + rx_ctx->resync_nh_do_now = 0; + seq += rcd_len; + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + break; + } + + tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); +} + +static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, + struct tls_offload_context_rx *ctx, + struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm; + + /* device will request resyncs by itself based on stream scan */ + if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) + return; + /* already scheduled */ + if (ctx->resync_nh_do_now) + return; + /* seen decrypted fragments since last fully-failed record */ + if (ctx->resync_nh_reset) { + ctx->resync_nh_reset = 0; + ctx->resync_nh.decrypted_failed = 1; + ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; + return; + } + + if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) + return; + + /* 
doing resync, bump the next target in case it fails */ + if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) + ctx->resync_nh.decrypted_tgt *= 2; + else + ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; + + rxm = strp_msg(skb); + + /* head of next rec is already in, parser will sync for us */ + if (tcp_inq(sk) > rxm->full_len) { + ctx->resync_nh_do_now = 1; + } else { + struct tls_prot_info *prot = &tls_ctx->prot_info; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + + tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, + rcd_sn); } } @@ -610,8 +712,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); - skb_copy_bits(skb, offset, buf, - TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + err = skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + if (err) + goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, skb, sg); @@ -625,8 +729,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb->decrypted) { + err = skb_store_bits(skb, offset, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -649,8 +756,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); - if (skb_iter->decrypted) - skb_store_bits(skb_iter, frag_pos, buf, copy); + if (skb_iter->decrypted) { + err = skb_store_bits(skb_iter, frag_pos, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -671,10 +781,6 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) int is_encrypted = !is_decrypted; struct sk_buff *skb_iter; - /* Skip if it is already decrypted */ - if (ctx->sw.decrypted) - return 0; - /* Check if all the data is decrypted already */ skb_walk_frags(skb, skb_iter) { is_decrypted &= skb_iter->decrypted; @@ -683,12 +789,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) ctx->sw.decrypted |= is_decrypted; - /* Return immedeatly if the record is either entirely plaintext or + /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ - return (is_encrypted || is_decrypted) ? 
0 : - tls_device_reencrypt(sk, skb); + if (is_decrypted) { + ctx->resync_nh_reset = 1; + return 0; + } + if (is_encrypted) { + tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); + return 0; + } + + ctx->resync_nh_reset = 1; + return tls_device_reencrypt(sk, skb); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, @@ -742,6 +857,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) } crypto_info = &ctx->crypto_send.info; + if (crypto_info->version != TLS_1_2_VERSION) { + rc = -EOPNOTSUPP; + goto free_offload_ctx; + } + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -757,6 +877,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_offload_ctx; } + /* Sanity-check the rec_seq_size for stack allocations */ + if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { + rc = -EINVAL; + goto free_offload_ctx; + } + + prot->version = crypto_info->version; + prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + nonce_size; prot->tag_size = tag_size; prot->overhead_size = prot->prepend_size + prot->tag_size; @@ -876,6 +1004,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) struct net_device *netdev; int rc = 0; + if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) + return -EOPNOTSUPP; + /* We support starting offload on multiple sockets * concurrently, so we only need a read lock here. * This lock must precede get_netdev_for_sock to prevent races between @@ -908,6 +1039,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = -ENOMEM; goto release_netdev; } + context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, ctx, 0); @@ -1015,7 +1147,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if ((dev->features & NETIF_F_HW_TLS_RX) && - !dev->tlsdev_ops->tls_dev_resync_rx) + !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index c3a5fe624b4e..9070d68a92a4 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -209,6 +209,10 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) update_chksum(nskb, headln); + /* sock_efree means skb must gone through skb_orphan_partial() */ + if (nskb->destructor == sock_efree) + return; + delta = nskb->truesize - skb->truesize; if (likely(delta < 0)) WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); @@ -240,7 +244,6 @@ static int fill_sg_in(struct scatterlist *sg_in, record = tls_get_record(ctx, tcp_seq, rcd_sn); if (!record) { spin_unlock_irqrestore(&ctx->lock, flags); - WARN(1, "Record not found for seq %u\n", tcp_seq); return -EINVAL; } @@ -409,7 +412,10 @@ put_sg: put_page(sg_page(&sg_in[--resync_sgs])); kfree(sg_in); free_orig: - kfree_skb(skb); + if (nskb) + consume_skb(skb); + else + kfree_skb(skb); return nskb; } @@ -424,6 +430,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); +struct sk_buff *tls_encrypt_skb(struct sk_buff *skb) +{ + return tls_sw_fallback(skb->sk, skb); +} +EXPORT_SYMBOL_GPL(tls_encrypt_skb); + int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e2b69e805d46..4674e57e66b0 
100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -251,7 +251,7 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } -static void tls_ctx_free(struct tls_context *ctx) +void tls_ctx_free(struct tls_context *ctx) { if (!ctx) return; @@ -643,7 +643,7 @@ static void tls_hw_sk_destruct(struct sock *sk) ctx->sk_destruct(sk); /* Free ctx */ - kfree(ctx); + tls_ctx_free(ctx); icsk->icsk_ulp_data = NULL; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 455a782c7658..53b4ad94e74a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk, /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; - tls_advance_record_sn(sk, &tls_ctx->tx, prot->version); + tls_advance_record_sn(sk, prot, &tls_ctx->tx); return rc; } @@ -1485,15 +1485,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; - int version = prot->version; struct strp_msg *rxm = strp_msg(skb); int pad, err = 0; if (!ctx->decrypted) { #ifdef CONFIG_TLS_DEVICE - err = tls_device_decrypted(sk, skb); - if (err < 0) - return err; + if (tls_ctx->rx_conf == TLS_HW) { + err = tls_device_decrypted(sk, skb); + if (err < 0) + return err; + } #endif /* Still not decrypted after tls_device */ if (!ctx->decrypted) { @@ -1501,8 +1502,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, async); if (err < 0) { if (err == -EINPROGRESS) - tls_advance_record_sn(sk, &tls_ctx->rx, - version); + tls_advance_record_sn(sk, prot, + &tls_ctx->rx); return err; } @@ -1517,7 +1518,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->full_len -= pad; rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx, version); + tls_advance_record_sn(sk, prot, &tls_ctx->rx); ctx->decrypted = true; ctx->saved_data_ready(sk); } else { @@ -1958,7 +1959,8 @@ bool tls_sw_stream_read(const struct sock *sk) ingress_empty = list_empty(&psock->ingress_msg); rcu_read_unlock(); - return !ingress_empty || ctx->recv_pkt; + return !ingress_empty || ctx->recv_pkt || + !skb_queue_empty(&ctx->rx_list); } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -2013,8 +2015,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } #ifdef CONFIG_TLS_DEVICE - handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, - *(u64*)tls_ctx->rx.rec_seq); + tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE, + TCP_SKB_CB(skb)->seq + rxm->offset); #endif return data_len + TLS_HEADER_SIZE; @@ -2281,8 +2283,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } - /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { + /* Sanity-check the sizes for stack allocations. 
*/ + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || + rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { rc = -EINVAL; goto free_priv; } diff --git a/net/unix/diag.c b/net/unix/diag.c index c51a707260fa..9ff64f9df1f3 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -5,9 +5,11 @@ #include <linux/unix_diag.h> #include <linux/skbuff.h> #include <linux/module.h> +#include <linux/uidgid.h> #include <net/netlink.h> #include <net/af_unix.h> #include <net/tcp_states.h> +#include <net/sock.h> static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { @@ -111,6 +113,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +{ + uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { @@ -157,6 +165,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) goto out_nlmsg_trim; + if ((req->udiag_show & UDIAG_SHOW_UID) && + sk_diag_dump_uid(sk, skb)) + goto out_nlmsg_trim; + nlmsg_end(skb, nlh); return 0; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 169112f8aa1e..ab47bf3ab66e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -274,7 +274,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_bound(vsk); + if (__vsock_in_bound_table(vsk)) + __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); @@ -282,7 +283,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_connected(vsk); + if (__vsock_in_connected_table(vsk)) + __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); @@ -318,35 +320,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); -static bool vsock_in_bound_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_bound_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - return ret; -} - -static bool vsock_in_connected_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_connected_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - return ret; -} - void vsock_remove_sock(struct vsock_sock *vsk) { - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -477,8 +454,7 @@ static void vsock_pending_work(struct work_struct *work) * incoming packets can't find this socket, and to reduce the reference * count. 
*/ - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 62dcdf082349..f2084e3f7aa4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -14,14 +14,14 @@ #include <net/sock.h> #include <net/af_vsock.h> -/* The host side's design of the feature requires 6 exact 4KB pages for - * recv/send rings respectively -- this is suboptimal considering memory - * consumption, however unluckily we have to live with it, before the - * host comes up with a better design in the future. +/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some + * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer + * hosts don't have this limitation; but, keep the defaults the same for compat. */ #define PAGE_SIZE_4K 4096 #define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) #define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) +#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -46,8 +46,9 @@ struct hvs_recv_buf { }; /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use - * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated - * buffer, because tests show there is no significant performance difference. + * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the + * guest and the host processing as one VMBUS packet is the smallest processing + * unit. * * Note: the buffer can be eliminated in the future when we add new VMBus * ringbuffer APIs that allow us to directly copy data from userspace buffer @@ -321,8 +322,11 @@ static void hvs_open_connection(struct vmbus_channel *chan) struct sockaddr_vm addr; struct sock *sk, *new = NULL; struct vsock_sock *vnew = NULL; - struct hvsock *hvs, *hvs_new = NULL; + struct hvsock *hvs = NULL; + struct hvsock *hvs_new = NULL; + int rcvbuf; int ret; + int sndbuf; if_type = &chan->offermsg.offer.if_type; if_instance = &chan->offermsg.offer.if_instance; @@ -364,9 +368,34 @@ static void hvs_open_connection(struct vmbus_channel *chan) } set_channel_read_mode(chan, HV_CALL_DIRECT); - ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE, - RINGBUFFER_HVS_RCV_SIZE, NULL, 0, - hvs_channel_cb, conn_from_host ? new : sk); + + /* Use the socket buffer sizes as hints for the VMBUS ring size. For + * server side sockets, 'sk' is the parent socket and thus, this will + * allow the child sockets to inherit the size from the parent. Keep + * the mins to the default value and align to page size as per VMBUS + * requirements. + * For the max, the socket core library will limit the socket buffer + * size that can be set by the user, but, since currently, the hv_sock + * VMBUS ring buffer is physically contiguous allocation, restrict it + * further. + * Older versions of hv_sock host side code cannot handle bigger VMBUS + * ring buffer size. Use the version number to limit the change to newer + * versions. 
+ */ + if (vmbus_proto_version < VERSION_WIN10_V5) { + sndbuf = RINGBUFFER_HVS_SND_SIZE; + rcvbuf = RINGBUFFER_HVS_RCV_SIZE; + } else { + sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); + sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); + sndbuf = ALIGN(sndbuf, PAGE_SIZE); + rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); + rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); + rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + } + + ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, + conn_from_host ? new : sk); if (ret != 0) { if (conn_from_host) { hvs_new->chan = NULL; @@ -424,6 +453,7 @@ static u32 hvs_get_local_cid(void) static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) { struct hvsock *hvs; + struct sock *sk = sk_vsock(vsk); hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); if (!hvs) @@ -431,7 +461,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->trans = hvs; hvs->vsk = vsk; - + sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE; + sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE; return 0; } @@ -627,7 +658,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, struct hvsock *hvs = vsk->trans; struct vmbus_channel *chan = hvs->chan; struct hvs_send_buf *send_buf; - ssize_t to_write, max_writable, ret; + ssize_t to_write, max_writable; + ssize_t ret = 0; + ssize_t bytes_written = 0; BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); @@ -635,20 +668,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, if (!send_buf) return -ENOMEM; - max_writable = hvs_channel_writable_bytes(chan); - to_write = min_t(ssize_t, len, max_writable); - to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); - - ret = memcpy_from_msg(send_buf->data, msg, to_write); - if (ret < 0) - goto out; + /* Reader(s) could be draining data from the channel as we write. + * Maximize bandwidth, by iterating until the channel is found to be + * full. + */ + while (len) { + max_writable = hvs_channel_writable_bytes(chan); + if (!max_writable) + break; + to_write = min_t(ssize_t, len, max_writable); + to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); + /* memcpy_from_msg is safe for loop as it advances the offsets + * within the message iterator. + */ + ret = memcpy_from_msg(send_buf->data, msg, to_write); + if (ret < 0) + goto out; - ret = hvs_send_data(hvs->chan, send_buf, to_write); - if (ret < 0) - goto out; + ret = hvs_send_data(hvs->chan, send_buf, to_write); + if (ret < 0) + goto out; - ret = to_write; + bytes_written += to_write; + len -= to_write; + } out: + /* If any data has been sent, return that */ + if (bytes_written) + ret = bytes_written; kfree(send_buf); return ret; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 9c287e3e393c..0815d1357861 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -38,6 +38,7 @@ struct virtio_vsock { * must be accessed with tx_lock held. */ struct mutex tx_lock; + bool tx_run; struct work_struct send_pkt_work; spinlock_t send_pkt_list_lock; @@ -53,6 +54,7 @@ struct virtio_vsock { * must be accessed with rx_lock held. */ struct mutex rx_lock; + bool rx_run; int rx_buf_nr; int rx_buf_max_nr; @@ -60,24 +62,28 @@ struct virtio_vsock { * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. 
*/ struct mutex event_lock; + bool event_run; struct virtio_vsock_event event_list[8]; u32 guest_cid; }; -static struct virtio_vsock *virtio_vsock_get(void) -{ - return the_virtio_vsock; -} - static u32 virtio_transport_get_local_cid(void) { - struct virtio_vsock *vsock = virtio_vsock_get(); + struct virtio_vsock *vsock; + u32 ret; - if (!vsock) - return VMADDR_CID_ANY; + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); + if (!vsock) { + ret = VMADDR_CID_ANY; + goto out_rcu; + } - return vsock->guest_cid; + ret = vsock->guest_cid; +out_rcu: + rcu_read_unlock(); + return ret; } static void virtio_transport_loopback_work(struct work_struct *work) @@ -91,6 +97,10 @@ static void virtio_transport_loopback_work(struct work_struct *work) spin_unlock_bh(&vsock->loopback_list_lock); mutex_lock(&vsock->rx_lock); + + if (!vsock->rx_run) + goto out; + while (!list_empty(&pkts)) { struct virtio_vsock_pkt *pkt; @@ -99,6 +109,7 @@ static void virtio_transport_loopback_work(struct work_struct *work) virtio_transport_recv_pkt(pkt); } +out: mutex_unlock(&vsock->rx_lock); } @@ -127,6 +138,9 @@ virtio_transport_send_pkt_work(struct work_struct *work) mutex_lock(&vsock->tx_lock); + if (!vsock->tx_run) + goto out; + vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { @@ -185,6 +199,7 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (added) virtqueue_kick(vq); +out: mutex_unlock(&vsock->tx_lock); if (restart_rx) @@ -197,14 +212,18 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) struct virtio_vsock *vsock; int len = pkt->len; - vsock = virtio_vsock_get(); + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { virtio_transport_free_pkt(pkt); - return -ENODEV; + len = -ENODEV; + goto out_rcu; } - if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) - return virtio_transport_send_pkt_loopback(vsock, pkt); + if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { + len = virtio_transport_send_pkt_loopback(vsock, pkt); + goto out_rcu; + } if (pkt->reply) atomic_inc(&vsock->queued_replies); @@ -214,6 +233,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) spin_unlock_bh(&vsock->send_pkt_list_lock); queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + +out_rcu: + rcu_read_unlock(); return len; } @@ -222,12 +244,14 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) { struct virtio_vsock *vsock; struct virtio_vsock_pkt *pkt, *n; - int cnt = 0; + int cnt = 0, ret; LIST_HEAD(freeme); - vsock = virtio_vsock_get(); + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { - return -ENODEV; + ret = -ENODEV; + goto out_rcu; } spin_lock_bh(&vsock->send_pkt_list_lock); @@ -255,7 +279,11 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } - return 0; + ret = 0; + +out_rcu: + rcu_read_unlock(); + return ret; } static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) @@ -307,6 +335,10 @@ static void virtio_transport_tx_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; mutex_lock(&vsock->tx_lock); + + if (!vsock->tx_run) + goto out; + do { struct virtio_vsock_pkt *pkt; unsigned int len; @@ -317,6 +349,8 @@ static void virtio_transport_tx_work(struct work_struct *work) added = true; } } while (!virtqueue_enable_cb(vq)); + +out: mutex_unlock(&vsock->tx_lock); if (added) @@ -345,6 +379,9 @@ static void virtio_transport_rx_work(struct work_struct *work) mutex_lock(&vsock->rx_lock); + if (!vsock->rx_run) + goto out; + do { virtqueue_disable_cb(vq); for (;;) { @@ -454,6 
+491,9 @@ static void virtio_transport_event_work(struct work_struct *work) mutex_lock(&vsock->event_lock); + if (!vsock->event_run) + goto out; + do { struct virtio_vsock_event *event; unsigned int len; @@ -468,7 +508,7 @@ static void virtio_transport_event_work(struct work_struct *work) } while (!virtqueue_enable_cb(vq)); virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); - +out: mutex_unlock(&vsock->event_lock); } @@ -565,7 +605,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev) return ret; /* Only one virtio-vsock device per guest is supported */ - if (the_virtio_vsock) { + if (rcu_dereference_protected(the_virtio_vsock, + lockdep_is_held(&the_virtio_vsock_mutex))) { ret = -EBUSY; goto out; } @@ -590,8 +631,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->rx_buf_max_nr = 0; atomic_set(&vsock->queued_replies, 0); - vdev->priv = vsock; - the_virtio_vsock = vsock; mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); mutex_init(&vsock->event_lock); @@ -605,14 +644,23 @@ static int virtio_vsock_probe(struct virtio_device *vdev) INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work); + mutex_lock(&vsock->tx_lock); + vsock->tx_run = true; + mutex_unlock(&vsock->tx_lock); + mutex_lock(&vsock->rx_lock); virtio_vsock_rx_fill(vsock); + vsock->rx_run = true; mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->event_lock); virtio_vsock_event_fill(vsock); + vsock->event_run = true; mutex_unlock(&vsock->event_lock); + vdev->priv = vsock; + rcu_assign_pointer(the_virtio_vsock, vsock); + mutex_unlock(&the_virtio_vsock_mutex); return 0; @@ -627,15 +675,33 @@ static void virtio_vsock_remove(struct virtio_device *vdev) struct virtio_vsock *vsock = vdev->priv; struct virtio_vsock_pkt *pkt; - flush_work(&vsock->loopback_work); - flush_work(&vsock->rx_work); - flush_work(&vsock->tx_work); - flush_work(&vsock->event_work); - flush_work(&vsock->send_pkt_work); + mutex_lock(&the_virtio_vsock_mutex); + + vdev->priv = NULL; + rcu_assign_pointer(the_virtio_vsock, NULL); + synchronize_rcu(); /* Reset all connected sockets when the device disappear */ vsock_for_each_connected_socket(virtio_vsock_reset_sock); + /* Stop all work handlers to make sure no one is accessing the device, + * so we can safely call vdev->config->reset(). + */ + mutex_lock(&vsock->rx_lock); + vsock->rx_run = false; + mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->tx_lock); + vsock->tx_run = false; + mutex_unlock(&vsock->tx_lock); + + mutex_lock(&vsock->event_lock); + vsock->event_run = false; + mutex_unlock(&vsock->event_lock); + + /* Flush all device writes and interrupts, device will not use any + * more buffers. + */ vdev->config->reset(vdev); mutex_lock(&vsock->rx_lock); @@ -666,12 +732,20 @@ static void virtio_vsock_remove(struct virtio_device *vdev) } spin_unlock_bh(&vsock->loopback_list_lock); - mutex_lock(&the_virtio_vsock_mutex); - the_virtio_vsock = NULL; - mutex_unlock(&the_virtio_vsock_mutex); - + /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); + /* Other works can be queued before 'config->del_vqs()', so we flush + * all works before to free the vsock object to avoid use after free. 
+ */ + flush_work(&vsock->loopback_work); + flush_work(&vsock->rx_work); + flush_work(&vsock->tx_work); + flush_work(&vsock->event_work); + flush_work(&vsock->send_pkt_work); + + mutex_unlock(&the_virtio_vsock_mutex); + kfree(vsock); } diff --git a/net/wireless/core.c b/net/wireless/core.c index 53ad3dbb76fe..45d9afcff6d5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; } + for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { + /* + * Validate we have a policy (can be explicitly set to + * VENDOR_CMD_RAW_DATA which is non-NULL) and also that + * we have at least one of doit/dumpit. + */ + if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) + return -EINVAL; + if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && + !rdev->wiphy.vendor_commands[i].dumpit)) + return -EINVAL; + } + #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || diff --git a/net/wireless/core.h b/net/wireless/core.h index 84d36ca7a7ab..ee8388fe4a92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +struct cfg80211_internal_bss * +cfg80211_bss_update(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *tmp, + bool signal_valid, unsigned long ts); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 520d437aa8d1..fc83dd179c1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -571,6 +571,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PEER_MEASUREMENTS] = NLA_POLICY_NESTED(nl80211_pmsr_attr_policy), [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, + .len = SAE_PASSWORD_MAX_LEN }, + [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4447,6 +4450,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD) && auth_type == NL80211_AUTHTYPE_SAE) return false; @@ -4637,6 +4642,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.twt_responder = + nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) @@ -8751,7 +8759,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) static bool nl80211_valid_wpa_versions(u32 wpa_versions) { return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | - NL80211_WPA_VERSION_2)); + NL80211_WPA_VERSION_2 | + NL80211_WPA_VERSION_3)); } static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) @@ -8987,6 +8996,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } + if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD)) + return -EINVAL; + settings->sae_pwd = + nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + settings->sae_pwd_len = + 
nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + } + return 0; } @@ -12669,6 +12688,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb, return 0; } +static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + if (vcmd->policy == VENDOR_CMD_RAW_DATA) { + if (attr->nla_type & NLA_F_NESTED) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unexpected nested data"); + return -EINVAL; + } + + return 0; + } + + if (!(attr->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data"); + return -EINVAL; + } + + return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy, + extack); +} + static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -12727,11 +12769,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]); len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy(vcmd, + info->attrs[NL80211_ATTR_VENDOR_DATA], + info->extack); + if (err) + return err; } rdev->cur_cmd_info = info; - err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev, - data, len); + err = vcmd->doit(&rdev->wiphy, wdev, data, len); rdev->cur_cmd_info = NULL; return err; } @@ -12818,6 +12865,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy( + &(*rdev)->wiphy.vendor_commands[vcmd_idx], + attrbuf[NL80211_ATTR_VENDOR_DATA], + cb->extack); + if (err) + return err; } /* 0 is the first index - add 1 to parse only once */ @@ -15086,7 +15140,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, return; } - if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -15376,6 +15432,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); +void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan); + nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL, + rdev, wdev, cookie, chan, 0, gfp); +} +EXPORT_SYMBOL(cfg80211_tx_mgmt_expired); + void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { diff --git a/net/wireless/scan.c b/net/wireless/scan.c index aa571d727903..d66e6d4b7555 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss { }; /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ -static struct cfg80211_internal_bss * +struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, - bool signal_valid) + bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; if (WARN_ON(!tmp->pub.channel)) return NULL; - tmp->ts = jiffies; + tmp->ts = ts; spin_lock_bh(&rdev->bss_lock); @@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1972,6 +1974,27 @@ out: } EXPORT_SYMBOL(cfg80211_unlink_bss); +void cfg80211_bss_iter(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + void (*iter)(struct wiphy *wiphy, + struct cfg80211_bss *bss, + void *data), + void *iter_data) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&rdev->bss_lock); + + list_for_each_entry(bss, &rdev->bss_list, list) { + if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) + iter(wiphy, &bss->pub, iter_data); + } + + spin_unlock_bh(&rdev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_bss_iter); + #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7d34cb884840..7a6c38ddc65a 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev, u8 *next; if (params->bss) { - /* Make sure the bss entry provided by the driver is valid. */ struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); - if (WARN_ON(list_empty(&ibss->list))) { - cfg80211_put_bss(wdev->wiphy, params->bss); - return; + if (list_empty(&ibss->list)) { + struct cfg80211_bss *found = NULL, *tmp = params->bss; + + found = cfg80211_get_bss(wdev->wiphy, NULL, + params->bss->bssid, + wdev->ssid, wdev->ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (found) { + /* The same BSS is already updated so use it + * instead, as it has latest info. + */ + params->bss = found; + } else { + /* Update with BSS provided by driver, it will + * be freshly added and ref cnted, we can free + * the old one. + * + * signal_valid can be false, as we are not + * expecting the BSS to be found. 
+ * + * keep the old timestamp to avoid confusion + */ + cfg80211_bss_update(rdev, ibss, false, + ibss->ts); + } + + cfg80211_put_bss(wdev->wiphy, tmp); } } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2abfff925aac..4fbb91a511ae 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired, WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_tx_mgmt_expired, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan), + TP_ARGS(wdev, cookie, chan), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT, + WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) +); + TRACE_EVENT(cfg80211_new_sta, TP_PROTO(struct net_device *netdev, const u8 *mac_addr, struct station_info *sinfo), diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9c6de4f114f8..20c91f02d3d8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -105,6 +105,9 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, umem->dev = dev; umem->queue_id = queue_id; + + dev_hold(dev); + if (force_copy) /* For copy-mode, we are done. */ goto out_rtnl_unlock; @@ -124,7 +127,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, goto err_unreg_umem; rtnl_unlock(); - dev_hold(dev); umem->zc = true; return 0; @@ -138,11 +140,13 @@ out_rtnl_unlock: return err; } -static void xdp_umem_clear_dev(struct xdp_umem *umem) +void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; + ASSERT_RTNL(); + if (!umem->dev) return; @@ -151,22 +155,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; - rtnl_lock(); err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); - rtnl_unlock(); if (err) WARN(1, "failed to disable umem!\n"); } - rtnl_lock(); xdp_clear_umem_at_qid(umem->dev, umem->queue_id); - rtnl_unlock(); - if (umem->zc) { - dev_put(umem->dev); - umem->zc = false; - } + dev_put(umem->dev); + umem->dev = NULL; + umem->zc = false; } static void xdp_umem_unpin_pages(struct xdp_umem *umem) @@ -194,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) static void xdp_umem_release(struct xdp_umem *umem) { + rtnl_lock(); xdp_umem_clear_dev(umem); + rtnl_unlock(); ida_simple_remove(&umem_ida, umem->id); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 27603227601b..a63a9fb251f5 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,6 +10,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a14e8864e4fa..d4d6f10aa936 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return xskq_has_addrs(umem->fq, cnt); +} +EXPORT_SYMBOL(xsk_umem_has_addrs); + u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { return xskq_peek_addr(umem->fq, addr); @@ -123,13 +129,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u64 addr; int err; - if (xs->dev != xdp->rxq->dev || 
xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; + spin_lock_bh(&xs->rx_lock); + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { + err = -EINVAL; + goto out_unlock; + } if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; + err = -ENOSPC; + goto out_drop; } addr += xs->umem->headroom; @@ -138,13 +148,21 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) { - xskq_discard_addr(xs->umem->fq); - xsk_flush(xs); - return 0; - } + if (err) + goto out_drop; + + xskq_discard_addr(xs->umem->fq); + xskq_produce_flush_desc(xs->rx); + + spin_unlock_bh(&xs->rx_lock); + xs->sk.sk_data_ready(&xs->sk); + return 0; + +out_drop: xs->rx_dropped++; +out_unlock: + spin_unlock_bh(&xs->rx_lock); return err; } @@ -166,22 +184,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_consume_tx_done); -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) { - struct xdp_desc desc; struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, &desc)) + if (!xskq_peek_desc(xs->tx, desc)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc.addr)) + if (xskq_produce_addr_lazy(umem->cq, desc->addr)) goto out; - *dma = xdp_umem_get_dma(umem, desc.addr); - *len = desc.len; - xskq_discard_desc(xs->tx); rcu_read_unlock(); return true; @@ -335,6 +349,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void xsk_unbind_dev(struct xdp_sock *xs) +{ + struct net_device *dev = xs->dev; + + if (!dev || xs->state != XSK_BOUND) + return; + + xs->state = XSK_UNBOUND; + + /* Wait for driver to stop using the xdp socket. */ + xdp_del_sk_umem(xs->umem, xs); + xs->dev = NULL; + synchronize_net(); + dev_put(dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -354,15 +384,7 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); - if (xs->dev) { - struct net_device *dev = xs->dev; - - /* Wait for driver to stop using the xdp socket. */ - xdp_del_sk_umem(xs->umem, xs); - xs->dev = NULL; - synchronize_net(); - dev_put(dev); - } + xsk_unbind_dev(xs); xskq_destroy(xs->rx); xskq_destroy(xs->tx); @@ -412,7 +434,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - if (xs->dev) { + if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } @@ -492,6 +514,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) out_unlock: if (err) dev_put(dev); + else + xs->state = XSK_BOUND; out_release: mutex_unlock(&xs->mutex); return err; @@ -520,6 +544,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } q = (optname == XDP_TX_RING) ? 
&xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); @@ -534,7 +562,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - if (xs->umem) { + if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } @@ -561,6 +589,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } if (!xs->umem) { mutex_unlock(&xs->mutex); return -EINVAL; @@ -644,6 +676,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_OPTIONS: + { + struct xdp_options opts = {}; + + if (len < sizeof(opts)) + return -EINVAL; + + mutex_lock(&xs->mutex); + if (xs->zc) + opts.flags |= XDP_OPTIONS_ZEROCOPY; + mutex_unlock(&xs->mutex); + + len = sizeof(opts); + if (copy_to_user(optval, &opts, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } @@ -662,6 +714,9 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; + if (xs->state != XSK_READY) + return -EBUSY; + if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { @@ -693,6 +748,38 @@ static int xsk_mmap(struct file *file, struct socket *sock, size, vma->vm_page_prot); } +static int xsk_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct sock *sk; + + switch (msg) { + case NETDEV_UNREGISTER: + mutex_lock(&net->xdp.lock); + sk_for_each(sk, &net->xdp.list) { + struct xdp_sock *xs = xdp_sk(sk); + + mutex_lock(&xs->mutex); + if (xs->dev == dev) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + xsk_unbind_dev(xs); + + /* Clear device references in umem. 
*/ + xdp_umem_clear_dev(xs->umem); + } + mutex_unlock(&xs->mutex); + } + mutex_unlock(&net->xdp.lock); + break; + } + return NOTIFY_DONE; +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -764,7 +851,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); + xs->state = XSK_READY; mutex_init(&xs->mutex); + spin_lock_init(&xs->rx_lock); spin_lock_init(&xs->tx_completion_lock); mutex_lock(&net->xdp.lock); @@ -784,6 +873,10 @@ static const struct net_proto_family xsk_family_ops = { .owner = THIS_MODULE, }; +static struct notifier_block xsk_netdev_notifier = { + .notifier_call = xsk_notifier, +}; + static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); @@ -816,8 +909,15 @@ static int __init xsk_init(void) err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; + + err = register_netdevice_notifier(&xsk_netdev_notifier); + if (err) + goto out_pernet; + return 0; +out_pernet: + unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 88b9ae24658d..909c5168ed0f 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) return q->nentries - (producer - q->cons_tail); } +static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries >= cnt) + return true; + + /* Refresh the local pointer. */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + + return entries >= cnt; +} + /* UMEM queue */ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) @@ -288,7 +302,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head, + q->prod_tail = q->prod_head; WRITE_ONCE(q->ring->producer, q->prod_tail); } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index c967fc3c38c8..51bb6018f3bf 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -15,6 +15,8 @@ config XFRM_ALGO tristate select XFRM select CRYPTO + select CRYPTO_HASH + select CRYPTO_BLKCIPHER if INET config XFRM_USER diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index ff654306d836..189ef15acbbc 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -271,9 +271,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && - (!xdst->child->xfrm && x->type->get_mtu)) { - mtu = x->type->get_mtu(x, xdst->child_mtu_cached); - + (!xdst->child->xfrm)) { + mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 314973aaa414..6088bc2dc11e 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -359,28 +359,29 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); if (likely(afinfo)) err = afinfo->extract_input(x, skb); + rcu_read_unlock(); - if (err) { - rcu_read_unlock(); + if (err) return err; - } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (!inner_mode) { - rcu_read_unlock(); + if (!inner_mode) return -EAFNOSUPPORT; - } } - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if 
(unlikely(!afinfo)) { - rcu_read_unlock(); - return -EAFNOSUPPORT; + switch (inner_mode->family) { + case AF_INET: + skb->protocol = htons(ETH_P_IP); + break; + case AF_INET6: + skb->protocol = htons(ETH_P_IPV6); + break; + default: + WARN_ON_ONCE(1); + break; } - skb->protocol = afinfo->eth_proto; - rcu_read_unlock(); return xfrm_inner_mode_encap_remove(x, inner_mode, skb); } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index ad3a2555c517..74868f9d81fb 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -133,7 +133,7 @@ static void xfrmi_dev_free(struct net_device *dev) free_percpu(dev->tstats); } -static int xfrmi_create2(struct net_device *dev) +static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); @@ -156,54 +156,7 @@ out: return err; } -static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) -{ - struct net_device *dev; - struct xfrm_if *xi; - char name[IFNAMSIZ]; - int err; - - if (p->name[0]) { - strlcpy(name, p->name, IFNAMSIZ); - } else { - err = -EINVAL; - goto failed; - } - - dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); - if (!dev) { - err = -EAGAIN; - goto failed; - } - - dev_net_set(dev, net); - - xi = netdev_priv(dev); - xi->p = *p; - xi->net = net; - xi->dev = dev; - xi->phydev = dev_get_by_index(net, p->link); - if (!xi->phydev) { - err = -ENODEV; - goto failed_free; - } - - err = xfrmi_create2(dev); - if (err < 0) - goto failed_dev_put; - - return xi; - -failed_dev_put: - dev_put(xi->phydev); -failed_free: - free_netdev(dev); -failed: - return ERR_PTR(err); -} - -static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, - int create) +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; @@ -211,17 +164,11 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, for (xip = &xfrmn->xfrmi[0]; (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) { - if (xi->p.if_id == p->if_id) { - if (create) - return ERR_PTR(-EEXIST); - + xip = &xi->next) + if (xi->p.if_id == p->if_id) return xi; - } - } - if (!create) - return ERR_PTR(-ENODEV); - return xfrmi_create(net, p); + + return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) @@ -686,21 +633,33 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); - struct xfrm_if_parms *p; + struct xfrm_if_parms p; struct xfrm_if *xi; + int err; - xi = netdev_priv(dev); - p = &xi->p; - - xfrmi_netlink_parms(data, p); + xfrmi_netlink_parms(data, &p); if (!tb[IFLA_IFNAME]) return -EINVAL; - nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ); + + xi = xfrmi_locate(net, &p); + if (xi) + return -EEXIST; - xi = xfrmi_locate(net, p, 1); - return PTR_ERR_OR_ZERO(xi); + xi = netdev_priv(dev); + xi->p = p; + xi->net = net; + xi->dev = dev; + xi->phydev = dev_get_by_index(net, p.link); + if (!xi->phydev) + return -ENODEV; + + err = xfrmi_create(dev); + if (err < 0) + dev_put(xi->phydev); + return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) @@ -717,9 +676,8 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], xfrmi_netlink_parms(data, &xi->p); - xi = xfrmi_locate(net, &xi->p, 0); - - if (IS_ERR_OR_NULL(xi)) { + xi = xfrmi_locate(net, &xi->p); + if (!xi) { xi = netdev_priv(dev); } 
else { if (xi->dev != dev) @@ -793,11 +751,6 @@ static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) unregister_netdevice_many(&list); } -static int __net_init xfrmi_init_net(struct net *net) -{ - return 0; -} - static void __net_exit xfrmi_exit_net(struct net *net) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); @@ -808,7 +761,6 @@ static void __net_exit xfrmi_exit_net(struct net *net) } static struct pernet_operations xfrmi_net_ops = { - .init = xfrmi_init_net, .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b1694d5d15d3..8ca637a72697 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -585,9 +585,6 @@ static void xfrm_bydst_resize(struct net *net, int dir) odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); - odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, - lockdep_is_held(&net->xfrm.xfrm_policy_lock)); - for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); @@ -1280,13 +1277,17 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_for_each_entry_safe(policy, n, &net->xfrm.policy_inexact[dir], - bydst_inexact_list) + bydst_inexact_list) { + hlist_del_rcu(&policy->bydst); hlist_del_init(&policy->bydst_inexact_list); + } hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; - for (i = hmask; i >= 0; i--) - INIT_HLIST_HEAD(odst + i); + for (i = hmask; i >= 0; i--) { + hlist_for_each_entry_safe(policy, n, odst + i, bydst) + hlist_del_rcu(&policy->bydst); + } if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; @@ -1315,8 +1316,6 @@ static void xfrm_hash_rebuild(struct work_struct *work) chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - hlist_del_rcu(&policy->bydst); - if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); @@ -3628,7 +3627,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_nr = ti; if (npols > 1) { - xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net); + xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 50621d982970..c6f3c4a1bd99 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -27,6 +27,8 @@ #include <linux/interrupt.h> #include <linux/kernel.h> +#include <crypto/aead.h> + #include "xfrm_hash.h" #define xfrm_state_deref_prot(table, net) \ @@ -177,63 +179,132 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) static bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); -static DEFINE_SPINLOCK(xfrm_type_lock); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type **typemap; int err = 0; - if (unlikely(afinfo == NULL)) + if (!afinfo) return -EAFNOSUPPORT; - typemap = afinfo->type_map; - spin_lock_bh(&xfrm_type_lock); - if (likely(typemap[type->proto] == NULL)) - typemap[type->proto] = type; - else - err = -EEXIST; - spin_unlock_bh(&xfrm_type_lock); +#define X(afi, T, name) do { \ + WARN_ON((afi)->type_ ## name); \ + (afi)->type_ ## name = (T); \ + } while (0) + + switch (type->proto) { + case IPPROTO_COMP: + X(afinfo, type, comp); + break; 
+ case IPPROTO_AH: + X(afinfo, type, ah); + break; + case IPPROTO_ESP: + X(afinfo, type, esp); + break; + case IPPROTO_IPIP: + X(afinfo, type, ipip); + break; + case IPPROTO_DSTOPTS: + X(afinfo, type, dstopts); + break; + case IPPROTO_ROUTING: + X(afinfo, type, routing); + break; + case IPPROTO_IPV6: + X(afinfo, type, ipip6); + break; + default: + WARN_ON(1); + err = -EPROTONOSUPPORT; + break; + } +#undef X rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); -int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) +void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type **typemap; - int err = 0; if (unlikely(afinfo == NULL)) - return -EAFNOSUPPORT; - typemap = afinfo->type_map; - spin_lock_bh(&xfrm_type_lock); + return; - if (unlikely(typemap[type->proto] != type)) - err = -ENOENT; - else - typemap[type->proto] = NULL; - spin_unlock_bh(&xfrm_type_lock); +#define X(afi, T, name) do { \ + WARN_ON((afi)->type_ ## name != (T)); \ + (afi)->type_ ## name = NULL; \ + } while (0) + + switch (type->proto) { + case IPPROTO_COMP: + X(afinfo, type, comp); + break; + case IPPROTO_AH: + X(afinfo, type, ah); + break; + case IPPROTO_ESP: + X(afinfo, type, esp); + break; + case IPPROTO_IPIP: + X(afinfo, type, ipip); + break; + case IPPROTO_DSTOPTS: + X(afinfo, type, dstopts); + break; + case IPPROTO_ROUTING: + X(afinfo, type, routing); + break; + case IPPROTO_IPV6: + X(afinfo, type, ipip6); + break; + default: + WARN_ON(1); + break; + } +#undef X rcu_read_unlock(); - return err; } EXPORT_SYMBOL(xfrm_unregister_type); static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { + const struct xfrm_type *type = NULL; struct xfrm_state_afinfo *afinfo; - const struct xfrm_type **typemap; - const struct xfrm_type *type; int modload_attempted = 0; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; - typemap = afinfo->type_map; - type = READ_ONCE(typemap[proto]); + switch (proto) { + case IPPROTO_COMP: + type = afinfo->type_comp; + break; + case IPPROTO_AH: + type = afinfo->type_ah; + break; + case IPPROTO_ESP: + type = afinfo->type_esp; + break; + case IPPROTO_IPIP: + type = afinfo->type_ipip; + break; + case IPPROTO_DSTOPTS: + type = afinfo->type_dstopts; + break; + case IPPROTO_ROUTING: + type = afinfo->type_routing; + break; + case IPPROTO_IPV6: + type = afinfo->type_ipip6; + break; + default: + break; + } + if (unlikely(type && !try_module_get(type->owner))) type = NULL; @@ -253,65 +324,71 @@ static void xfrm_put_type(const struct xfrm_type *type) module_put(type->owner); } -static DEFINE_SPINLOCK(xfrm_type_offload_lock); int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type_offload **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; - typemap = afinfo->type_offload_map; - spin_lock_bh(&xfrm_type_offload_lock); - if (likely(typemap[type->proto] == NULL)) - typemap[type->proto] = type; - else - err = -EEXIST; - spin_unlock_bh(&xfrm_type_offload_lock); + switch (type->proto) { + case IPPROTO_ESP: + WARN_ON(afinfo->type_offload_esp); + afinfo->type_offload_esp = type; + break; + default: + WARN_ON(1); + err = -EPROTONOSUPPORT; + break; + } + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type_offload); -int 
xfrm_unregister_type_offload(const struct xfrm_type_offload *type, - unsigned short family) +void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, + unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type_offload **typemap; - int err = 0; if (unlikely(afinfo == NULL)) - return -EAFNOSUPPORT; - typemap = afinfo->type_offload_map; - spin_lock_bh(&xfrm_type_offload_lock); + return; - if (unlikely(typemap[type->proto] != type)) - err = -ENOENT; - else - typemap[type->proto] = NULL; - spin_unlock_bh(&xfrm_type_offload_lock); + switch (type->proto) { + case IPPROTO_ESP: + WARN_ON(afinfo->type_offload_esp != type); + afinfo->type_offload_esp = NULL; + break; + default: + WARN_ON(1); + break; + } rcu_read_unlock(); - return err; } EXPORT_SYMBOL(xfrm_unregister_type_offload); static const struct xfrm_type_offload * xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) { + const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - const struct xfrm_type_offload **typemap; - const struct xfrm_type_offload *type; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; - typemap = afinfo->type_offload_map; - type = typemap[proto]; + switch (proto) { + case IPPROTO_ESP: + type = afinfo->type_offload_esp; + break; + default: + break; + } + if ((type && !try_module_get(type->owner))) type = NULL; @@ -770,24 +847,79 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) EXPORT_SYMBOL(xfrm_sad_getinfo); static void +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET; + sel->prefixlen_d = 32; + sel->prefixlen_s = 32; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; +} + +static void +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi6 *fl6 = &fl->u.ip6; + + /* Initialize temporary selector matching only to current session. 
*/ + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); - - if (!afinfo) - return; + switch (family) { + case AF_INET: + __xfrm4_init_tempsel(&x->sel, fl); + break; + case AF_INET6: + __xfrm6_init_tempsel(&x->sel, fl); + break; + } - afinfo->init_tempsel(&x->sel, fl); + x->id = tmpl->id; - if (family != tmpl->encap_family) { - afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family); - if (!afinfo) - return; + switch (tmpl->encap_family) { + case AF_INET: + if (x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + break; + case AF_INET6: + if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + break; } - afinfo->init_temprop(x, tmpl, daddr, saddr); + + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = tmpl->encap_family; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, @@ -1633,51 +1765,129 @@ xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY -int +#if IS_ENABLED(CONFIG_IPV6) +/* distribution counting sort function for xfrm_state and xfrm_tmpl */ +static void +__xfrm6_sort(void **dst, void **src, int n, + int (*cmp)(const void *p), int maxclass) +{ + int count[XFRM_MAX_DEPTH] = { }; + int class[XFRM_MAX_DEPTH]; + int i; + + for (i = 0; i < n; i++) { + int c = cmp(src[i]); + + class[i] = c; + count[c]++; + } + + for (i = 2; i < maxclass; i++) + count[i] += count[i - 1]; + + for (i = 0; i < n; i++) { + dst[count[class[i] - 1]++] = src[i]; + src[i] = NULL; + } +} + +/* Rule for xfrm_state: + * + * rule 1: select IPsec transport except AH + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec transport AH + * rule 4: select IPsec tunnel + * rule 5: others + */ +static int __xfrm6_state_sort_cmp(const void *p) +{ + const struct xfrm_state *v = p; + + switch (v->props.mode) { + case XFRM_MODE_TRANSPORT: + if (v->id.proto != IPPROTO_AH) + return 1; + else + return 3; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 4; + } + return 5; +} + +/* Rule for xfrm_tmpl: + * + * rule 1: select IPsec transport + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec tunnel + * rule 4: others + */ +static int __xfrm6_tmpl_sort_cmp(const void *p) +{ + const struct xfrm_tmpl *v = p; + + switch (v->mode) { + case XFRM_MODE_TRANSPORT: + return 1; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case 
XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 3; + } + return 4; +} +#else +static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; } +static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; } + +static inline void +__xfrm6_sort(void **dst, void **src, int n, + int (*cmp)(const void *p), int maxclass) +{ + int i; + + for (i = 0; i < n; i++) + dst[i] = src[i]; +} +#endif /* CONFIG_IPV6 */ + +void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, - unsigned short family, struct net *net) + unsigned short family) { int i; - int err = 0; - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - return -EAFNOSUPPORT; - spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/ - if (afinfo->tmpl_sort) - err = afinfo->tmpl_sort(dst, src, n); + if (family == AF_INET6) + __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_tmpl_sort_cmp, 5); else for (i = 0; i < n; i++) dst[i] = src[i]; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); - rcu_read_unlock(); - return err; } -EXPORT_SYMBOL(xfrm_tmpl_sort); -int +void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { int i; - int err = 0; - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - struct net *net = xs_net(*src); - - if (!afinfo) - return -EAFNOSUPPORT; - spin_lock_bh(&net->xfrm.xfrm_state_lock); - if (afinfo->state_sort) - err = afinfo->state_sort(dst, src, n); + if (family == AF_INET6) + __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_state_sort_cmp, 6); else for (i = 0; i < n; i++) dst[i] = src[i]; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); - rcu_read_unlock(); - return err; } -EXPORT_SYMBOL(xfrm_state_sort); #endif /* Silly enough, but I'm lazy to build resolution list */ @@ -2195,38 +2405,49 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -int xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); + struct crypto_aead *aead; + u32 blksize, net_adj = 0; + + if (x->km.state != XFRM_STATE_VALID || + !type || type->proto != IPPROTO_ESP) + return mtu - x->props.header_len; + + aead = x->data; + blksize = ALIGN(crypto_aead_blocksize(aead), 4); - if (x->km.state == XFRM_STATE_VALID && - type && type->get_mtu) - return type->get_mtu(x, mtu); + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + case XFRM_MODE_BEET: + if (x->props.family == AF_INET) + net_adj = sizeof(struct iphdr); + else if (x->props.family == AF_INET6) + net_adj = sizeof(struct ipv6hdr); + break; + case XFRM_MODE_TUNNEL: + break; + default: + WARN_ON_ONCE(1); + break; + } - return mtu - x->props.header_len; + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { - const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; int family = x->props.family; int err; - err = -EAFNOSUPPORT; - afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - goto error; - - err = 0; - if (afinfo->init_flags) - err = afinfo->init_flags(x); - - rcu_read_unlock(); - - if (err) - goto error; + if (family == AF_INET && + xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) + x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 
173477211e40..b88ba45ff1ac 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -152,6 +152,25 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 	err = -EINVAL;
 	switch (p->family) {
 	case AF_INET:
+		break;
+
+	case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+		break;
+#else
+		err = -EAFNOSUPPORT;
+		goto out;
+#endif
+
+	default:
+		goto out;
+	}
+
+	switch (p->sel.family) {
+	case AF_UNSPEC:
+		break;
+
+	case AF_INET:
 		if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
 			goto out;