diff options
Diffstat (limited to 'net')
305 files changed, 13195 insertions, 4621 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 53cf446ce2e3..01853cec0209 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -18,24 +18,16 @@ extern const struct ndisc_ops lowpan_ndisc_ops; int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); #ifdef CONFIG_6LOWPAN_DEBUGFS -int lowpan_dev_debugfs_init(struct net_device *dev); +void lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); -int __init lowpan_debugfs_init(void); +void __init lowpan_debugfs_init(void); void lowpan_debugfs_exit(void); #else -static inline int lowpan_dev_debugfs_init(struct net_device *dev) -{ - return 0; -} - +static inline void lowpan_dev_debugfs_init(struct net_device *dev) { } static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { } -static inline int __init lowpan_debugfs_init(void) -{ - return 0; -} - +static inline void __init lowpan_debugfs_init(void) { } static inline void lowpan_debugfs_exit(void) { } #endif /* CONFIG_6LOWPAN_DEBUGFS */ diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 2d68351f1ac4..a068757eabaf 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -42,9 +42,7 @@ int lowpan_register_netdevice(struct net_device *dev, if (ret < 0) return ret; - ret = lowpan_dev_debugfs_init(dev); - if (ret < 0) - unregister_netdevice(dev); + lowpan_dev_debugfs_init(dev); return ret; } @@ -152,9 +150,7 @@ static int __init lowpan_module_init(void) { int ret; - ret = lowpan_debugfs_init(); - if (ret < 0) - return ret; + lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index f5a8eec9d7a3..1c140af06d52 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -163,11 +163,11 @@ static const struct file_operations lowpan_ctx_pfx_fops = { .release = single_release, }; -static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, - struct dentry *ctx, u8 id) +static void lowpan_dev_debugfs_ctx_init(struct net_device *dev, + struct dentry *ctx, u8 id) { struct lowpan_dev *ldev = lowpan_dev(dev); - struct dentry *dentry, *root; + struct dentry *root; char buf[32]; WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); @@ -175,34 +175,18 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, sprintf(buf, "%d", id); root = debugfs_create_dir(buf, ctx); - if (!root) - return -EINVAL; - dentry = debugfs_create_file_unsafe("active", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_flag_active_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("active", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_flag_active_fops); - dentry = debugfs_create_file_unsafe("compression", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_flag_c_fops); - if (!dentry) - return -EINVAL; - - dentry = debugfs_create_file("prefix", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_pfx_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("compression", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_flag_c_fops); - dentry = debugfs_create_file_unsafe("prefix_len", 0644, root, - &ldev->ctx.table[id], - &lowpan_ctx_plen_fops); - if (!dentry) - return -EINVAL; + debugfs_create_file("prefix", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_pfx_fops); - return 0; + debugfs_create_file("prefix_len", 0644, root, &ldev->ctx.table[id], + &lowpan_ctx_plen_fops); } static int lowpan_context_show(struct seq_file *file, void *offset) @@ -242,64 +226,39 @@ static int lowpan_short_addr_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, NULL, "0x%04llx\n"); -static int lowpan_dev_debugfs_802154_init(const struct net_device *dev, +static void lowpan_dev_debugfs_802154_init(const struct net_device *dev, struct lowpan_dev *ldev) { - struct dentry *dentry, *root; + struct dentry *root; if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) - return 0; + return; root = debugfs_create_dir("ieee802154", ldev->iface_debugfs); - if (!root) - return -EINVAL; - - dentry = debugfs_create_file_unsafe("short_addr", 0444, root, - lowpan_802154_dev(dev)->wdev->ieee802154_ptr, - &lowpan_short_addr_fops); - if (!dentry) - return -EINVAL; - return 0; + debugfs_create_file("short_addr", 0444, root, + lowpan_802154_dev(dev)->wdev->ieee802154_ptr, + &lowpan_short_addr_fops); } -int lowpan_dev_debugfs_init(struct net_device *dev) +void lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_dev *ldev = lowpan_dev(dev); - struct dentry *contexts, *dentry; - int ret, i; + struct dentry *contexts; + int i; /* creating the root */ ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); - if (!ldev->iface_debugfs) - goto fail; contexts = debugfs_create_dir("contexts", ldev->iface_debugfs); - if (!contexts) - goto remove_root; - - dentry = debugfs_create_file("show", 0644, contexts, - &lowpan_dev(dev)->ctx, - &lowpan_context_fops); - if (!dentry) - goto remove_root; - - for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) { - ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i); - if (ret < 0) - goto remove_root; - } - ret = lowpan_dev_debugfs_802154_init(dev, ldev); - if (ret < 0) - goto remove_root; + debugfs_create_file("show", 0644, contexts, &lowpan_dev(dev)->ctx, + &lowpan_context_fops); - return 0; + for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) + lowpan_dev_debugfs_ctx_init(dev, contexts, i); -remove_root: - lowpan_dev_debugfs_exit(dev); -fail: - return -EINVAL; + lowpan_dev_debugfs_802154_init(dev, ldev); } void lowpan_dev_debugfs_exit(struct net_device *dev) @@ -307,13 +266,9 @@ void lowpan_dev_debugfs_exit(struct net_device *dev) debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs); } -int __init lowpan_debugfs_init(void) +void __init lowpan_debugfs_init(void) { lowpan_debugfs = debugfs_create_dir("6lowpan", NULL); - if (!lowpan_debugfs) - return -EINVAL; - - return 0; } void lowpan_debugfs_exit(void) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a0b2d8b9def7..93eadf179123 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -580,6 +580,7 @@ static int vlan_dev_init(struct net_device *dev) dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; dev->hw_enc_features = vlan_tnl_features(real_dev); + dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; diff --git a/net/Kconfig b/net/Kconfig index d122f53c6fa2..57f51a279ad6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -67,8 +67,6 @@ source "net/xdp/Kconfig" config INET bool "TCP/IP networking" - select CRYPTO - select CRYPTO_AES ---help--- These are the protocols used on the Internet and on most local Ethernets. It is highly recommended to say Y here (this will enlarge diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index cb7d57d16c9d..37898da8ad48 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct seq_file; -struct sk_buff; - extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 231b4aab4d8d..22672cb3e25d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -21,6 +21,7 @@ #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> @@ -41,8 +42,6 @@ #include "netlink.h" #include "originator.h" -struct sk_buff; - static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index bb3d40f73bfe..1a29505f4f66 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -9,8 +9,8 @@ #include "main.h" -struct sk_buff; -struct work_struct; +#include <linux/skbuff.h> +#include <linux/workqueue.h> int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 616bf2ea8755..2a50df7fc2bf 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - int batadv_v_ogm_init(struct batadv_priv *bat_priv); void batadv_v_ogm_free(struct batadv_priv *bat_priv); int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 012d72c8d064..02b24a861a85 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -10,14 +10,13 @@ #include "main.h" #include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> -struct net_device; -struct netlink_callback; -struct seq_file; -struct sk_buff; - /** * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop * detect frame sent by bridge loop avoidance diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index d38d70ccdd5a..38c4d8e51155 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -10,7 +10,6 @@ #include <asm/current.h> #include <linux/dcache.h> #include <linux/debugfs.h> -#include <linux/err.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fs.h> @@ -293,31 +292,13 @@ static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { void batadv_debugfs_init(void) { struct batadv_debuginfo **bat_debug; - struct dentry *file; batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL); - if (batadv_debugfs == ERR_PTR(-ENODEV)) - batadv_debugfs = NULL; - - if (!batadv_debugfs) - goto err; - - for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - batadv_debugfs, NULL, - &(*bat_debug)->fops); - if (!file) { - pr_err("Can't add general debugfs file: %s\n", - ((*bat_debug)->attr).name); - goto err; - } - } - return; -err: - debugfs_remove_recursive(batadv_debugfs); - batadv_debugfs = NULL; + for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + batadv_debugfs, NULL, &(*bat_debug)->fops); } /** @@ -333,42 +314,23 @@ void batadv_debugfs_destroy(void) * batadv_debugfs_add_hardif() - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. - * - * Return: 0 on success or negative error number in case of failure */ -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { struct net *net = dev_net(hard_iface->net_dev); struct batadv_debuginfo **bat_debug; - struct dentry *file; - - if (!batadv_debugfs) - goto out; if (net != &init_net) - return 0; + return; hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, batadv_debugfs); - if (!hard_iface->debug_dir) - goto out; - - for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - hard_iface->debug_dir, - hard_iface->net_dev, - &(*bat_debug)->fops); - if (!file) - goto rem_attr; - } - return 0; -rem_attr: - debugfs_remove_recursive(hard_iface->debug_dir); - hard_iface->debug_dir = NULL; -out: - return -ENOMEM; + for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + hard_iface->debug_dir, hard_iface->net_dev, + &(*bat_debug)->fops); } /** @@ -379,15 +341,12 @@ void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) { const char *name = hard_iface->net_dev->name; struct dentry *dir; - struct dentry *d; dir = hard_iface->debug_dir; if (!dir) return; - d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); - if (!d) - pr_err("Can't rename debugfs dir to %s\n", name); + debugfs_rename(dir->d_parent, dir, dir->d_parent, name); } /** @@ -419,44 +378,29 @@ int batadv_debugfs_add_meshif(struct net_device *dev) struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_debuginfo **bat_debug; struct net *net = dev_net(dev); - struct dentry *file; - - if (!batadv_debugfs) - goto out; if (net != &init_net) return 0; bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); - if (!bat_priv->debug_dir) - goto out; - if (batadv_socket_setup(bat_priv) < 0) - goto rem_attr; + batadv_socket_setup(bat_priv); if (batadv_debug_log_setup(bat_priv) < 0) goto rem_attr; - for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) { - file = debugfs_create_file(((*bat_debug)->attr).name, - S_IFREG | ((*bat_debug)->attr).mode, - bat_priv->debug_dir, - dev, &(*bat_debug)->fops); - if (!file) { - batadv_err(dev, "Can't add debugfs file: %s/%s\n", - dev->name, ((*bat_debug)->attr).name); - goto rem_attr; - } - } + for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) + debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + bat_priv->debug_dir, dev, + &(*bat_debug)->fops); - if (batadv_nc_init_debugfs(bat_priv) < 0) - goto rem_attr; + batadv_nc_init_debugfs(bat_priv); return 0; rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); bat_priv->debug_dir = NULL; -out: return -ENOMEM; } @@ -469,15 +413,12 @@ void batadv_debugfs_rename_meshif(struct net_device *dev) struct batadv_priv *bat_priv = netdev_priv(dev); const char *name = dev->name; struct dentry *dir; - struct dentry *d; dir = bat_priv->debug_dir; if (!dir) return; - d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); - if (!d) - pr_err("Can't rename debugfs dir to %s\n", name); + debugfs_rename(dir->d_parent, dir, dir->d_parent, name); } /** diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 7fac680cf740..1c5afd301ce9 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -9,8 +9,8 @@ #include "main.h" -struct file; -struct net_device; +#include <linux/fs.h> +#include <linux/netdevice.h> #define BATADV_DEBUGFS_SUBDIR "batman_adv" @@ -22,7 +22,7 @@ void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); void batadv_debugfs_rename_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); @@ -54,9 +54,8 @@ static inline void batadv_debugfs_del_meshif(struct net_device *dev) } static inline -int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +void batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { - return 0; } static inline diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 110c27447d70..67c7729add55 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -11,15 +11,14 @@ #include <linux/compiler.h> #include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "originator.h" -struct netlink_callback; -struct seq_file; -struct sk_buff; - #ifdef CONFIG_BATMAN_ADV_DAT /* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index d6074ba2ada7..abfe8c6556de 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -11,11 +11,10 @@ #include <linux/compiler.h> #include <linux/list.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> -struct sk_buff; - void batadv_frag_purge_orig(struct batadv_orig_node *orig, bool (*check_cb)(struct batadv_frag_table_entry *)); bool batadv_frag_skb_fwd(struct sk_buff *skb, diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 0e14026feebd..0be8e7178ec7 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> - -struct batadv_tvlv_gateway_data; -struct netlink_callback; -struct seq_file; -struct sk_buff; +#include <uapi/linux/batadv_packet.h> void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index dac097f9be03..fc55750542e4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -11,6 +11,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/limits.h> #include <linux/math64.h> #include <linux/netdevice.h> #include <linux/stddef.h> diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 5cf50736c635..211b14b37db8 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/netdevice.h> #include <linux/types.h> -struct net_device; - /** * enum batadv_bandwidth_units - bandwidth unit types */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3719cfd026f0..c90e47342bb0 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -16,6 +16,7 @@ #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -923,9 +924,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; - ret = batadv_debugfs_add_hardif(hard_iface); - if (ret) - goto free_sysfs; + batadv_debugfs_add_hardif(hard_iface); INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); @@ -947,8 +946,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) return hard_iface; -free_sysfs: - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); free_if: kfree(hard_iface); release_dev: diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index c8ef6aa0e865..bbb8a6f18d6b 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -11,13 +11,12 @@ #include <linux/compiler.h> #include <linux/kref.h> +#include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> - -struct net_device; -struct net; +#include <net/net_namespace.h> /** * enum batadv_hard_if_state - State of a hard interface diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index ceef171f7f98..57877f0b78e0 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -12,13 +12,12 @@ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> -struct lock_class_key; - /* callback to a compare function. should compare 2 element datas for their * keys * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0a91c8661357..0a70b66e8770 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -314,25 +314,11 @@ static const struct file_operations batadv_fops = { /** * batadv_socket_setup() - Create debugfs "socket" file * @bat_priv: the bat priv with all the soft interface information - * - * Return: 0 on success or negative error number in case of failure */ -int batadv_socket_setup(struct batadv_priv *bat_priv) +void batadv_socket_setup(struct batadv_priv *bat_priv) { - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - - d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir, - bat_priv, &batadv_fops); - if (!d) - goto err; - - return 0; - -err: - return -ENOMEM; + debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir, + bat_priv, &batadv_fops); } /** diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 35eecbfd2e65..27fafff586df 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -10,12 +10,11 @@ #include "main.h" #include <linux/types.h> - -struct batadv_icmp_header; +#include <uapi/linux/batadv_packet.h> #define BATADV_ICMP_SOCKET "socket" -int batadv_socket_setup(struct batadv_priv *bat_priv); +void batadv_socket_setup(struct batadv_priv *bat_priv); #ifdef CONFIG_BATMAN_ADV_DEBUGFS diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index f79ebd5b46e9..11941cf1adcc 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -190,27 +190,16 @@ static const struct file_operations batadv_log_fops = { */ int batadv_debug_log_setup(struct batadv_priv *bat_priv) { - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); if (!bat_priv->debug_log) - goto err; + return -ENOMEM; spin_lock_init(&bat_priv->debug_log->lock); init_waitqueue_head(&bat_priv->debug_log->queue_wait); - d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv, - &batadv_log_fops); - if (!d) - goto err; - + debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv, + &batadv_log_fops); return 0; - -err: - return -ENOMEM; } /** diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 5504637e63d8..741cfa3719ff 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/printk.h> diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c59afcba31e0..3d4c04d87ff3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.2" +#define BATADV_SOURCE_VERSION "2019.3" #endif /* B.A.T.M.A.N. parameters */ @@ -205,20 +205,20 @@ enum batadv_uev_type { /* Kernel headers */ +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> +#include <linux/netdevice.h> #include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "types.h" - -struct net_device; -struct packet_type; -struct seq_file; -struct sk_buff; +#include "main.h" /** * batadv_print_vid() - return printable version of vid information diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index ec54e236e345..67d7f83009ae 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -20,6 +20,7 @@ #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> @@ -98,69 +99,307 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) } /** - * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4 - * @addr: the MAC address to check + * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from + * node for IPv4 + * @dev: the interface to check * - * Return: True, if MAC address is one reserved for IPv4 multicast, false - * otherwise. + * Checks the presence of an IPv4 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ -static bool batadv_mcast_addr_is_ipv4(const u8 *addr) +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) { - static const u8 prefix[] = {0x01, 0x00, 0x5E}; + struct in_device *in_dev = __in_dev_get_rcu(dev); - return memcmp(prefix, addr, sizeof(prefix)) == 0; + if (in_dev && IN_DEV_MFORWARD(in_dev)) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4; } /** - * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6 - * @addr: the MAC address to check + * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from + * node for IPv6 + * @dev: the interface to check * - * Return: True, if MAC address is one reserved for IPv6 multicast, false - * otherwise. + * Checks the presence of an IPv6 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. */ -static bool batadv_mcast_addr_is_ipv6(const u8 *addr) +#if IS_ENABLED(CONFIG_IPV6_MROUTE) +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { - static const u8 prefix[] = {0x33, 0x33}; + struct inet6_dev *in6_dev = __in6_dev_get(dev); - return memcmp(prefix, addr, sizeof(prefix)) == 0; + if (in6_dev && in6_dev->cnf.mc_forwarding) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR6; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +{ + return BATADV_MCAST_WANT_NO_RTR6; } +#endif /** - * batadv_mcast_mla_softif_get() - get softif multicast listeners + * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers on this + * node. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; + u8 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); + + rcu_read_unlock(); + + return flags; +} + +/** + * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +#if IS_ENABLED(CONFIG_IPV6) +static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + struct net_device *dev = bat_priv->soft_iface; + struct br_ip_list *br_ip_entry, *tmp; + u8 flags = BATADV_MCAST_WANT_NO_RTR6; + int ret; + + if (!bridge) + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + /* TODO: ask the bridge if a multicast router is present (the bridge + * is capable of performing proper RFC4286 multicast multicast router + * discovery) instead of searching for a ff02::2 listener here + */ + ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); + if (ret < 0) + return BATADV_NO_FLAGS; + + list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { + /* the bridge snooping does not maintain IPv4 link-local + * addresses - therefore we won't find any IPv4 multicast router + * address here, only IPv6 ones + */ + if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && + ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6)) + flags &= ~BATADV_MCAST_WANT_NO_RTR6; + + list_del(&br_ip_entry->list); + kfree(br_ip_entry); + } + + return flags; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + if (bridge) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; +} +#endif + +/** + * batadv_mcast_mla_rtr_flags_get() - get multicast router flags + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers on this + * node or behind its bridge. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); + flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); + + return flags; +} + +/** + * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information + * + * Return: A set of flags for the current/next TVLV, querier and + * bridge state. + */ +static struct batadv_mcast_mla_flags +batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) +{ + struct net_device *dev = bat_priv->soft_iface; + struct batadv_mcast_querier_state *qr4, *qr6; + struct batadv_mcast_mla_flags mla_flags; + struct net_device *bridge; + + bridge = batadv_mcast_get_bridge(dev); + + memset(&mla_flags, 0, sizeof(mla_flags)); + mla_flags.enabled = 1; + mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, + bridge); + + if (!bridge) + return mla_flags; + + dev_put(bridge); + + mla_flags.bridged = 1; + qr4 = &mla_flags.querier_ipv4; + qr6 = &mla_flags.querier_ipv6; + + if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) + pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); + + qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); + qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); + + qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); + qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); + + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; + + /* 1) If no querier exists at all, then multicast listeners on + * our local TT clients behind the bridge will keep silent. + * 2) If the selected querier is on one of our local TT clients, + * behind the bridge, then this querier might shadow multicast + * listeners on our local TT clients, behind this bridge. + * + * In both cases, we will signalize other batman nodes that + * we need all multicast traffic of the according protocol. + */ + if (!qr4->exists || qr4->shadowing) { + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; + } + + if (!qr6->exists || qr6->shadowing) { + mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; + } + + return mla_flags; +} + +/** + * batadv_mcast_mla_is_duplicate() - check whether an address is in a list + * @mcast_addr: the multicast address to check + * @mcast_list: the list with multicast addresses to search in + * + * Return: true if the given address is already in the given list. + * Otherwise returns false. + */ +static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, + struct hlist_head *mcast_list) +{ + struct batadv_hw_addr *mcast_entry; + + hlist_for_each_entry(mcast_entry, mcast_list, list) + if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) + return true; + + return false; +} + +/** + * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * - * Collects multicast addresses of multicast listeners residing + * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one - * instead. Just like with IP addresses and routes, multicast listeners - * will(/should) register to the bridge interface instead of an - * enslaved bat0. - * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, - struct net_device *dev, - struct hlist_head *mcast_list) +static int +batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { - bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; - bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; - struct net_device *bridge = batadv_mcast_get_bridge(dev); - struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; + struct in_device *in_dev; + u8 mcast_addr[ETH_ALEN]; + struct ip_mc_list *pmc; int ret = 0; - netif_addr_lock_bh(bridge ? bridge : dev); - netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { - if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr)) + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) + return 0; + + rcu_read_lock(); + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); + return 0; + } + + for (pmc = rcu_dereference(in_dev->mc_list); pmc; + pmc = rcu_dereference(pmc->next_rcu)) { + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv4_is_local_multicast(pmc->multiaddr)) + continue; + + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(pmc->multiaddr)) continue; - if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr)) + ip_eth_mc_map(pmc->multiaddr, mcast_addr); + + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); @@ -169,36 +408,142 @@ static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, break; } - ether_addr_copy(new->addr, mc_list_entry->addr); + ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } - netif_addr_unlock_bh(bridge ? bridge : dev); + rcu_read_unlock(); - if (bridge) - dev_put(bridge); + return ret; +} + +/** + * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners + * @dev: the device to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state + * + * Collects multicast addresses of IPv6 multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. + */ +#if IS_ENABLED(CONFIG_IPV6) +static int +batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) +{ + struct batadv_hw_addr *new; + struct inet6_dev *in6_dev; + u8 mcast_addr[ETH_ALEN]; + struct ifmcaddr6 *pmc6; + int ret = 0; + + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) + return 0; + + rcu_read_lock(); + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) { + rcu_read_unlock(); + return 0; + } + + read_lock_bh(&in6_dev->lock); + for (pmc6 = in6_dev->mc_list; pmc6; pmc6 = pmc6->next) { + if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + + if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) + continue; + + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + + ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); + + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) + continue; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + ret = -ENOMEM; + break; + } + + ether_addr_copy(new->addr, mcast_addr); + hlist_add_head(&new->list, mcast_list); + ret++; + } + read_unlock_bh(&in6_dev->lock); + rcu_read_unlock(); return ret; } +#else +static inline int +batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) +{ + return 0; +} +#endif /** - * batadv_mcast_mla_is_duplicate() - check whether an address is in a list - * @mcast_addr: the multicast address to check - * @mcast_list: the list with multicast addresses to search in + * batadv_mcast_mla_softif_get() - get softif multicast listeners + * @dev: the device to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * - * Return: true if the given address is already in the given list. - * Otherwise returns false. + * Collects multicast addresses of multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * If there is a bridge interface on top of dev, collects from that one + * instead. Just like with IP addresses and routes, multicast listeners + * will(/should) register to the bridge interface instead of an + * enslaved bat0. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. */ -static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, - struct hlist_head *mcast_list) +static int +batadv_mcast_mla_softif_get(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { - struct batadv_hw_addr *mcast_entry; + struct net_device *bridge = batadv_mcast_get_bridge(dev); + int ret4, ret6 = 0; - hlist_for_each_entry(mcast_entry, mcast_list, list) - if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) - return true; + if (bridge) + dev = bridge; - return false; + ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags); + if (ret4 < 0) + goto out; + + ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags); + if (ret6 < 0) { + ret4 = 0; + goto out; + } + +out: + if (bridge) + dev_put(bridge); + + return ret4 + ret6; } /** @@ -227,9 +572,9 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners - * @bat_priv: the bat priv with all the soft interface information * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into + * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via @@ -239,14 +584,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, - struct net_device *dev, - struct hlist_head *mcast_list) +static int batadv_mcast_mla_bridge_get(struct net_device *dev, + struct hlist_head *mcast_list, + struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); - bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; - bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct br_ip_list *br_ip_entry, *tmp; + u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; @@ -259,11 +603,34 @@ static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { - if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP)) - continue; + if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { + if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) + continue; - if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6)) - continue; + if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + continue; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { + if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) + continue; + + if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && + ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6)) + continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + } +#endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) @@ -370,27 +737,6 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, } /** - * batadv_mcast_has_bridge() - check whether the soft-iface is bridged - * @bat_priv: the bat priv with all the soft interface information - * - * Checks whether there is a bridge on top of our soft interface. - * - * Return: true if there is a bridge, false otherwise. - */ -static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) -{ - struct net_device *upper = bat_priv->soft_iface; - - rcu_read_lock(); - do { - upper = netdev_master_upper_dev_get_rcu(upper); - } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); - rcu_read_unlock(); - - return upper; -} - -/** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the soft interface information @@ -424,7 +770,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, batadv_info(bat_priv->soft_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); - else if (!bat_priv->mcast.bridged && !new_state->exists) + else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->soft_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); @@ -446,9 +792,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the soft interface information - * @bridged: a flag about whether the soft interface is currently bridged or not - * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier - * @querier_ipv6: (maybe) new status of a potential, selected MLD querier + * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * @@ -461,126 +805,86 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, * multicast flags this node is going to set. */ static void -batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, - struct batadv_mcast_querier_state *querier_ipv4, - struct batadv_mcast_querier_state *querier_ipv6) +batadv_mcast_bridge_log(struct batadv_priv *bat_priv, + struct batadv_mcast_mla_flags *new_flags) { - if (!bat_priv->mcast.bridged && bridged) + struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; + + if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); - else if (bat_priv->mcast.bridged && !bridged) + else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); - if (bridged) { + if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", - &bat_priv->mcast.querier_ipv4, - querier_ipv4); + &old_flags->querier_ipv4, + &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", - &bat_priv->mcast.querier_ipv6, - querier_ipv6); + &old_flags->querier_ipv6, + &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_logs() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information - * @flags: flags indicating the new multicast state + * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast flags this nodes announces changes (@mcast_flags vs. - * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this nodes announces change this notifies + * userspace via the 'mcast' log level. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { - u8 old_flags = bat_priv->mcast.flags; - char str_old_flags[] = "[...]"; + bool old_enabled = bat_priv->mcast.mla_flags.enabled; + u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; + char str_old_flags[] = "[.... . ]"; - sprintf(str_old_flags, "[%c%c%c]", + sprintf(str_old_flags, "[%c%c%c%s%s]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); batadv_dbg(BATADV_DBG_MCAST, bat_priv, - "Changing multicast flags from '%s' to '[%c%c%c]'\n", - bat_priv->mcast.enabled ? str_old_flags : "<undefined>", + "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n", + old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); } /** - * batadv_mcast_mla_tvlv_update() - update multicast tvlv + * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the soft interface information + * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. - * - * Return: false if we want all IPv4 && IPv6 multicast traffic and true - * otherwise. */ -static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) +static void +batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, + struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; - struct batadv_mcast_querier_state querier4 = {false, false}; - struct batadv_mcast_querier_state querier6 = {false, false}; - struct net_device *dev = bat_priv->soft_iface; - bool bridged; - - mcast_data.flags = BATADV_NO_FLAGS; - memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - - bridged = batadv_mcast_has_bridge(bat_priv); - if (!bridged) - goto update; - - if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) - pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); - - querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); - querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); - querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); - querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); - - mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; - - /* 1) If no querier exists at all, then multicast listeners on - * our local TT clients behind the bridge will keep silent. - * 2) If the selected querier is on one of our local TT clients, - * behind the bridge, then this querier might shadow multicast - * listeners on our local TT clients, behind this bridge. - * - * In both cases, we will signalize other batman nodes that - * we need all multicast traffic of the according protocol. - */ - if (!querier4.exists || querier4.shadowing) - mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4; - - if (!querier6.exists || querier6.shadowing) - mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; - -update: - batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6); - - bat_priv->mcast.querier_ipv4.exists = querier4.exists; - bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing; + if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) + return; - bat_priv->mcast.querier_ipv6.exists = querier6.exists; - bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing; + batadv_mcast_bridge_log(bat_priv, flags); + batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); - bat_priv->mcast.bridged = bridged; + mcast_data.flags = flags->tvlv_flags; + memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - if (!bat_priv->mcast.enabled || - mcast_data.flags != bat_priv->mcast.flags) { - batadv_mcast_flags_log(bat_priv, mcast_data.flags); - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, - &mcast_data, sizeof(mcast_data)); - bat_priv->mcast.flags = mcast_data.flags; - bat_priv->mcast.enabled = true; - } + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, + &mcast_data, sizeof(mcast_data)); - return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 && - mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV6); + bat_priv->mcast.mla_flags = *flags; } /** @@ -599,22 +903,24 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *soft_iface = bat_priv->soft_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; + struct batadv_mcast_mla_flags flags; int ret; - if (!batadv_mcast_mla_tvlv_update(bat_priv)) - goto update; + flags = batadv_mcast_mla_flags_get(bat_priv); - ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list); + ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; - ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list); + ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; -update: + spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); + batadv_mcast_mla_flags_update(bat_priv, &flags); + spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); @@ -639,10 +945,7 @@ static void batadv_mcast_mla_update(struct work_struct *work) priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); - spin_lock(&bat_priv->mcast.mla_lock); __batadv_mcast_mla_update(bat_priv); - spin_unlock(&bat_priv->mcast.mla_lock); - batadv_mcast_start_timer(bat_priv); } @@ -677,6 +980,7 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. @@ -686,7 +990,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct iphdr *iphdr; @@ -699,16 +1004,13 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, iphdr = ip_hdr(skb); - /* TODO: Implement Multicast Router Discovery (RFC4286), - * then allow scope > link local, too - */ - if (!ipv4_is_local_multicast(iphdr->daddr)) - return -EINVAL; - /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ - *is_unsnoopable = true; + if (ipv4_is_local_multicast(iphdr->daddr)) + *is_unsnoopable = true; + else + *is_routable = ETH_P_IP; return 0; } @@ -743,6 +1045,7 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. @@ -751,7 +1054,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct ipv6hdr *ip6hdr; @@ -764,10 +1068,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, ip6hdr = ipv6_hdr(skb); - /* TODO: Implement Multicast Router Discovery (RFC4286), - * then allow scope > link local, too - */ - if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL) + if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are @@ -775,6 +1076,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; + else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) + *is_routable = ETH_P_IPV6; return 0; } @@ -784,6 +1087,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable + * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. @@ -792,7 +1096,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); @@ -802,13 +1107,15 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, - is_unsnoopable); + is_unsnoopable, + is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, - is_unsnoopable); + is_unsnoopable, + is_routable); default: return -EINVAL; } @@ -839,6 +1146,29 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, } /** + * batadv_mcast_forw_rtr_count() - count nodes with a multicast router + * @bat_priv: the bat priv with all the soft interface information + * @protocol: the ethernet protocol type to count multicast routers for + * + * Return: the number of nodes which want all routable IPv4 multicast traffic + * if the protocol is ETH_P_IP or the number of nodes which want all routable + * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. + */ + +static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, + int protocol) +{ + switch (protocol) { + case ETH_P_IP: + return atomic_read(&bat_priv->mcast.num_want_all_rtr4); + case ETH_P_IPV6: + return atomic_read(&bat_priv->mcast.num_want_all_rtr6); + default: + return 0; + } +} + +/** * batadv_mcast_forw_tt_node_get() - get a multicast tt node * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination @@ -960,6 +1290,84 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) } /** + * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag + * @bat_priv: the bat priv with all the soft interface information + * + * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and + * increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_rtr4_list, + mcast_want_all_rtr4_node) { + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag + * @bat_priv: the bat priv with all the soft interface information + * + * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset + * and increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv) +{ + struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_orig_node, + &bat_priv->mcast.want_all_rtr6_list, + mcast_want_all_rtr6_node) { + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) + continue; + + orig_node = tmp_orig_node; + break; + } + rcu_read_unlock(); + + return orig_node; +} + +/** + * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag + * @bat_priv: the bat priv with all the soft interface information + * @ethhdr: an ethernet header to determine the protocol family from + * + * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or + * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and + * increases its refcount. + */ +static struct batadv_orig_node * +batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr) +{ + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_rtr4_node_get(bat_priv); + case ETH_P_IPV6: + return batadv_mcast_forw_rtr6_node_get(bat_priv); + default: + /* we shouldn't be here... */ + return NULL; + } +} + +/** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to check @@ -977,8 +1385,11 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, bool is_unsnoopable = false; unsigned int mcast_fanout; struct ethhdr *ethhdr; + int is_routable = 0; + int rtr_count = 0; - ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); + ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, + &is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) @@ -991,8 +1402,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); + rtr_count = batadv_mcast_forw_rtr_count(bat_priv, is_routable); - total_count = tt_count + ip_count + unsnoop_count; + total_count = tt_count + ip_count + unsnoop_count + rtr_count; switch (total_count) { case 1: @@ -1002,6 +1414,9 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); else if (unsnoop_count) *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); + else if (rtr_count) + *orig = batadv_mcast_forw_rtr_node_get(bat_priv, + ethhdr); if (*orig) return BATADV_FORW_SINGLE; @@ -1173,6 +1588,111 @@ batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, } /** + * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_rtr4_list, + mcast_want_all_rtr4_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 + * @bat_priv: the bat priv with all the soft interface information + * @skb: The multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_rtr6_list, + mcast_want_all_rtr6_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A + * transmission is performed via a batman-adv unicast packet for each such + * destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. + */ +static int +batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + switch (ntohs(eth_hdr(skb)->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); + case ETH_P_IPV6: + return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); + default: + /* we shouldn't be here... */ + return NET_XMIT_DROP; + } +} + +/** * batadv_mcast_forw_send() - send packet to any detected multicast recpient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit @@ -1205,6 +1725,12 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, return ret; } + ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + consume_skb(skb); return ret; } @@ -1345,6 +1871,127 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } /** + * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. + */ +static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr4_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. + */ +static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr6_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV + * @enabled: whether the originator has multicast TVLV support enabled + * @tvlv_value: tvlv buffer containing the multicast flags + * @tvlv_value_len: tvlv buffer length + * + * Return: multicast flags for the given tvlv buffer + */ +static u8 +batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) +{ + u8 mcast_flags = BATADV_NO_FLAGS; + + if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) + mcast_flags = *(u8 *)tvlv_value; + + if (!enabled) { + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; + } + + /* remove redundant flags to avoid sending duplicate packets later */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; + + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; + + return mcast_flags; +} + +/** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm @@ -1359,16 +2006,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - u8 mcast_flags = BATADV_NO_FLAGS; + u8 mcast_flags; - if (orig_mcast_enabled && tvlv_value && - tvlv_value_len >= sizeof(mcast_flags)) - mcast_flags = *(u8 *)tvlv_value; - - if (!orig_mcast_enabled) { - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; - } + mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, + tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); @@ -1385,6 +2026,8 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); @@ -1417,15 +2060,16 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, struct seq_file *seq) { - u8 flags = bat_priv->mcast.flags; + struct batadv_mcast_mla_flags *mla_flags = &bat_priv->mcast.mla_flags; char querier4, querier6, shadowing4, shadowing6; - bool bridged = bat_priv->mcast.bridged; + bool bridged = mla_flags->bridged; + u8 flags = mla_flags->tvlv_flags; if (bridged) { - querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4'; - querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6'; - shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.'; - shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.'; + querier4 = mla_flags->querier_ipv4.exists ? '.' : '4'; + querier6 = mla_flags->querier_ipv6.exists ? '.' : '6'; + shadowing4 = mla_flags->querier_ipv4.shadowing ? '4' : '.'; + shadowing6 = mla_flags->querier_ipv6.shadowing ? '6' : '.'; } else { querier4 = '?'; querier6 = '?'; @@ -1433,10 +2077,12 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, shadowing6 = '?'; } - seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", + seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", querier4, querier6); @@ -1490,13 +2136,17 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) flags = orig_node->mcast_flags; - seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, + seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig, (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) - ? '6' : '.'); + ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) + ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) + ? "R6" : ". "); } rcu_read_unlock(); } @@ -1517,19 +2167,19 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { - u32 flags = bat_priv->mcast.flags; + u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; - if (bat_priv->mcast.bridged) { + if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; - if (bat_priv->mcast.querier_ipv4.exists) + if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; - if (bat_priv->mcast.querier_ipv6.exists) + if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; - if (bat_priv->mcast.querier_ipv4.shadowing) + if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; - if (bat_priv->mcast.querier_ipv6.shadowing) + if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } @@ -1770,6 +2420,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS); spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 653b9b76fabe..5d9e2bb29c97 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -9,9 +9,9 @@ #include "main.h" -struct netlink_callback; -struct seq_file; -struct sk_buff; +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> /** * enum batadv_forw_mode - the way a packet should be forwarded as diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a67720fad46c..6f08fd122a8d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -21,6 +21,7 @@ #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> @@ -30,6 +31,7 @@ #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> +#include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> @@ -49,8 +51,6 @@ #include "tp_meter.h" #include "translation-table.h" -struct net; - struct genl_family batadv_netlink_family; /* multicast groups */ diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index d1e0681b8743..ddc674e47dbb 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -9,11 +9,10 @@ #include "main.h" +#include <linux/netlink.h> #include <linux/types.h> #include <net/genetlink.h> -struct nlmsghdr; - void batadv_netlink_register(void); void batadv_netlink_unregister(void); int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c5e7906045f3..580609389f0f 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1951,34 +1951,19 @@ out: /** * batadv_nc_init_debugfs() - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information - * - * Return: 0 on success or negative error number in case of failure */ -int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { - struct dentry *nc_dir, *file; + struct dentry *nc_dir; nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir); - if (!nc_dir) - goto out; - file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq); - if (!file) - goto out; + debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq); - file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir, - &bat_priv->nc.max_fwd_delay); - if (!file) - goto out; + debugfs_create_u32("max_fwd_delay", 0644, nc_dir, + &bat_priv->nc.max_fwd_delay); - file = debugfs_create_u32("max_buffer_time", 0644, nc_dir, - &bat_priv->nc.max_buffer_time); - if (!file) - goto out; - - return 0; - -out: - return -ENOMEM; + debugfs_create_u32("max_buffer_time", 0644, nc_dir, + &bat_priv->nc.max_buffer_time); } #endif diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 74f56113a5d0..753fa49723cf 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> - -struct batadv_ogm_packet; -struct net_device; -struct seq_file; -struct sk_buff; +#include <uapi/linux/batadv_packet.h> #ifdef CONFIG_BATMAN_ADV_NC @@ -40,7 +39,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb); int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset); -int batadv_nc_init_debugfs(struct batadv_priv *bat_priv); +void batadv_nc_init_debugfs(struct batadv_priv *bat_priv); #else /* ifdef CONFIG_BATMAN_ADV_NC */ @@ -111,9 +110,8 @@ static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq, return 0; } -static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +static inline void batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { - return 0; } #endif /* ifdef CONFIG_BATMAN_ADV_NC */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 45db798a7297..38613487fb1b 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -27,6 +27,7 @@ #include <linux/stddef.h> #include <linux/workqueue.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -1043,7 +1044,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, orig_node->bcast_seqno_reset = reset_time; #ifdef CONFIG_BATMAN_ADV_MCAST - orig_node->mcast_flags = BATADV_NO_FLAGS; + orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; + orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3829e26f9c5d..512a1f99dd75 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -12,12 +12,11 @@ #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct seq_file; -struct sk_buff; - bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index b96c6d06d188..c20feac95107 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 5921ee4e107c..5fc0fd1e5d08 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -10,12 +10,11 @@ #include "main.h" #include <linux/compiler.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> -struct sk_buff; - void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, bool dropped); struct batadv_forw_packet * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a7677e1d000f..c7a2e77ca1da 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -24,6 +24,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/percpu.h> #include <linux/printk.h> #include <linux/random.h> @@ -803,11 +804,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST - bat_priv->mcast.querier_ipv4.exists = false; - bat_priv->mcast.querier_ipv4.shadowing = false; - bat_priv->mcast.querier_ipv6.exists = false; - bat_priv->mcast.querier_ipv6.shadowing = false; - bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 275442a7acb6..29139ad769fe 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -9,13 +9,12 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/types.h> +#include <net/net_namespace.h> #include <net/rtnetlink.h> -struct net_device; -struct net; -struct sk_buff; - int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 80fc3253c336..1efcb97039cd 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -18,6 +18,7 @@ #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 83fa808b1871..5e466093dfa5 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -9,12 +9,11 @@ #include "main.h" +#include <linux/kobject.h> +#include <linux/netdevice.h> #include <linux/sysfs.h> #include <linux/types.h> -struct kobject; -struct net_device; - #define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" #define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" /** diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 820392146249..dd6a9a40dbb9 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/kthread.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/param.h> diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 604b3799c972..78d310da0ad3 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -9,10 +9,9 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> -struct sk_buff; - void batadv_tp_meter_init(void); void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index c8c48d62a430..4a98860d7f0e 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -9,13 +9,12 @@ #include "main.h" +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/types.h> -struct netlink_callback; -struct net_device; -struct seq_file; -struct sk_buff; - int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 114ac01e06af..36985000a0a8 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -10,8 +10,7 @@ #include "main.h" #include <linux/types.h> - -struct batadv_ogm_packet; +#include <uapi/linux/batadv_packet.h> void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e0b25104cbfa..6ae139d74e0f 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,20 +14,22 @@ #include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ +#include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/spinlock.h> +#include <linux/timer.h> #include <linux/types.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> -struct seq_file; - #ifdef CONFIG_BATMAN_ADV_DAT /** @@ -402,6 +404,17 @@ struct batadv_orig_node { * list */ struct hlist_node mcast_want_all_ipv6_node; + + /** + * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4 + * list + */ + struct hlist_node mcast_want_all_rtr4_node; + /** + * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6 + * list + */ + struct hlist_node mcast_want_all_rtr6_node; #endif /** @capabilities: announced capabilities of this originator */ @@ -1169,6 +1182,26 @@ struct batadv_mcast_querier_state { }; /** + * struct batadv_mcast_mla_flags - flags for the querier, bridge and tvlv state + */ +struct batadv_mcast_mla_flags { + /** @querier_ipv4: the current state of an IGMP querier in the mesh */ + struct batadv_mcast_querier_state querier_ipv4; + + /** @querier_ipv6: the current state of an MLD querier in the mesh */ + struct batadv_mcast_querier_state querier_ipv6; + + /** @enabled: whether the multicast tvlv is currently enabled */ + unsigned char enabled:1; + + /** @bridged: whether the soft interface has a bridge on top */ + unsigned char bridged:1; + + /** @tvlv_flags: the flags we have last sent in our mcast tvlv */ + u8 tvlv_flags; +}; + +/** * struct batadv_priv_mcast - per mesh interface mcast data */ struct batadv_priv_mcast { @@ -1196,20 +1229,22 @@ struct batadv_priv_mcast { */ struct hlist_head want_all_ipv6_list; - /** @querier_ipv4: the current state of an IGMP querier in the mesh */ - struct batadv_mcast_querier_state querier_ipv4; - - /** @querier_ipv6: the current state of an MLD querier in the mesh */ - struct batadv_mcast_querier_state querier_ipv6; - - /** @flags: the flags we have last sent in our mcast tvlv */ - u8 flags; + /** + * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4 + * multicast traffic + */ + struct hlist_head want_all_rtr4_list; - /** @enabled: whether the multicast tvlv is currently enabled */ - unsigned char enabled:1; + /** + * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6 + * multicast traffic + */ + struct hlist_head want_all_rtr6_list; - /** @bridged: whether the soft interface has a bridge on top */ - unsigned char bridged:1; + /** + * @mla_flags: flags for the querier, bridge and tvlv state + */ + struct batadv_mcast_mla_flags mla_flags; /** * @mla_lock: a lock protecting mla_list and mla_flags @@ -1228,6 +1263,12 @@ struct batadv_priv_mcast { /** @num_want_all_ipv6: counter for items in want_all_ipv6_list */ atomic_t num_want_all_ipv6; + /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */ + atomic_t num_want_all_rtr4; + + /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */ + atomic_t num_want_all_rtr6; + /** * @want_lists_lock: lock for protecting modifications to mcasts * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1555b0c6f7ec..9d41de1ec90f 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -164,26 +164,21 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; + struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); - /* If we have multiple 6lowpan peers, then check where we should - * send the packet. If only one peer exists, then we can send the - * packet right away. - */ - if (count == 1) { - rcu_read_lock(); - peer = list_first_or_null_rcu(&dev->peers, struct lowpan_peer, - list); - rcu_read_unlock(); - return peer; - } - if (!rt) { - nexthop = &lowpan_cb(skb)->gw; - - if (ipv6_addr_any(nexthop)) - return NULL; + if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { + /* There is neither route nor gateway, + * probably the destination is a direct peer. + */ + nexthop = daddr; + } else { + /* There is a known gateway + */ + nexthop = &lowpan_cb(skb)->gw; + } } else { nexthop = rt6_nexthop(rt, daddr); @@ -209,6 +204,20 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, } } + /* use the neighbour cache for matching addresses assigned by SLAAC + */ + neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); + if (neigh) { + list_for_each_entry_rcu(peer, &dev->peers, list) { + if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { + neigh_release(neigh); + rcu_read_unlock(); + return peer; + } + } + neigh_release(neigh); + } + rcu_read_unlock(); return NULL; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 15d1cb5aee18..ad5b0ac1f9ce 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -520,6 +520,9 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; + /* Set Default Authenticated payload timeout to 30s */ + conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; + if (conn->role == HCI_ROLE_MASTER) conn->out = true; @@ -912,7 +915,7 @@ static void hci_req_directed_advertising(struct hci_request *req, sizeof(cp), &cp); } - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, 0x00); } else { struct hci_cp_le_set_adv_param cp; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b81bf53c5ac4..b9585e7d9d2e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2827,7 +2827,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, memset(adv_instance->scan_rsp_data, 0, sizeof(adv_instance->scan_rsp_data)); } else { - if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES || + if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > HCI_MAX_ADV_INSTANCES) return -EOVERFLOW; @@ -3195,11 +3195,13 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; + hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; + hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 51f5b1efc3a5..bb67f4a5479a 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -941,6 +941,35 @@ static int adv_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, adv_max_interval_set, "%llu\n"); +static int auth_payload_timeout_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val < 0x0001 || val > 0xffff) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->auth_payload_timeout = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int auth_payload_timeout_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->auth_payload_timeout; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, + auth_payload_timeout_get, + auth_payload_timeout_set, "%llu\n"); + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -994,6 +1023,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &adv_max_interval_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, + &auth_payload_timeout_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 9e4fcf406d9c..cdb00c2ef242 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -579,6 +579,51 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } +static void hci_cc_read_auth_payload_timeout(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_auth_payload_to *rp = (void *)skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->auth_payload_timeout = __le16_to_cpu(rp->timeout); + + hci_dev_unlock(hdev); +} + +static void hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_write_auth_payload_to *rp = (void *)skb->data; + struct hci_conn *conn; + void *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO); + if (!sent) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->auth_payload_timeout = get_unaligned_le16(sent + 2); + + hci_dev_unlock(hdev); +} + static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb) { @@ -2975,6 +3020,25 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } + /* Set the default Authenticated Payload Timeout after + * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B + * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be + * sent when the link is active and Encryption is enabled, the conn + * type can be either LE or ACL and controller must support LMP Ping. + * Ensure for AES-CCM encryption as well. + */ + if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + test_bit(HCI_CONN_AES_CCM, &conn->flags) && + ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) || + (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) { + struct hci_cp_write_auth_payload_to cp; + + cp.handle = cpu_to_le16(conn->handle); + cp.timeout = cpu_to_le16(hdev->auth_payload_timeout); + hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, + sizeof(cp), &cp); + } + notify: if (conn->state == BT_CONFIG) { if (!ev->status) @@ -3170,6 +3234,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_sc_support(hdev, skb); break; + case HCI_OP_READ_AUTH_PAYLOAD_TO: + hci_cc_read_auth_payload_timeout(hdev, skb); + break; + + case HCI_OP_WRITE_AUTH_PAYLOAD_TO: + hci_cc_write_auth_payload_timeout(hdev, skb); + break; + case HCI_OP_READ_LOCAL_VERSION: hci_cc_read_local_version(hdev, skb); break; @@ -5588,6 +5660,11 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (min < hcon->le_conn_min_interval || + max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e9a95ed65491..621f1a97d803 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1601,7 +1601,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; - cp.handle = 0; + cp.handle = instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; @@ -1643,11 +1643,21 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) return 0; } -void __hci_req_enable_ext_advertising(struct hci_request *req) +int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) { + struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *adv_set; u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + struct adv_info *adv_instance; + + if (instance > 0) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -EINVAL; + } else { + adv_instance = NULL; + } cp = (void *) data; adv_set = (void *) cp->data; @@ -1659,11 +1669,23 @@ void __hci_req_enable_ext_advertising(struct hci_request *req) memset(adv_set, 0, sizeof(*adv_set)); - adv_set->handle = 0; + adv_set->handle = instance; + + /* Set duration per instance since controller is responsible for + * scheduling it. + */ + if (adv_instance && adv_instance->duration) { + u16 duration = adv_instance->duration * MSEC_PER_SEC; + + /* Time = N * 10 ms */ + adv_set->duration = cpu_to_le16(duration / 10); + } hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets, data); + + return 0; } int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) @@ -1679,7 +1701,7 @@ int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) return err; __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, instance); return 0; } @@ -1723,10 +1745,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, adv_instance->remaining_time = adv_instance->remaining_time - timeout; - hdev->adv_instance_timeout = timeout; - queue_delayed_work(hdev->req_workqueue, + /* Only use work for scheduling instances with legacy advertising */ + if (!ext_adv_capable(hdev)) { + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, msecs_to_jiffies(timeout * 1000)); + } /* If we're just re-scheduling the same instance again then do not * execute any HCI commands. This happens when a single instance is @@ -2744,7 +2769,8 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) if (!ext_adv_capable(hdev)) __hci_req_enable_advertising(req); else if (!err) - __hci_req_enable_ext_advertising(req); + __hci_req_enable_ext_advertising(req, + 0x00); } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 55b2050cc9ff..a7019fbeadd3 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -83,7 +83,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); -void __hci_req_enable_ext_advertising(struct hci_request *req); +int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance); void __hci_req_clear_ext_adv_sets(struct hci_request *req); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index a442e21f3894..5abd423b55fa 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->version = req->version; hid->country = req->country; - strncpy(hid->name, req->name, sizeof(hid->name)); + strscpy(hid->name, req->name, sizeof(hid->name)); snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 2151913892ce..03be6a4baef3 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -192,6 +192,7 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne ca.version = ca32.version; ca.flags = ca32.flags; ca.idle_to = ca32.idle_to; + ca32.name[sizeof(ca32.name) - 1] = '\0'; memcpy(ca.name, ca32.name, 128); csock = sockfd_lookup(ca.ctrl_sock, &err); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9f77432dbe38..007317b072b4 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -168,11 +168,18 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, return c; } -static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src) +static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src, + u8 src_type) { struct l2cap_chan *c; list_for_each_entry(c, &chan_list, global_l) { + if (src_type == BDADDR_BREDR && c->src_type != BDADDR_BREDR) + continue; + + if (src_type != BDADDR_BREDR && c->src_type == BDADDR_BREDR) + continue; + if (c->sport == psm && !bacmp(&c->src, src)) return c; } @@ -185,7 +192,7 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) write_lock(&chan_list_lock); - if (psm && __l2cap_global_chan_by_addr(psm, src)) { + if (psm && __l2cap_global_chan_by_addr(psm, src, chan->src_type)) { err = -EADDRINUSE; goto done; } @@ -209,7 +216,8 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) err = -EINVAL; for (p = start; p <= end; p += incr) - if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) { + if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src, + chan->src_type)) { chan->psm = cpu_to_le16(p); chan->sport = cpu_to_le16(p); err = 0; @@ -4394,6 +4402,12 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, l2cap_chan_lock(chan); + if (chan->state != BT_DISCONN) { + l2cap_chan_unlock(chan); + mutex_unlock(&conn->chan_lock); + return 0; + } + l2cap_chan_hold(chan); l2cap_chan_del(chan, 0); @@ -5291,7 +5305,14 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (min < hcon->le_conn_min_interval || + max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index e68c715f8d37..6c2b4e6e87ba 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2579,6 +2579,19 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, goto distribute; } + /* Drop IRK if peer is using identity address during pairing but is + * providing different address as identity information. + * + * Microsoft Surface Precision Mouse is known to have this bug. + */ + if (hci_is_identity_address(&hcon->dst, hcon->dst_type) && + (bacmp(&info->bdaddr, &hcon->dst) || + info->addr_type != hcon->dst_type)) { + bt_dev_err(hcon->hdev, + "ignoring IRK with invalid identity address"); + goto distribute; + } + bacpy(&smp->id_addr, &info->bdaddr); smp->id_addr_type = info->addr_type; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c05def8fd9cd..681b72862c16 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -52,6 +52,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; + BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 52c712984cc7..09b1dd8cd853 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -232,7 +232,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: - ret = nf_queue(skb, &state, e, i, verdict); + ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 34fa72c72ad8..d3f9592f4ff8 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -47,25 +47,22 @@ static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; -}; #ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly; -static int brnf_filter_pppoe_tagged __read_mostly; -static int brnf_pass_vlan_indev __read_mostly; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 + struct ctl_table_header *ctl_hdr; #endif + /* default value is 1 */ + int call_iptables; + int call_ip6tables; + int call_arptables; + + /* default value is 0 */ + int filter_vlan_tagged; + int filter_pppoe_tagged; + int pass_vlan_indev; +}; + #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) @@ -85,17 +82,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb) return 0; } -#define IS_VLAN_IP(skb) \ - (vlan_proto(skb) == htons(ETH_P_IP) && \ - brnf_filter_vlan_tagged) +static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; +} -#define IS_VLAN_IPV6(skb) \ - (vlan_proto(skb) == htons(ETH_P_IPV6) && \ - brnf_filter_vlan_tagged) +static inline bool is_vlan_ipv6(const struct sk_buff *skb, + const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); -#define IS_VLAN_ARP(skb) \ - (vlan_proto(skb) == htons(ETH_P_ARP) && \ - brnf_filter_vlan_tagged) + return vlan_proto(skb) == htons(ETH_P_IPV6) && + brnet->filter_vlan_tagged; +} + +static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; +} static inline __be16 pppoe_proto(const struct sk_buff *skb) { @@ -103,15 +111,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) sizeof(struct pppoe_hdr))); } -#define IS_PPPOE_IP(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IP) && \ - brnf_filter_pppoe_tagged) +static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + return skb->protocol == htons(ETH_P_PPP_SES) && + pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; +} + +static inline bool is_pppoe_ipv6(const struct sk_buff *skb, + const struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); -#define IS_PPPOE_IPV6(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IPV6) && \ - brnf_filter_pppoe_tagged) + return skb->protocol == htons(ETH_P_PPP_SES) && + pppoe_proto(skb) == htons(PPP_IPV6) && + brnet->filter_pppoe_tagged; +} /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) @@ -408,12 +424,16 @@ bridged_dnat: return 0; } -static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) +static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, + const struct net_device *dev, + const struct net *net) { struct net_device *vlan, *br; + struct brnf_net *brnet = net_generic(net, brnf_net_id); br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + + if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, @@ -423,7 +443,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct } /* Some common code for IPv4/IPv6 */ -struct net_device *setup_pre_routing(struct sk_buff *skb) +struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); @@ -434,7 +454,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; - skb->dev = brnf_get_logical_dev(skb, skb->dev); + skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) nf_bridge->orig_proto = BRNF_PROTO_8021Q; @@ -460,6 +480,7 @@ static unsigned int br_nf_pre_routing(void *priv, struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); + struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP; @@ -469,8 +490,10 @@ static unsigned int br_nf_pre_routing(void *priv, return NF_DROP; br = p->br; - if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && + brnet = net_generic(state->net, brnf_net_id); + if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) { + if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; @@ -478,10 +501,11 @@ static unsigned int br_nf_pre_routing(void *priv, return br_nf_pre_routing_ipv6(priv, skb, state); } - if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) + if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; - if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) + if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && + !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); @@ -491,7 +515,7 @@ static unsigned int br_nf_pre_routing(void *priv, if (!nf_bridge_alloc(skb)) return NF_DROP; - if (!setup_pre_routing(skb)) + if (!setup_pre_routing(skb, state->net)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); @@ -514,7 +538,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; - if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { + if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) { if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; @@ -569,9 +593,11 @@ static unsigned int br_nf_forward_ip(void *priv, if (!parent) return NF_DROP; - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + if (IS_IP(skb) || is_vlan_ip(skb, state->net) || + is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; @@ -602,7 +628,7 @@ static unsigned int br_nf_forward_ip(void *priv, skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, - brnf_get_logical_dev(skb, state->in), + brnf_get_logical_dev(skb, state->in, state->net), parent, br_nf_forward_finish); return NF_STOLEN; @@ -615,23 +641,25 @@ static unsigned int br_nf_forward_arp(void *priv, struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device **)(skb->cb); + struct brnf_net *brnet; p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; - if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) + brnet = net_generic(state->net, brnf_net_id); + if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (!IS_ARP(skb)) { - if (!IS_VLAN_ARP(skb)) + if (!is_vlan_arp(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header(skb); } if (arp_hdr(skb)->ar_pln != 4) { - if (IS_VLAN_ARP(skb)) + if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); return NF_ACCEPT; } @@ -791,9 +819,11 @@ static unsigned int br_nf_post_routing(void *priv, if (!realoutdev) return NF_DROP; - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + if (IS_IP(skb) || is_vlan_ip(skb, state->net) || + is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || + is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; @@ -946,23 +976,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event, return NOTIFY_OK; } -static void __net_exit brnf_exit_net(struct net *net) -{ - struct brnf_net *brnet = net_generic(net, brnf_net_id); - - if (!brnet->enabled) - return; - - nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); - brnet->enabled = false; -} - -static struct pernet_operations brnf_net_ops __read_mostly = { - .exit = brnf_exit_net, - .id = &brnf_net_id, - .size = sizeof(struct brnf_net), -}; - static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; @@ -1021,49 +1034,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", - .data = &brnf_call_arptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", - .data = &brnf_call_iptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", - .data = &brnf_call_ip6tables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", - .data = &brnf_filter_vlan_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", - .data = &brnf_filter_pppoe_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", - .data = &brnf_pass_vlan_indev, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { } }; + +static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) +{ + brnf->call_iptables = 1; + brnf->call_ip6tables = 1; + brnf->call_arptables = 1; + brnf->filter_vlan_tagged = 0; + brnf->filter_pppoe_tagged = 0; + brnf->pass_vlan_indev = 0; +} + +static int br_netfilter_sysctl_init_net(struct net *net) +{ + struct ctl_table *table = brnf_table; + struct brnf_net *brnet; + + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL); + if (!table) + return -ENOMEM; + } + + brnet = net_generic(net, brnf_net_id); + table[0].data = &brnet->call_arptables; + table[1].data = &brnet->call_iptables; + table[2].data = &brnet->call_ip6tables; + table[3].data = &brnet->filter_vlan_tagged; + table[4].data = &brnet->filter_pppoe_tagged; + table[5].data = &brnet->pass_vlan_indev; + + br_netfilter_sysctl_default(brnet); + + brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table); + if (!brnet->ctl_hdr) { + if (!net_eq(net, &init_net)) + kfree(table); + + return -ENOMEM; + } + + return 0; +} + +static void br_netfilter_sysctl_exit_net(struct net *net, + struct brnf_net *brnet) +{ + struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; + + unregister_net_sysctl_table(brnet->ctl_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static int __net_init brnf_init_net(struct net *net) +{ + return br_netfilter_sysctl_init_net(net); +} +#endif + +static void __net_exit brnf_exit_net(struct net *net) +{ + struct brnf_net *brnet; + + brnet = net_generic(net, brnf_net_id); + if (brnet->enabled) { + nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + brnet->enabled = false; + } + +#ifdef CONFIG_SYSCTL + br_netfilter_sysctl_exit_net(net, brnet); #endif +} + +static struct pernet_operations brnf_net_ops __read_mostly = { +#ifdef CONFIG_SYSCTL + .init = brnf_init_net, +#endif + .exit = brnf_exit_net, + .id = &brnf_net_id, + .size = sizeof(struct brnf_net), +}; static int __init br_netfilter_init(void) { @@ -1079,16 +1167,6 @@ static int __init br_netfilter_init(void) return ret; } -#ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); - if (brnf_sysctl_header == NULL) { - printk(KERN_WARNING - "br_netfilter: can't register to sysctl.\n"); - unregister_netdevice_notifier(&brnf_notifier); - unregister_pernet_subsys(&brnf_net_ops); - return -ENOMEM; - } -#endif RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; @@ -1099,9 +1177,6 @@ static void __exit br_netfilter_fini(void) RCU_INIT_POINTER(nf_br_ops, NULL); unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(brnf_sysctl_header); -#endif } module_init(br_netfilter_init); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 0e63e5dc5ac4..e4e0c836c3f5 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,7 +224,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, nf_bridge = nf_bridge_alloc(skb); if (!nf_bridge) return NF_DROP; - if (!setup_pre_routing(skb)) + if (!setup_pre_routing(skb, state->net)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 159a0e2cb0f6..e8cf03b43b7d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -421,6 +421,7 @@ struct net_bridge { struct br_input_skb_cb { struct net_device *brdev; + u16 frag_max_size; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u8 igmp; u8 mrouters_only:1; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index f47f526b4f19..021cc9f66804 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -797,6 +797,16 @@ bool br_vlan_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_vlan_enabled); +int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) +{ + struct net_bridge *br = netdev_priv(dev); + + *p_proto = ntohs(br->vlan_proto); + + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_proto); + int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; @@ -1227,13 +1237,11 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, } } -int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +static int __br_vlan_get_pvid(const struct net_device *dev, + struct net_bridge_port *p, u16 *p_pvid) { struct net_bridge_vlan_group *vg; - struct net_bridge_port *p; - ASSERT_RTNL(); - p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) @@ -1244,8 +1252,21 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) *p_pvid = br_get_pvid(vg); return 0; } + +int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +{ + ASSERT_RTNL(); + + return __br_vlan_get_pvid(dev, br_port_get_check_rtnl(dev), p_pvid); +} EXPORT_SYMBOL_GPL(br_vlan_get_pvid); +int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) +{ + return __br_vlan_get_pvid(dev, br_port_get_check_rcu(dev), p_pvid); +} +EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); + int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c3ad90c43801..fbc708508360 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -9,6 +9,12 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE + +config NFT_BRIDGE_META + tristate "Netfilter nf_table bridge meta support" + help + Add support for bridge dedicated meta key. + config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 @@ -19,6 +25,20 @@ config NF_LOG_BRIDGE tristate "Bridge packet logging" select NF_LOG_COMMON +config NF_CONNTRACK_BRIDGE + tristate "IPv4/IPV6 bridge connection tracking support" + depends on NF_CONNTRACK + default n + help + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. This is used to enhance packet filtering via + stateful policies. Enable this if you want native tracking from + the bridge. This provides a replacement for the `br_netfilter' + infrastructure. + + To compile it as a module, choose M here. If unsure, say N. + endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 9b868861f21a..8e2c5759d964 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,8 +3,12 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # +obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o +# connection tracking +obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o + # packet logging obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index eeae23a73c6a..ed91ea31978a 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -22,7 +22,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ebt_nat_info *info = par->targinfo; struct net_device *dev; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 53ef08e6765f..0cad62a4052b 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_redirect_info *info = par->targinfo; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN)) return EBT_DROP; if (xt_hooknum(par) != NF_BR_BROUTING) diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 700d338d5ddb..27443bf229a3 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (!skb_make_writable(skb, 0)) + if (skb_ensure_writable(skb, ETH_ALEN * 2)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c new file mode 100644 index 000000000000..4f5444d2a526 --- /dev/null +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -0,0 +1,435 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/route.h> +#include <net/ip.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_bridge.h> + +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/netfilter/nf_tables.h> + +#include "../br_private.h" + +/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff + * has been linearized or cloned. + */ +static int nf_br_ip_fragment(struct net *net, struct sock *sk, + struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + unsigned int hlen, ll_rs, mtu; + struct ip_frag_state state; + struct iphdr *iph; + int err; + + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + iph = ip_hdr(skb); + + /* + * Setup starting values + */ + + hlen = iph->ihl * 4; + frag_max_size -= hlen; + ll_rs = LL_RESERVED_SPACE(skb->dev); + mtu = skb->dev->mtu; + + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip_fraglist_iter iter; + struct sk_buff *frag; + + if (first_len - hlen > mtu || + skb_headroom(skb) < ll_rs) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + if (frag->len > mtu || + skb_headroom(frag) < hlen + ll_rs) + goto blackhole; + + if (skb_shared(frag)) + goto slow_path; + } + + ip_fraglist_init(skb, iph, hlen, &iter); + + for (;;) { + if (iter.frag) + ip_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip_fraglist_next(&iter); + } + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} + +/* ip_defrag() expects IPCB() in place. */ +static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb, + size_t inet_skb_parm_size) +{ + memcpy(cb, skb->cb, sizeof(*cb)); + memset(skb->cb, 0, inet_skb_parm_size); +} + +static void br_skb_cb_restore(struct sk_buff *skb, + const struct br_input_skb_cb *cb, + u16 fragsz) +{ + memcpy(skb->cb, cb, sizeof(*cb)); + BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz; +} + +static unsigned int nf_ct_br_defrag4(struct sk_buff *skb, + const struct nf_hook_state *state) +{ + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; + enum ip_conntrack_info ctinfo; + struct br_input_skb_cb cb; + const struct nf_conn *ct; + int err; + + if (!ip_is_fragment(ip_hdr(skb))) + return NF_ACCEPT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + + br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + local_bh_enable(); + if (!err) { + br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size); + skb->ignore_df = 1; + return NF_ACCEPT; + } + + return NF_STOLEN; +} + +static unsigned int nf_ct_br_defrag6(struct sk_buff *skb, + const struct nf_hook_state *state) +{ + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; + enum ip_conntrack_info ctinfo; + struct br_input_skb_cb cb; + const struct nf_conn *ct; + int err; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + + br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm)); + + err = nf_ipv6_br_defrag(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + /* queued */ + if (err == -EINPROGRESS) + return NF_STOLEN; + + br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size); + return err == 0 ? NF_ACCEPT : NF_DROP; +} + +static int nf_ct_br_ip_check(const struct sk_buff *skb) +{ + const struct iphdr *iph; + int nhoff, len; + + nhoff = skb_network_offset(skb); + iph = ip_hdr(skb); + if (iph->ihl < 5 || + iph->version != 4) + return -1; + + len = ntohs(iph->tot_len); + if (skb->len < nhoff + len || + len < (iph->ihl * 4)) + return -1; + + return 0; +} + +static int nf_ct_br_ipv6_check(const struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + int nhoff, len; + + nhoff = skb_network_offset(skb); + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return -1; + + len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff; + if (skb->len < len) + return -1; + + return 0; +} + +static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_hook_state bridge_state = *state; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 len; + int ret; + + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || + ctinfo == IP_CT_UNTRACKED) + return NF_ACCEPT; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return NF_ACCEPT; + + len = ntohs(ip_hdr(skb)->tot_len); + if (pskb_trim_rcsum(skb, len)) + return NF_ACCEPT; + + if (nf_ct_br_ip_check(skb)) + return NF_ACCEPT; + + bridge_state.pf = NFPROTO_IPV4; + ret = nf_ct_br_defrag4(skb, &bridge_state); + break; + case htons(ETH_P_IPV6): + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return NF_ACCEPT; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + if (pskb_trim_rcsum(skb, len)) + return NF_ACCEPT; + + if (nf_ct_br_ipv6_check(skb)) + return NF_ACCEPT; + + bridge_state.pf = NFPROTO_IPV6; + ret = nf_ct_br_defrag6(skb, &bridge_state); + break; + default: + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + return NF_ACCEPT; + } + + if (ret != NF_ACCEPT) + return ret; + + return nf_conntrack_in(skb, &bridge_state); +} + +static void nf_ct_bridge_frag_save(struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data) +{ + if (skb_vlan_tag_present(skb)) { + data->vlan_present = true; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_present = false; + } + skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); +} + +static unsigned int +nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + struct nf_ct_bridge_frag_data data; + + if (!BR_INPUT_SKB_CB(skb)->frag_max_size) + return NF_ACCEPT; + + nf_ct_bridge_frag_save(skb, &data); + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_br_ip_fragment(state->net, state->sk, skb, &data, output); + break; + case htons(ETH_P_IPV6): + nf_br_ip6_fragment(state->net, state->sk, skb, &data, output); + break; + default: + WARN_ON_ONCE(1); + return NF_DROP; + } + + return NF_STOLEN; +} + +/* Actually only slow path refragmentation needs this. */ +static int nf_ct_bridge_frag_restore(struct sk_buff *skb, + const struct nf_ct_bridge_frag_data *data) +{ + int err; + + err = skb_cow_head(skb, ETH_HLEN); + if (err) { + kfree_skb(skb); + return -ENOMEM; + } + if (data->vlan_present) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); + else if (skb_vlan_tag_present(skb)) + __vlan_hwaccel_clear_tag(skb); + + skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); + skb_reset_mac_header(skb); + + return 0; +} + +static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *skb) +{ + int err; + + err = nf_ct_bridge_frag_restore(skb, data); + if (err < 0) + return err; + + return br_dev_queue_push_xmit(net, sk, skb); +} + +static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int protoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (skb->protocol) { + case htons(ETH_P_IP): + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): { + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + } + break; + default: + return NF_ACCEPT; + } + return nf_confirm(skb, protoff, ct, ctinfo); +} + +static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + int ret; + + ret = nf_ct_bridge_confirm(skb); + if (ret != NF_ACCEPT) + return ret; + + return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post); +} + +static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = { + { + .hook = nf_ct_bridge_pre, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = nf_ct_bridge_post, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +static struct nf_ct_bridge_info bridge_info = { + .ops = nf_ct_bridge_hook_ops, + .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops), + .me = THIS_MODULE, +}; + +static int __init nf_conntrack_l3proto_bridge_init(void) +{ + nf_ct_bridge_register(&bridge_info); + + return 0; +} + +static void __exit nf_conntrack_l3proto_bridge_fini(void) +{ + nf_ct_bridge_unregister(&bridge_info); +} + +module_init(nf_conntrack_l3proto_bridge_init); +module_exit(nf_conntrack_l3proto_bridge_fini); + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE)); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c new file mode 100644 index 000000000000..bed66f536b34 --- /dev/null +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> +#include <linux/if_bridge.h> + +static const struct net_device * +nft_meta_get_bridge(const struct net_device *dev) +{ + if (dev && netif_is_bridge_port(dev)) + return netdev_master_upper_dev_get_rcu((struct net_device *)dev); + + return NULL; +} + +static void nft_meta_bridge_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + u32 *dest = ®s->data[priv->dreg]; + const struct net_device *br_dev; + + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + break; + case NFT_META_BRI_OIFNAME: + br_dev = nft_meta_get_bridge(out); + if (!br_dev) + goto err; + break; + case NFT_META_BRI_IIFPVID: { + u16 p_pvid; + + br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_pvid_rcu(in, &p_pvid); + nft_reg_store16(dest, p_pvid); + return; + } + case NFT_META_BRI_IIFVPROTO: { + u16 p_proto; + + br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_proto(br_dev, &p_proto); + nft_reg_store16(dest, p_proto); + return; + } + default: + goto out; + } + + strncpy((char *)dest, br_dev->name, IFNAMSIZ); + return; +out: + return nft_meta_get_eval(expr, regs, pkt); +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + len = IFNAMSIZ; + break; + case NFT_META_BRI_IIFPVID: + case NFT_META_BRI_IIFVPROTO: + len = sizeof(u16); + break; + default: + return nft_meta_get_init(ctx, expr, tb); + } + + priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static struct nft_expr_type nft_meta_bridge_type; +static const struct nft_expr_ops nft_meta_bridge_get_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_bridge_get_eval, + .init = nft_meta_bridge_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_bridge_set_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .destroy = nft_meta_set_destroy, + .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, +}; + +static const struct nft_expr_ops * +nft_meta_bridge_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_bridge_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_bridge_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "meta", + .select_ops = nft_meta_bridge_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_bridge_module_init(void) +{ + return nft_register_expr(&nft_meta_bridge_type); +} + +static void __exit nft_meta_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_meta_bridge_type); +} + +module_init(nft_meta_bridge_module_init); +module_exit(nft_meta_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d1c4e1f3be2c..94c7f77ecb6b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) unsigned int i; u32 nbuckets; u64 cost; + int ret; smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); if (!smap) @@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); nbuckets = 1U << smap->bucket_log; + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + ret = bpf_map_charge_init(&smap->map.memory, cost); + if (ret < 0) { + kfree(smap); + return ERR_PTR(ret); + } + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); @@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; return &smap->map; } diff --git a/net/core/dev.c b/net/core/dev.c index d6edd218babd..fc676b2610e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) else name = netdev_name(dev); } - WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " - "gso_type=%d ip_summed=%d\n", + skb_dump(KERN_WARNING, skb, false); + WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, - skb->sk ? &skb->sk->sk_route_caps : &null_features, - skb->len, skb->data_len, skb_shinfo(skb)->gso_size, - skb_shinfo(skb)->gso_type, skb->ip_summed); + skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* @@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); - if (dev) - pr_err("dev features: %pNF\n", &dev->features); - pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", - skb->len, skb->data_len, skb->pkt_type, - skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, - skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, - skb->csum_complete_sw, skb->csum_valid, skb->csum_level); + skb_dump(KERN_ERR, skb, true); dump_stack(); } } @@ -4689,9 +4681,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, __skb_push(skb, skb->mac_len); skb_do_redirect(skb); return NULL; - case TC_ACT_REINSERT: - /* this does not scrub the packet, and updates stats on error */ - skb_tc_reinsert(skb, &cl_res); + case TC_ACT_CONSUMED: return NULL; default: break; diff --git a/net/core/devlink.c b/net/core/devlink.c index 132f4b757963..89c533778135 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -17,6 +17,7 @@ #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/refcount.h> +#include <linux/workqueue.h> #include <rdma/ib_verbs.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -1548,7 +1549,8 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; - u8 inline_mode, encap_mode; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; void *hdr; int err = 0; u16 mode; @@ -1624,7 +1626,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; - u8 inline_mode, encap_mode; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; int err = 0; u16 mode; @@ -2668,6 +2671,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, unsigned long total) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + status_msg)) + goto nla_put_failure; + if (component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, + component, done, total); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + return; + +out_free_msg: + nlmsg_free(msg); +} + +void devlink_flash_update_begin_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE, + NULL, NULL, 0, 0); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); + +void devlink_flash_update_end_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_END, + NULL, NULL, 0, 0); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); + +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + status_msg, component, done, total); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { @@ -4415,6 +4520,35 @@ nla_put_failure: return err; } +static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, + struct netlink_callback *cb, + enum devlink_command cmd) +{ + int index = cb->args[0]; + int tmp_index = index; + void *hdr; + int err; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if ((err && err != -EMSGSIZE) || tmp_index == index) + goto nla_put_failure; + + cb->args[0] = index; + genlmsg_end(skb, hdr); + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return err; +} + struct devlink_health_reporter { struct list_head list; void *priv; @@ -4647,17 +4781,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter, EXPORT_SYMBOL_GPL(devlink_health_report); static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) +devlink_health_reporter_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) { struct devlink_health_reporter *reporter; char *reporter_name; - if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; - reporter_name = - nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); mutex_lock(&devlink->reporters_lock); reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); if (reporter) @@ -4666,6 +4799,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, return reporter; } +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_health_reporter_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_cb(struct netlink_callback *cb) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink; + struct nlattr **attrs; + int err; + + attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return NULL; + + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, + devlink_nl_family.policy, cb->extack); + if (err) + goto free; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto unlock; + + reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + mutex_unlock(&devlink_mutex); + kfree(attrs); + return reporter; +unlock: + mutex_unlock(&devlink_mutex); +free: + kfree(attrs); + return NULL; +} + static void devlink_health_reporter_put(struct devlink_health_reporter *reporter) { @@ -4901,32 +5076,40 @@ out: return err; } -static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int +devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) { - struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; + u64 start = cb->args[0]; int err; - reporter = devlink_health_reporter_get_from_info(devlink, info); + reporter = devlink_health_reporter_get_from_cb(cb); if (!reporter) return -EINVAL; if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out; } - mutex_lock(&reporter->dump_lock); - err = devlink_health_do_dump(reporter, NULL); - if (err) - goto out; - - err = devlink_fmsg_snd(reporter->dump_fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); + if (!start) { + err = devlink_health_do_dump(reporter, NULL); + if (err) + goto unlock; + cb->args[1] = reporter->dump_ts; + } + if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { + NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); + err = -EAGAIN; + goto unlock; + } -out: + err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); +unlock: mutex_unlock(&reporter->dump_lock); +out: devlink_health_reporter_put(reporter); return err; } @@ -5263,7 +5446,7 @@ static const struct genl_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_get_doit, + .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, @@ -5386,6 +5569,38 @@ void devlink_free(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_free); +static void devlink_port_type_warn(struct work_struct *work) +{ + WARN(true, "Type was not set for devlink port."); +} + +static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) +{ + /* Ignore CPU and DSA flavours. */ + return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; +} + +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30) + +static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + /* Schedule a work to WARN in case driver does not set port + * type within timeout. + */ + schedule_delayed_work(&devlink_port->type_warn_dw, + DEVLINK_PORT_TYPE_WARN_TIMEOUT); +} + +static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + cancel_delayed_work_sync(&devlink_port->type_warn_dw); +} + /** * devlink_port_register - Register devlink port * @@ -5415,6 +5630,8 @@ int devlink_port_register(struct devlink *devlink, list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -5429,6 +5646,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; + devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); list_del(&devlink_port->list); @@ -5442,6 +5660,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, { if (WARN_ON(!devlink_port->registered)) return; + devlink_port_type_warn_cancel(devlink_port); spin_lock(&devlink_port->type_lock); devlink_port->type = type; devlink_port->type_dev = type_dev; @@ -5515,6 +5734,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); void devlink_port_type_clear(struct devlink_port *devlink_port) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); + devlink_port_type_warn_schedule(devlink_port); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); diff --git a/net/core/dst.c b/net/core/dst.c index e46366228eaf..1325316d9eab 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = dev_net(dst->dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d1011b2e24f..6288e69e94fc 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->mask.basic.n_proto = htons(0xffff); switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case ETHER_FLOW: { + const struct ethhdr *ether_spec, *ether_m_spec; + + ether_spec = &fs->h_u.ether_spec; + ether_m_spec = &fs->m_u.ether_spec; + + if (!is_zero_ether_addr(ether_m_spec->h_source)) { + ether_addr_copy(match->key.eth_addrs.src, + ether_spec->h_source); + ether_addr_copy(match->mask.eth_addrs.src, + ether_m_spec->h_source); + } + if (!is_zero_ether_addr(ether_m_spec->h_dest)) { + ether_addr_copy(match->key.eth_addrs.dst, + ether_spec->h_dest); + ether_addr_copy(match->mask.eth_addrs.dst, + ether_m_spec->h_dest); + } + if (ether_m_spec->h_proto) { + match->key.basic.n_proto = ether_spec->h_proto; + match->mask.basic.n_proto = ether_m_spec->h_proto; + } + } + break; case TCP_V4_FLOW: case UDP_V4_FLOW: { const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; diff --git a/net/core/filter.c b/net/core/filter.c index 3fdf1b21be36..47f6386fb17a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -62,6 +62,7 @@ #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> @@ -2157,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } @@ -2168,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb) struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); - ri->ifindex = 0; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + ri->tgt_index = 0; if (unlikely(!dev)) { kfree_skb(skb); return -EINVAL; @@ -3487,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) { struct net_device *fwd; - u32 index = ri->ifindex; + u32 index = ri->tgt_index; int err; fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; + ri->tgt_index = 0; if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -3522,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (unlikely(err)) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3531,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (unlikely(err)) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { @@ -3605,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_map *map, struct bpf_redirect_info *ri) { - u32 index = ri->ifindex; - void *fwd = NULL; + u32 index = ri->tgt_index; + void *fwd = ri->tgt_value; int err; - ri->ifindex = 0; + ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); @@ -3652,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->ifindex; - void *fwd = NULL; + u32 index = ri->tgt_index; + void *fwd = ri->tgt_value; int err = 0; - ri->ifindex = 0; + ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct bpf_dtab_netdev *dst = fwd; @@ -3696,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->ifindex; + u32 index = ri->tgt_index; struct net_device *fwd; int err = 0; if (map) return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, map); - ri->ifindex = 0; + ri->tgt_index = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3731,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; @@ -3751,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) return XDP_ABORTED; - ri->ifindex = ifindex; + ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + ri->flags = flags; + ri->tgt_index = ifindex; WRITE_ONCE(ri->map, map); return XDP_REDIRECT; @@ -4670,7 +4673,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; - if (res.fi->fib_nhs > 1) + if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { @@ -5191,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ -#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ -do { \ - switch (si->off) { \ - case offsetof(md_type, snd_cwnd): \ - CONVERT(snd_cwnd); break; \ - case offsetof(md_type, srtt_us): \ - CONVERT(srtt_us); break; \ - case offsetof(md_type, snd_ssthresh): \ - CONVERT(snd_ssthresh); break; \ - case offsetof(md_type, rcv_nxt): \ - CONVERT(rcv_nxt); break; \ - case offsetof(md_type, snd_nxt): \ - CONVERT(snd_nxt); break; \ - case offsetof(md_type, snd_una): \ - CONVERT(snd_una); break; \ - case offsetof(md_type, mss_cache): \ - CONVERT(mss_cache); break; \ - case offsetof(md_type, ecn_flags): \ - CONVERT(ecn_flags); break; \ - case offsetof(md_type, rate_delivered): \ - CONVERT(rate_delivered); break; \ - case offsetof(md_type, rate_interval_us): \ - CONVERT(rate_interval_us); break; \ - case offsetof(md_type, packets_out): \ - CONVERT(packets_out); break; \ - case offsetof(md_type, retrans_out): \ - CONVERT(retrans_out); break; \ - case offsetof(md_type, total_retrans): \ - CONVERT(total_retrans); break; \ - case offsetof(md_type, segs_in): \ - CONVERT(segs_in); break; \ - case offsetof(md_type, data_segs_in): \ - CONVERT(data_segs_in); break; \ - case offsetof(md_type, segs_out): \ - CONVERT(segs_out); break; \ - case offsetof(md_type, data_segs_out): \ - CONVERT(data_segs_out); break; \ - case offsetof(md_type, lost_out): \ - CONVERT(lost_out); break; \ - case offsetof(md_type, sacked_out): \ - CONVERT(sacked_out); break; \ - case offsetof(md_type, bytes_received): \ - CONVERT(bytes_received); break; \ - case offsetof(md_type, bytes_acked): \ - CONVERT(bytes_acked); break; \ - } \ -} while (0) - #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -5589,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) return false; if (off % size != 0) @@ -5620,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, FIELD)); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, - BPF_TCP_SOCK_GET_COMMON); +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \ + FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) if (insn > insn_buf) return insn - insn_buf; @@ -5637,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; + case offsetof(struct bpf_tcp_sock, snd_cwnd): + BPF_TCP_SOCK_GET_COMMON(snd_cwnd); + break; + case offsetof(struct bpf_tcp_sock, srtt_us): + BPF_TCP_SOCK_GET_COMMON(srtt_us); + break; + case offsetof(struct bpf_tcp_sock, snd_ssthresh): + BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); + break; + case offsetof(struct bpf_tcp_sock, rcv_nxt): + BPF_TCP_SOCK_GET_COMMON(rcv_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_nxt): + BPF_TCP_SOCK_GET_COMMON(snd_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_una): + BPF_TCP_SOCK_GET_COMMON(snd_una); + break; + case offsetof(struct bpf_tcp_sock, mss_cache): + BPF_TCP_SOCK_GET_COMMON(mss_cache); + break; + case offsetof(struct bpf_tcp_sock, ecn_flags): + BPF_TCP_SOCK_GET_COMMON(ecn_flags); + break; + case offsetof(struct bpf_tcp_sock, rate_delivered): + BPF_TCP_SOCK_GET_COMMON(rate_delivered); + break; + case offsetof(struct bpf_tcp_sock, rate_interval_us): + BPF_TCP_SOCK_GET_COMMON(rate_interval_us); + break; + case offsetof(struct bpf_tcp_sock, packets_out): + BPF_TCP_SOCK_GET_COMMON(packets_out); + break; + case offsetof(struct bpf_tcp_sock, retrans_out): + BPF_TCP_SOCK_GET_COMMON(retrans_out); + break; + case offsetof(struct bpf_tcp_sock, total_retrans): + BPF_TCP_SOCK_GET_COMMON(total_retrans); + break; + case offsetof(struct bpf_tcp_sock, segs_in): + BPF_TCP_SOCK_GET_COMMON(segs_in); + break; + case offsetof(struct bpf_tcp_sock, data_segs_in): + BPF_TCP_SOCK_GET_COMMON(data_segs_in); + break; + case offsetof(struct bpf_tcp_sock, segs_out): + BPF_TCP_SOCK_GET_COMMON(segs_out); + break; + case offsetof(struct bpf_tcp_sock, data_segs_out): + BPF_TCP_SOCK_GET_COMMON(data_segs_out); + break; + case offsetof(struct bpf_tcp_sock, lost_out): + BPF_TCP_SOCK_GET_COMMON(lost_out); + break; + case offsetof(struct bpf_tcp_sock, sacked_out): + BPF_TCP_SOCK_GET_COMMON(sacked_out); + break; + case offsetof(struct bpf_tcp_sock, bytes_received): + BPF_TCP_SOCK_GET_COMMON(bytes_received); + break; + case offsetof(struct bpf_tcp_sock, bytes_acked): + BPF_TCP_SOCK_GET_COMMON(bytes_acked); + break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; } return insn - insn_buf; @@ -5650,7 +5692,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, @@ -5694,6 +5736,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) return INET_ECN_set_ce(skb); } +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, @@ -5896,6 +5978,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } @@ -5933,6 +6019,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; @@ -6113,6 +6203,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_sockopt_event_output_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -6792,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (size != size_default) return false; } @@ -6800,6 +6908,13 @@ static bool sock_addr_is_valid_access(int off, int size, if (size != size_default) return false; break; + case offsetof(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; default: if (type == BPF_READ) { if (size != size_default) @@ -6843,6 +6958,11 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; + break; default: if (size != size_default) return false; @@ -7620,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * - * It doesn't support SIZE argument though since narrow stores are not - * supported for now. - * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor @@ -7630,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ -#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ @@ -7641,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM( \ - BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ @@ -7654,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, TF) \ do { \ if (type == BPF_WRITE) { \ - SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ - TF); \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ @@ -7750,6 +7866,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; } return insn - insn_buf; @@ -7837,9 +7958,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, - SOCK_OPS_GET_TCP_SOCK_FIELD); - if (insn > insn_buf) return insn - insn_buf; @@ -8009,6 +8127,82 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); + break; + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); + break; + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); + break; + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); + break; + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); + break; + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); + break; + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); + break; + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); + break; + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); + break; + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); + break; + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); + break; + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); + break; + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); + break; + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); + break; + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); + break; + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); + break; + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); + break; + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); + break; + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); + break; + case offsetof(struct bpf_sock_ops, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + break; } return insn - insn_buf; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index edd622956083..01ad60b5aa75 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -199,6 +199,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +void skb_flow_dissect_meta(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_meta *meta; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) + return; + + meta = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_META, + target_container); + meta->ingress_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(skb_flow_dissect_meta); + static void skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, struct flow_dissector *flow_dissector, @@ -757,7 +773,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. - * FLOW_DISSECTOR_F_STOP_AT_L3. + * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the @@ -922,11 +938,6 @@ proto_again: __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; - } - break; } case htons(ETH_P_IPV6): { @@ -975,9 +986,6 @@ proto_again: __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; } case htons(ETH_P_8021AD): diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 5ce7d47a960e..f52fe0bc4017 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; - rule = kzalloc(sizeof(struct flow_rule) + - sizeof(struct flow_action_entry) * num_actions, + rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); if (!rule) return NULL; @@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc); (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ +void flow_rule_match_meta(const struct flow_rule *rule, + struct flow_match_meta *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); +} +EXPORT_SYMBOL(flow_rule_match_meta); + void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out) { diff --git a/net/core/hwbm.c b/net/core/hwbm.c index fd822ca5a245..ac1a66df9adc 100644 --- a/net/core/hwbm.c +++ b/net/core/hwbm.c @@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) } EXPORT_SYMBOL_GPL(hwbm_pool_refill); -int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num) { int err, i; - unsigned long flags; - spin_lock_irqsave(&bm_pool->lock, flags); + mutex_lock(&bm_pool->buf_lock); if (bm_pool->buf_num == bm_pool->size) { pr_warn("pool already filled\n"); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return bm_pool->buf_num; } if (buf_num + bm_pool->buf_num > bm_pool->size) { pr_warn("cannot allocate %d buffers for pool\n", buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return 0; } if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { pr_warn("Adding %d buffers to the %d current buffers will overflow\n", buf_num, bm_pool->buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return 0; } for (i = 0; i < buf_num; i++) { - err = hwbm_pool_refill(bm_pool, gfp); + err = hwbm_pool_refill(bm_pool, GFP_KERNEL); if (err < 0) break; } @@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) bm_pool->buf_num += i; pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); - spin_unlock_irqrestore(&bm_pool->lock, flags); + mutex_unlock(&bm_pool->buf_lock); return i; } diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 04fdc9535772..f153e0601838 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -163,9 +163,16 @@ static void linkwatch_do_dev(struct net_device *dev) static void __linkwatch_run_queue(int urgent_only) { +#define MAX_DO_DEV_PER_LOOP 100 + + int do_dev = MAX_DO_DEV_PER_LOOP; struct net_device *dev; LIST_HEAD(wrk); + /* Give urgent case more budget */ + if (urgent_only) + do_dev += MAX_DO_DEV_PER_LOOP; + /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not @@ -184,7 +191,7 @@ static void __linkwatch_run_queue(int urgent_only) spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); - while (!list_empty(&wrk)) { + while (!list_empty(&wrk) && do_dev > 0) { dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); @@ -195,9 +202,13 @@ static void __linkwatch_run_queue(int urgent_only) } spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); + do_dev--; spin_lock_irq(&lweventlist_lock); } + /* Add the remaining work back to lweventlist */ + list_splice_init(&wrk, &lweventlist); + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9e7fc929bc50..742cea4ce72e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, int error; struct neigh_hash_table *nht; + trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); + if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 470b179d599e..283ddb2dbc7d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); #endif +#if IS_ENABLED(CONFIG_PAGE_POOL) +#include <trace/events/page_pool.h> +#endif + #include <trace/events/neigh.h> EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 15f68842ac6b..198ce503ae73 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -145,6 +145,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net) } } +static void ops_pre_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + + if (ops->pre_exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->pre_exit(net); + } +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { @@ -330,6 +341,12 @@ out_undo: list_add(&net->exit_list, &net_exit_list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + + synchronize_rcu(); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); ops = saved_ops; @@ -541,10 +558,15 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } + /* Run all of the network namespace pre_exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + /* * Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so * the rcu_barrier() below isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. */ synchronize_rcu(); @@ -1101,6 +1123,8 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); return error; @@ -1115,6 +1139,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); } @@ -1139,6 +1165,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); + ops_pre_exit_list(ops, &net_exit_list); + synchronize_rcu(); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dd8b1a460d64..2cf27da1baeb 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np) if (!np->local_ip.ip) { if (!np->ipv6) { + const struct in_ifaddr *ifa; + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) + goto put_noaddr; - if (!in_dev || !in_dev->ifa_list) { + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { +put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } - np->local_ip.ip = in_dev->ifa_list->ifa_local; + np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5b2252c6d49b..3272dc7a8c81 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -4,9 +4,11 @@ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ + #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/device.h> #include <net/page_pool.h> #include <linux/dma-direction.h> @@ -14,6 +16,8 @@ #include <linux/page-flags.h> #include <linux/mm.h> /* for __put_page() */ +#include <trace/events/page_pool.h> + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -43,6 +47,14 @@ static int page_pool_init(struct page_pool *pool, if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; + atomic_set(&pool->pages_state_release_cnt, 0); + + /* Driver calling page_pool_create() also call page_pool_destroy() */ + refcount_set(&pool->user_cnt, 1); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + get_device(pool->p.dev); + return 0; } @@ -61,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) kfree(pool); return ERR_PTR(err); } + return pool; } EXPORT_SYMBOL(page_pool_create); @@ -151,6 +164,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, page->dma_addr = dma; skip_dma_map: + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + + trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + /* When page just alloc'ed is should/must have refcnt 1. */ return page; } @@ -173,6 +191,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) } EXPORT_SYMBOL(page_pool_alloc_pages); +/* Calculate distance between two u32 values, valid if distance is below 2^(31) + * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution + */ +#define _distance(a, b) (s32)((a) - (b)) + +static s32 page_pool_inflight(struct page_pool *pool) +{ + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 distance; + + distance = _distance(hold_cnt, release_cnt); + + trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); + return distance; +} + +static bool __page_pool_safe_to_destroy(struct page_pool *pool) +{ + s32 inflight = page_pool_inflight(pool); + + /* The distance should not be able to become negative */ + WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + + return (inflight == 0); +} + /* Cleanup page_pool state from page */ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) @@ -180,7 +225,7 @@ static void __page_pool_clean_page(struct page_pool *pool, dma_addr_t dma; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) - return; + goto skip_dma_unmap; dma = page->dma_addr; /* DMA unmap */ @@ -188,12 +233,27 @@ static void __page_pool_clean_page(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; +skip_dma_unmap: + atomic_inc(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, + atomic_read(&pool->pages_state_release_cnt)); +} + +/* unmap the page and clean our state */ +void page_pool_unmap_page(struct page_pool *pool, struct page *page) +{ + /* When page is unmapped, this implies page will not be + * returned to page_pool. + */ + __page_pool_clean_page(pool, page); } +EXPORT_SYMBOL(page_pool_unmap_page); /* Return a page to the page allocator, cleaning up our state */ static void __page_pool_return_page(struct page_pool *pool, struct page *page) { __page_pool_clean_page(pool, page); + put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a @@ -285,21 +345,45 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __page_pool_destroy_rcu(struct rcu_head *rcu) +static void __warn_in_flight(struct page_pool *pool) { - struct page_pool *pool; + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 distance; - pool = container_of(rcu, struct page_pool, rcu); + distance = _distance(hold_cnt, release_cnt); + + /* Drivers should fix this, but only problematic when DMA is used */ + WARN(1, "Still in-flight pages:%d hold:%u released:%u", + distance, hold_cnt, release_cnt); +} + +void __page_pool_free(struct page_pool *pool) +{ + /* Only last user actually free/release resources */ + if (!page_pool_put(pool)) + return; WARN(pool->alloc.count, "API usage violation"); + WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); + + /* Can happen due to forced shutdown */ + if (!__page_pool_safe_to_destroy(pool)) + __warn_in_flight(pool); - __page_pool_empty_ring(pool); ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + kfree(pool); } +EXPORT_SYMBOL(__page_pool_free); -/* Cleanup and release resources */ -void page_pool_destroy(struct page_pool *pool) +/* Request to shutdown: release pages cached by page_pool, and check + * for in-flight pages + */ +bool __page_pool_request_shutdown(struct page_pool *pool) { struct page *page; @@ -317,7 +401,6 @@ void page_pool_destroy(struct page_pool *pool) */ __page_pool_empty_ring(pool); - /* An xdp_mem_allocator can still ref page_pool pointer */ - call_rcu(&pool->rcu, __page_pool_destroy_rcu); + return __page_pool_safe_to_destroy(pool); } -EXPORT_SYMBOL(page_pool_destroy); +EXPORT_SYMBOL(__page_pool_request_shutdown); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f975c5e2a369..bb9915291644 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2118,9 +2118,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) rcu_read_lock(); in_dev = __in_dev_get_rcu(pkt_dev->odev); if (in_dev) { - if (in_dev->ifa_list) { - pkt_dev->saddr_min = - in_dev->ifa_list->ifa_address; + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(in_dev->ifa_list); + if (ifa) { + pkt_dev->saddr_min = ifa->ifa_address; pkt_dev->saddr_max = pkt_dev->saddr_min; } } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cec60583931f..1ee6460f8275 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -751,6 +751,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) struct nlattr *mx; int i, valid = 0; + /* nothing is dumped for dst_default_metrics, so just skip the loop */ + if (metrics == dst_default_metrics.metrics) + return 0; + mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; @@ -908,6 +912,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * @@ -1197,6 +1202,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; memset(&ivi, 0, sizeof(ivi)); @@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; @@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || @@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c8cd99c3603f..cdb0ccdaac0b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,6 +72,7 @@ #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "datagram.h" @@ -365,18 +366,20 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc; - unsigned long flags; - void *data; + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - local_irq_save(flags); - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, gfp_mask); - local_irq_restore(flags); - return data; + return page_frag_alloc(&nc->page, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + fragsz = SKB_DATA_ALIGN(fragsz); + + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } +EXPORT_SYMBOL(napi_alloc_frag); /** * netdev_alloc_frag - allocate a page fragment @@ -387,26 +390,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - fragsz = SKB_DATA_ALIGN(fragsz); - - return __netdev_alloc_frag(fragsz, GFP_ATOMIC); -} -EXPORT_SYMBOL(netdev_alloc_frag); - -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - return page_frag_alloc(&nc->page, fragsz, gfp_mask); -} + struct page_frag_cache *nc; + void *data; -void *napi_alloc_frag(unsigned int fragsz) -{ fragsz = SKB_DATA_ALIGN(fragsz); - - return __napi_alloc_frag(fragsz, GFP_ATOMIC); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); + } else { + local_bh_disable(); + data = __napi_alloc_frag(fragsz, GFP_ATOMIC); + local_bh_enable(); + } + return data; } -EXPORT_SYMBOL(napi_alloc_frag); +EXPORT_SYMBOL(netdev_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -425,7 +423,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; - unsigned long flags; struct sk_buff *skb; bool pfmemalloc; void *data; @@ -446,13 +443,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - local_irq_save(flags); - - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; - - local_irq_restore(flags); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + } else { + local_bh_disable(); + nc = this_cpu_ptr(&napi_alloc_cache.page); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + local_bh_enable(); + } if (unlikely(!data)) return NULL; @@ -706,6 +707,105 @@ void kfree_skb_list(struct sk_buff *segs) } EXPORT_SYMBOL(kfree_skb_list); +/* Dump skb information and contents. + * + * Must only be called from net_ratelimit()-ed paths. + * + * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise. + */ +void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) +{ + static atomic_t can_dump_full = ATOMIC_INIT(5); + struct skb_shared_info *sh = skb_shinfo(skb); + struct net_device *dev = skb->dev; + struct sock *sk = skb->sk; + struct sk_buff *list_skb; + bool has_mac, has_trans; + int headroom, tailroom; + int i, len, seg_len; + + if (full_pkt) + full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0; + + if (full_pkt) + len = skb->len; + else + len = min_t(int, skb->len, MAX_HEADER + 128); + + headroom = skb_headroom(skb); + tailroom = skb_tailroom(skb); + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" + "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" + "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + level, skb->len, headroom, skb_headlen(skb), tailroom, + has_mac ? skb->mac_header : -1, + has_mac ? skb_mac_header_len(skb) : -1, + skb->network_header, + has_trans ? skb_network_header_len(skb) : -1, + has_trans ? skb->transport_header : -1, + sh->tx_flags, sh->nr_frags, + sh->gso_size, sh->gso_type, sh->gso_segs, + skb->csum, skb->ip_summed, skb->csum_complete_sw, + skb->csum_valid, skb->csum_level, + skb->hash, skb->sw_hash, skb->l4_hash, + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + + if (dev) + printk("%sdev name=%s feat=0x%pNF\n", + level, dev->name, &dev->features); + if (sk) + printk("%ssk family=%hu type=%hu proto=%hu\n", + level, sk->sk_family, sk->sk_type, sk->sk_protocol); + + if (full_pkt && headroom) + print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->head, headroom, false); + + seg_len = min_t(int, skb_headlen(skb), len); + if (seg_len) + print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, seg_len, false); + len -= seg_len; + + if (full_pkt && tailroom) + print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb_tail_pointer(skb), tailroom, false); + + for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + skb_frag_foreach_page(frag, frag->page_offset, + skb_frag_size(frag), p, p_off, p_len, + copied) { + seg_len = min_t(int, p_len, len); + vaddr = kmap_atomic(p); + print_hex_dump(level, "skb frag: ", + DUMP_PREFIX_OFFSET, + 16, 1, vaddr + p_off, seg_len, false); + kunmap_atomic(vaddr); + len -= seg_len; + if (!len) + break; + } + } + + if (full_pkt && skb_has_frag_list(skb)) { + printk("skb fraglist:\n"); + skb_walk_frags(skb, list_skb) + skb_dump(level, list_skb, true); + } +} +EXPORT_SYMBOL(skb_dump); + /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error @@ -909,6 +1009,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) } /** + * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg + * @first: first sk_buff of the msg + */ +struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) +{ + struct sk_buff *n; + + n = alloc_skb(0, GFP_ATOMIC); + if (!n) + return NULL; + + n->len = first->len; + n->data_len = first->len; + n->truesize = first->truesize; + + skb_shinfo(n)->frag_list = first; + + __copy_skb_header(n, first); + n->destructor = NULL; + + return n; +} +EXPORT_SYMBOL_GPL(alloc_skb_for_msg); + +/** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents @@ -2508,7 +2633,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = ops->update(skb->data + offset, copy, csum); + csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, + skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -2535,9 +2661,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, frag->page_offset + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = ops->update(vaddr + p_off, p_len, 0); + csum2 = INDIRECT_CALL_1(ops->update, + csum_partial_ext, + vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = ops->combine(csum, csum2, pos, p_len); + csum = INDIRECT_CALL_1(ops->combine, + csum_block_add_ext, csum, + csum2, pos, p_len); pos += p_len; } @@ -2560,7 +2690,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, copy = len; csum2 = __skb_checksum(frag_iter, offset - start, copy, 0, ops); - csum = ops->combine(csum, csum2, pos, copy); + csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, + csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; diff --git a/net/core/sock.c b/net/core/sock.c index aa4a00d381e3..3e073ca6138f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1039,6 +1039,10 @@ set_rcvbuf: } break; + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -2843,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - RCU_INIT_POINTER(sk->sk_wq, sock->wq); + RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index be6092ac69f8..52d4faeee18b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* Make sure page count doesn't overflow. */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); + err = bpf_map_charge_init(&stab->map.memory, cost); if (err) goto free_stab; @@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (stab->sks) return &stab->map; err = -ENOMEM; + bpf_map_charge_finish(&stab->map.memory); free_stab: kfree(stab); return ERR_PTR(err); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index dc4aefdf2a08..9408f9264d05 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) return 0; } EXPORT_SYMBOL(reuseport_attach_prog); + +int reuseport_detach_prog(struct sock *sk) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return sk->sk_reuseport ? -ENOENT : -EINVAL; + + old_prog = NULL; + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_swap_protected(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); + spin_unlock_bh(&reuseport_lock); + + if (!old_prog) + return -ENOENT; + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_detach_prog); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8aab08b131d9..d7bf62ffbb5e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,6 +14,8 @@ #include <net/page_pool.h> #include <net/xdp.h> +#include <net/xdp_priv.h> /* struct xdp_mem_allocator */ +#include <trace/events/xdp.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN; static bool mem_id_init; /* false */ static struct rhashtable *mem_id_ht; -struct xdp_mem_allocator { - struct xdp_mem_info mem; - union { - void *allocator; - struct page_pool *page_pool; - struct zero_copy_allocator *zc_alloc; - }; - struct rhash_head node; - struct rcu_head rcu; -}; - static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; @@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); + /* Allocator have indicated safe to remove before this is called */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + page_pool_free(xa->page_pool); + /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Notice, driver is expected to free the *allocator, - * e.g. page_pool, and MUST also use RCU free. - */ - /* Poison memory */ xa->mem.id = 0xFFFF; xa->mem.type = 0xF0F0; @@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } +static bool __mem_id_disconnect(int id, bool force) +{ + struct xdp_mem_allocator *xa; + bool safe_to_remove = true; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); + return true; + } + xa->disconnect_cnt++; + + /* Detects in-flight packet-pages for page_pool */ + if (xa->mem.type == MEM_TYPE_PAGE_POOL) + safe_to_remove = page_pool_request_shutdown(xa->page_pool); + + trace_mem_disconnect(xa, safe_to_remove, force); + + if ((safe_to_remove || force) && + !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); + return (safe_to_remove|force); +} + +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (30 * HZ) +#define DEFER_MAX_RETRIES 120 + +static void mem_id_disconnect_defer_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); + bool force = false; + + if (xa->disconnect_cnt > DEFER_MAX_RETRIES) + force = true; + + if (__mem_id_disconnect(xa->mem.id, force)) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, xa->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + + pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", + __func__, xa->mem.id, xa->disconnect_cnt, sec); + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); +} + void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; + if (__mem_id_disconnect(id, false)) + return; + + /* Could not disconnect, defer new disconnect attempt to later */ mutex_lock(&mem_id_lock); xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + xa->defer_start = jiffies; + xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; + INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); mutex_unlock(&mem_id_lock); + schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); +/* This unregister operation will also cleanup and destroy the + * allocator. The page_pool_free() operation is first called when it's + * safe to remove, possibly deferred to a workqueue. + */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -301,12 +364,18 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { + ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); + xdp_rxq->mem.id = 0; errno = PTR_ERR(ptr); goto err; } + if (type == MEM_TYPE_PAGE_POOL) + page_pool_get(xdp_alloc->page_pool); + mutex_unlock(&mem_id_lock); + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; err: mutex_unlock(&mem_id_lock); @@ -333,10 +402,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) { + if (likely(xa)) { napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); } else { + /* Hopefully stack show who to blame for late return */ + WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); + trace_mem_return_failed(mem, page); put_page(page); } rcu_read_unlock(); @@ -379,6 +451,21 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + int xdp_attachment_query(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 85c10c8f50bd..1b7381ff787b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -830,7 +830,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index d449f78c1bd0..6e942dda1bcd 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -106,6 +106,7 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" select NET_DSA_TAG_8021Q + select PACKING help Say Y or M if you want to enable support for tagging frames with the NXP SJA1105 switch family. Both the native tagging protocol (which diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 820dd8da57fc..3abd173ebacb 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -257,7 +257,7 @@ static int dsa_port_setup(struct dsa_port *dp) enum devlink_port_flavour flavour; struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - int err; + int err = 0; if (dp->type == DSA_PORT_TYPE_UNUSED) return 0; @@ -295,19 +295,15 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_DSA: err = dsa_port_link_register_of(dp); - if (err) { + if (err) dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); - return err; - } break; case DSA_PORT_TYPE_USER: err = dsa_slave_create(dp); @@ -319,7 +315,10 @@ static int dsa_port_setup(struct dsa_port *dp) break; } - return 0; + if (err) + devlink_port_unregister(&dp->devlink_port); + + return err; } static void dsa_port_teardown(struct dsa_port *dp) @@ -347,7 +346,7 @@ static void dsa_port_teardown(struct dsa_port *dp) static int dsa_switch_setup(struct dsa_switch *ds) { - int err; + int err = 0; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus * driver and before ops->setup() has run, since the switch drivers and @@ -365,29 +364,41 @@ static int dsa_switch_setup(struct dsa_switch *ds) err = devlink_register(ds->devlink, ds->dev); if (err) - return err; + goto free_devlink; err = dsa_switch_register_notifier(ds); if (err) - return err; + goto unregister_devlink; err = ds->ops->setup(ds); if (err < 0) - return err; + goto unregister_notifier; if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); - if (!ds->slave_mii_bus) - return -ENOMEM; + if (!ds->slave_mii_bus) { + err = -ENOMEM; + goto unregister_notifier; + } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - return err; + goto unregister_notifier; } return 0; + +unregister_notifier: + dsa_switch_unregister_notifier(ds); +unregister_devlink: + devlink_unregister(ds->devlink); +free_devlink: + devlink_free(ds->devlink); + ds->devlink = NULL; + + return err; } static void dsa_switch_teardown(struct dsa_switch *ds) @@ -397,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); + if (ds->ops->teardown) + ds->ops->teardown(ds); + if (ds->devlink) { devlink_unregister(ds->devlink); devlink_free(ds->devlink); @@ -409,8 +423,8 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { struct dsa_switch *ds; struct dsa_port *dp; - int device, port; - int err; + int device, port, i; + int err = 0; for (device = 0; device < DSA_MAX_SWITCHES; device++) { ds = dst->ds[device]; @@ -419,18 +433,41 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) err = dsa_switch_setup(ds); if (err) - return err; + goto switch_teardown; for (port = 0; port < ds->num_ports; port++) { dp = &ds->ports[port]; err = dsa_port_setup(dp); if (err) - return err; + goto ports_teardown; } } return 0; + +ports_teardown: + for (i = 0; i < port; i++) + dsa_port_teardown(&ds->ports[i]); + + dsa_switch_teardown(ds); + +switch_teardown: + for (i = 0; i < device; i++) { + ds = dst->ds[i]; + if (!ds) + continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + dsa_port_teardown(dp); + } + + dsa_switch_teardown(ds); + } + + return err; } static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) @@ -492,17 +529,24 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_switches(dst); if (err) - return err; + goto teardown_default_cpu; err = dsa_tree_setup_master(dst); if (err) - return err; + goto teardown_switches; dst->setup = true; pr_info("DSA: tree %d setup\n", dst->index); return 0; + +teardown_switches: + dsa_tree_teardown_switches(dst); +teardown_default_cpu: + dsa_tree_teardown_default_cpu(dst); + + return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) @@ -543,8 +587,10 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, dst->ds[index] = ds; err = dsa_tree_setup(dst); - if (err) - dsa_tree_remove_switch(dst, index); + if (err) { + dst->ds[index] = NULL; + dsa_tree_put(dst); + } return err; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3986cedfafc0..b2be53a13aa0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -159,6 +159,23 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state); +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state); +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state); +void dsa_port_phylink_mac_an_restart(struct phylink_config *config); +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface); +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev); +extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 363eab6df51b..d2b65e8dc60c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -336,9 +336,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - /* Can be called from dsa_slave_port_obj_add() or - * dsa_slave_vlan_rx_add_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -354,12 +351,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - if (vlan->obj.orig_dev && netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - - /* Can be called from dsa_slave_port_obj_del() or - * dsa_slave_vlan_rx_kill_vid() - */ if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); @@ -418,6 +409,108 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } +void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_validate) + return; + + ds->ops->phylink_validate(ds, dp->index, supported, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); + +int dsa_port_phylink_mac_link_state(struct phylink_config *config, + struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + /* Only called for SGMII and 802.3z */ + if (!ds->ops->phylink_mac_link_state) + return -EOPNOTSUPP; + + return ds->ops->phylink_mac_link_state(ds, dp->index, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state); + +void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_config) + return; + + ds->ops->phylink_mac_config(ds, dp->index, mode, state); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); + +void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_an_restart) + return; + + ds->ops->phylink_mac_an_restart(ds, dp->index); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); + +void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct phy_device *phydev = NULL; + struct dsa_switch *ds = dp->ds; + + if (dsa_is_user_port(ds, dp->index)) + phydev = dp->slave->phydev; + + if (!ds->ops->phylink_mac_link_down) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); + +void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_link_up) { + if (ds->ops->adjust_link && phydev) + ds->ops->adjust_link(ds, dp->index, phydev); + return; + } + + ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); + +const struct phylink_mac_ops dsa_port_phylink_mac_ops = { + .validate = dsa_port_phylink_validate, + .mac_link_state = dsa_port_phylink_mac_link_state, + .mac_config = dsa_port_phylink_mac_config, + .mac_an_restart = dsa_port_phylink_mac_an_restart, + .mac_link_down = dsa_port_phylink_mac_link_down, + .mac_link_up = dsa_port_phylink_mac_link_up, +}; + static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) { struct dsa_switch *ds = dp->ds; @@ -495,8 +588,53 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp) return 0; } +static int dsa_port_phylink_register(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + struct device_node *port_dn = dp->dn; + int mode, err; + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + + dp->pl_config.dev = ds->dev; + dp->pl_config.type = PHYLINK_DEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), + mode, &dsa_port_phylink_mac_ops); + if (IS_ERR(dp->pl)) { + pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); + return PTR_ERR(dp->pl); + } + + err = phylink_of_phy_connect(dp->pl, port_dn, 0); + if (err && err != -ENODEV) { + pr_err("could not attach to PHY: %d\n", err); + goto err_phy_connect; + } + + rtnl_lock(); + phylink_start(dp->pl); + rtnl_unlock(); + + return 0; + +err_phy_connect: + phylink_destroy(dp->pl); + return err; +} + int dsa_port_link_register_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) + return dsa_port_phylink_register(dp); + + dev_warn(ds->dev, + "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); + if (of_phy_is_fixed_link(dp->dn)) return dsa_port_fixed_link_register_of(dp); else @@ -505,6 +643,16 @@ int dsa_port_link_register_of(struct dsa_port *dp) void dsa_port_link_unregister_of(struct dsa_port *dp) { + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->adjust_link) { + rtnl_lock(); + phylink_disconnect_phy(dp->pl); + rtnl_unlock(); + phylink_destroy(dp->pl); + return; + } + if (of_phy_is_fixed_link(dp->dn)) of_phy_deregister_fixed_link(dp->dn); else diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8157be7e162d..99673f6b07f6 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,7 +22,7 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(struct net_device *dev); +static bool dsa_slave_dev_check(const struct net_device *dev); /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) @@ -311,7 +311,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans) + struct switchdev_trans *trans, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; @@ -323,6 +324,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -333,6 +336,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), trans); break; @@ -352,6 +357,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: @@ -361,6 +368,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (obj->orig_dev != dev) + return -EOPNOTSUPP; err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: @@ -423,6 +432,8 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; + DSA_SKB_CB(skb)->clone = clone; + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) return; @@ -460,6 +471,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) u64_stats_update_end(&s->syncp); DSA_SKB_CB(skb)->deferred_xmit = false; + DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the * switch driver @@ -1160,98 +1172,6 @@ static struct device_type dsa_type = { .name = "dsa", }; -static void dsa_slave_phylink_validate(struct net_device *dev, - unsigned long *supported, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_validate) - return; - - ds->ops->phylink_validate(ds, dp->index, supported, state); -} - -static int dsa_slave_phylink_mac_link_state(struct net_device *dev, - struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - /* Only called for SGMII and 802.3z */ - if (!ds->ops->phylink_mac_link_state) - return -EOPNOTSUPP; - - return ds->ops->phylink_mac_link_state(ds, dp->index, state); -} - -static void dsa_slave_phylink_mac_config(struct net_device *dev, - unsigned int mode, - const struct phylink_link_state *state) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_config) - return; - - ds->ops->phylink_mac_config(ds, dp->index, mode, state); -} - -static void dsa_slave_phylink_mac_an_restart(struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - -static void dsa_slave_phylink_mac_link_down(struct net_device *dev, - unsigned int mode, - phy_interface_t interface) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); -} - -static void dsa_slave_phylink_mac_link_up(struct net_device *dev, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && dev->phydev) - ds->ops->adjust_link(ds, dp->index, dev->phydev); - return; - } - - ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); -} - -static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = { - .validate = dsa_slave_phylink_validate, - .mac_link_state = dsa_slave_phylink_mac_link_state, - .mac_config = dsa_slave_phylink_mac_config, - .mac_an_restart = dsa_slave_phylink_mac_an_restart, - .mac_link_down = dsa_slave_phylink_mac_link_down, - .mac_link_up = dsa_slave_phylink_mac_link_up, -}; - void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); @@ -1299,8 +1219,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) if (mode < 0) mode = PHY_INTERFACE_MODE_NA; - dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode, - &dsa_slave_phylink_mac_ops); + dp->pl_config.dev = &slave_dev->dev; + dp->pl_config.type = PHYLINK_NETDEV; + + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, + &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { netdev_err(slave_dev, "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); @@ -1494,7 +1417,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(struct net_device *dev) +static bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } @@ -1565,19 +1488,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, return NOTIFY_DONE; } -static int -dsa_slave_switchdev_port_attr_set_event(struct net_device *netdev, - struct switchdev_notifier_port_attr_info *port_attr_info) -{ - int err; - - err = dsa_slave_port_attr_set(netdev, port_attr_info->attr, - port_attr_info->trans); - - port_attr_info->handled = true; - return notifier_from_errno(err); -} - struct dsa_switchdev_event_work { struct work_struct work; struct switchdev_notifier_fdb_info fdb_info; @@ -1652,13 +1562,18 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct dsa_switchdev_event_work *switchdev_work; + int err; + + if (event == SWITCHDEV_PORT_ATTR_SET) { + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); + } if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; - if (event == SWITCHDEV_PORT_ATTR_SET) - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return NOTIFY_BAD; @@ -1688,41 +1603,28 @@ err_fdb_work_init: return NOTIFY_BAD; } -static int -dsa_slave_switchdev_port_obj_event(unsigned long event, - struct net_device *netdev, - struct switchdev_notifier_port_obj_info *port_obj_info) -{ - int err = -EOPNOTSUPP; - - switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: - err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, - port_obj_info->trans); - break; - case SWITCHDEV_PORT_OBJ_DEL: - err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); - break; - } - - port_obj_info->handled = true; - return notifier_from_errno(err); -} - static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - - if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + int err; switch (event) { - case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_ADD: + err = switchdev_handle_port_obj_add(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_add); + return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: - return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + err = switchdev_handle_port_obj_del(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_obj_del); + return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: - return dsa_slave_switchdev_port_attr_set_event(dev, ptr); + err = switchdev_handle_port_attr_set(dev, ptr, + dsa_slave_dev_check, + dsa_slave_port_attr_set); + return notifier_from_errno(err); } return NOTIFY_DONE; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 65a35e976d7b..6ebbd799c4eb 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, - struct packet_type *pt, u16 *tpid, u16 *tci) +/* In the DSA packet_type handler, skb->data points in the middle of the VLAN + * tag, after tpid and before tci. This is because so far, ETH_HLEN + * (DMAC, SMAC, EtherType) bytes were pulled. + * There are 2 bytes of VLAN tag left in skb->data, and upper + * layers expect the 'real' EtherType to be consumed as well. + * Coincidentally, a VLAN header is also of the same size as + * the number of bytes that need to be pulled. + * + * skb_mac_header skb->data + * | | + * v v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+-------+-------+-------+ + * | Destination MAC | Source MAC | TPID | TCI | EType | + * +-----------------------+-----------------------+-------+-------+-------+ + * ^ | | + * |<--VLAN_HLEN-->to <---VLAN_HLEN---> + * from | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ ^ + * (now part of | | + * skb->head) skb_mac_header skb->data + */ +struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) { - struct vlan_ethhdr *tag; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) - return NULL; + u8 *from = skb_mac_header(skb); + u8 *dest = from + VLAN_HLEN; - tag = vlan_eth_hdr(skb); - *tpid = ntohs(tag->h_vlan_proto); - *tci = ntohs(tag->h_vlan_TCI); - - /* skb->data points in the middle of the VLAN tag, - * after tpid and before tci. This is because so far, - * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - */ - skb_pull_rcsum(skb, VLAN_HLEN); + memmove(dest, from, ETH_HLEN - VLAN_HLEN); + skb_pull(skb, VLAN_HLEN); + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + skb_pull_rcsum(skb, ETH_HLEN); return skb; } -EXPORT_SYMBOL_GPL(dsa_8021q_rcv); +EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); static const struct dsa_device_ops dsa_8021q_netdev_ops = { .name = "8021q", diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d43737e6c3fb..1d96c9d4a8e9 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -13,6 +13,8 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) const struct ethhdr *hdr = eth_hdr(skb); u64 dmac = ether_addr_to_u64(hdr->h_dest); + if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META) + return false; if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) == SJA1105_LINKLOCAL_FILTER_A) return true; @@ -22,15 +24,61 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) return false; } +struct sja1105_meta { + u64 tstamp; + u64 dmac_byte_4; + u64 dmac_byte_3; + u64 source_port; + u64 switch_id; +}; + +static void sja1105_meta_unpack(const struct sk_buff *skb, + struct sja1105_meta *meta) +{ + u8 *buf = skb_mac_header(skb) + ETH_HLEN; + + /* UM10944.pdf section 4.2.17 AVB Parameters: + * Structure of the meta-data follow-up frame. + * It is in network byte order, so there are no quirks + * while unpacking the meta frame. + * + * Also SJA1105 E/T only populates bits 23:0 of the timestamp + * whereas P/Q/R/S does 32 bits. Since the structure is the + * same and the E/T puts zeroes in the high-order byte, use + * a unified unpacking command for both device series. + */ + packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0); + packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0); + packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0); + packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0); + packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); +} + +static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) +{ + const struct ethhdr *hdr = eth_hdr(skb); + u64 smac = ether_addr_to_u64(hdr->h_source); + u64 dmac = ether_addr_to_u64(hdr->h_dest); + + if (smac != SJA1105_META_SMAC) + return false; + if (dmac != SJA1105_META_DMAC) + return false; + if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META) + return false; + return true; +} + /* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it, and if we can, annotate skb->cb with how we - * plan to do that, so we don't need to check again in the rcv function. + * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { + if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + return true; if (sja1105_is_link_local(skb)) return true; - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_is_meta_frame(skb)) return true; return false; } @@ -62,25 +110,152 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static void sja1105_transfer_meta(struct sk_buff *skb, + const struct sja1105_meta *meta) +{ + struct ethhdr *hdr = eth_hdr(skb); + + hdr->h_dest[3] = meta->dmac_byte_3; + hdr->h_dest[4] = meta->dmac_byte_4; + SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; +} + +/* This is a simple state machine which follows the hardware mechanism of + * generating RX timestamps: + * + * After each timestampable skb (all traffic for which send_meta1 and + * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame + * containing a partial timestamp is immediately generated by the switch and + * sent as a follow-up to the link-local frame on the CPU port. + * + * The meta frames have no unique identifier (such as sequence number) by which + * one may pair them to the correct timestampable frame. + * Instead, the switch has internal logic that ensures no frames are sent on + * the CPU port between a link-local timestampable frame and its corresponding + * meta follow-up. It also ensures strict ordering between ports (lower ports + * have higher priority towards the CPU port). For this reason, a per-port + * data structure is not needed/desirable. + * + * This function pairs the link-local frame with its partial timestamp from the + * meta follow-up frame. The full timestamp will be reconstructed later in a + * work queue. + */ +static struct sk_buff +*sja1105_rcv_meta_state_machine(struct sk_buff *skb, + struct sja1105_meta *meta, + bool is_link_local, + bool is_meta) +{ + struct sja1105_port *sp; + struct dsa_port *dp; + + dp = dsa_slave_to_port(skb->dev); + sp = dp->priv; + + /* Step 1: A timestampable frame was received. + * Buffer it until we get its meta frame. + */ + if (is_link_local && sp->data->hwts_rx_en) { + spin_lock(&sp->data->meta_lock); + /* Was this a link-local frame instead of the meta + * that we were expecting? + */ + if (sp->data->stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Expected meta frame, is %12llx " + "in the DSA master multicast filter?\n", + SJA1105_META_DMAC); + } + + /* Hold a reference to avoid dsa_switch_rcv + * from freeing the skb. + */ + sp->data->stampable_skb = skb_get(skb); + spin_unlock(&sp->data->meta_lock); + + /* Tell DSA we got nothing */ + return NULL; + + /* Step 2: The meta frame arrived. + * Time to take the stampable skb out of the closet, annotate it + * with the partial timestamp, and pretend that we received it + * just now (basically masquerade the buffered frame as the meta + * frame, which serves no further purpose). + */ + } else if (is_meta) { + struct sk_buff *stampable_skb; + + spin_lock(&sp->data->meta_lock); + + stampable_skb = sp->data->stampable_skb; + sp->data->stampable_skb = NULL; + + /* Was this a meta frame instead of the link-local + * that we were expecting? + */ + if (!stampable_skb) { + dev_err_ratelimited(dp->ds->dev, + "Unexpected meta frame\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + if (stampable_skb->dev != skb->dev) { + dev_err_ratelimited(dp->ds->dev, + "Meta frame on wrong port\n"); + spin_unlock(&sp->data->meta_lock); + return NULL; + } + + /* Free the meta frame and give DSA the buffered stampable_skb + * for further processing up the network stack. + */ + kfree_skb(skb); + + skb = skb_copy(stampable_skb, GFP_ATOMIC); + if (!skb) { + dev_err_ratelimited(dp->ds->dev, + "Failed to copy stampable skb\n"); + return NULL; + } + sja1105_transfer_meta(skb, meta); + /* The cached copy will be freed now */ + skb_unref(stampable_skb); + + spin_unlock(&sp->data->meta_lock); + } + + return skb; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - struct ethhdr *hdr = eth_hdr(skb); - u64 source_port, switch_id; - struct sk_buff *nskb; + struct sja1105_meta meta = {0}; + int source_port, switch_id; + struct vlan_ethhdr *hdr; u16 tpid, vid, tci; + bool is_link_local; bool is_tagged; + bool is_meta; - nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci); - is_tagged = (nskb && tpid == ETH_P_SJA1105); - - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - vid = tci & VLAN_VID_MASK; + hdr = vlan_eth_hdr(skb); + tpid = ntohs(hdr->h_vlan_proto); + is_tagged = (tpid == ETH_P_SJA1105); + is_link_local = sja1105_is_link_local(skb); + is_meta = sja1105_is_meta_frame(skb); skb->offload_fwd_mark = 1; - if (sja1105_is_link_local(skb)) { + if (is_tagged) { + /* Normal traffic path. */ + tci = ntohs(hdr->h_vlan_TCI); + vid = tci & VLAN_VID_MASK; + source_port = dsa_8021q_rx_source_port(vid); + switch_id = dsa_8021q_rx_switch_id(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of * the incl_srcpt options. @@ -90,10 +265,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, /* Clear the DMAC bytes that were mangled by the switch */ hdr->h_dest[3] = 0; hdr->h_dest[4] = 0; + } else if (is_meta) { + sja1105_meta_unpack(skb, &meta); + source_port = meta.source_port; + switch_id = meta.switch_id; } else { - /* Normal traffic path. */ - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); + return NULL; } skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); @@ -106,10 +283,10 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). */ if (is_tagged) - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN, - ETH_HLEN - VLAN_HLEN); + skb = dsa_8021q_remove_header(skb); - return skb; + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, + is_meta); } static struct dsa_device_ops sja1105_netdev_ops = { diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 20b0bcb7e9e3..17374afee28f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -545,17 +545,10 @@ unsigned char * __weak arch_get_platform_mac_address(void) int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { - const unsigned char *addr; - struct device_node *dp; + const unsigned char *addr = NULL; - if (dev_is_pci(dev)) - dp = pci_device_to_OF_node(to_pci_dev(dev)); - else - dp = dev->of_node; - - addr = NULL; - if (dp) - addr = of_get_mac_address(dp); + if (dev->of_node) + addr = of_get_mac_address(dev->of_node); if (IS_ERR_OR_NULL(addr)) addr = arch_get_platform_mac_address(); @@ -563,6 +556,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return -ENODEV; ether_addr_copy(mac_addr, addr); + return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 9133e3cede77..e4aba5d485be 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -74,7 +74,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, key.src = *src; key.dst = *dst; - q = inet_frag_find(&ieee802154_lowpan->frags, &key); + q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; @@ -134,7 +134,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { @@ -321,23 +321,18 @@ err: static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", - .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", - .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", - .data = &init_net.ieee802154_lowpan.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -372,17 +367,17 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) if (table == NULL) goto err_alloc; - table[0].data = &ieee802154_lowpan->frags.high_thresh; - table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; - table[1].data = &ieee802154_lowpan->frags.low_thresh; - table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; - table[2].data = &ieee802154_lowpan->frags.timeout; - /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table[0].procname = NULL; } + table[0].data = &ieee802154_lowpan->fqdir->high_thresh; + table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; + table[1].data = &ieee802154_lowpan->fqdir->low_thresh; + table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; + table[2].data = &ieee802154_lowpan->fqdir->timeout; + hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); if (hdr == NULL) goto err_reg; @@ -449,32 +444,42 @@ static int __net_init lowpan_frags_init_net(struct net *net) net_ieee802154_lowpan(net); int res; - ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; - ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - ieee802154_lowpan->frags.f = &lowpan_frags; - res = inet_frags_init_net(&ieee802154_lowpan->frags); + res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; + + ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = lowpan_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); return res; } +static void __net_exit lowpan_frags_pre_exit_net(struct net *net) +{ + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); + + fqdir_pre_exit(ieee802154_lowpan->fqdir); +} + static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags); + fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { - .init = lowpan_frags_init_net, - .exit = lowpan_frags_exit_net, + .init = lowpan_frags_init_net, + .pre_exit = lowpan_frags_pre_exit_net, + .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) @@ -539,7 +544,7 @@ err_sysctl: void lowpan_net_frag_exit(void) { - inet_frags_fini(&lowpan_frags); lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); + inet_frags_fini(&lowpan_frags); } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 000a61994c8f..d57ecfaf89d4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o + metrics.o netlink.o nexthop.o obj-$(CONFIG_BPFILTER) += bpfilter/ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 52bdb881a506..ed2301ef872e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -784,10 +784,8 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_getname); -int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +int inet_send_prepare(struct sock *sk) { - struct sock *sk = sock->sk; - sock_rps_record_flow(sk); /* We may need to bind the socket. */ @@ -795,7 +793,19 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->sendmsg(sk, msg, size); + return 0; +} +EXPORT_SYMBOL_GPL(inet_send_prepare); + +int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, + sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); @@ -804,11 +814,7 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, { struct sock *sk = sock->sk; - sock_rps_record_flow(sk); - - /* We may need to bind the socket. */ - if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && - inet_autobind(sk)) + if (unlikely(inet_send_prepare(sk))) return -EAGAIN; if (sk->sk_prot->sendpage) @@ -817,6 +823,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(inet_sendpage); +INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -827,8 +835,9 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); - err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, &addr_len); + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, + sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 9c3afd550612..974179b3b314 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -590,8 +590,7 @@ static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ah_type, AF_INET); } module_init(ah4_init); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c5ebfa199794..a4b5bd4d2c89 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -195,7 +195,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); @@ -301,8 +302,8 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { - struct in_ifaddr *ifa; struct net_device *dev; + struct in_ifaddr *ifa; ASSERT_RTNL(); @@ -312,7 +313,7 @@ static void inetdev_destroy(struct in_device *in_dev) ip_mc_destroy_dev(in_dev); - while ((ifa = in_dev->ifa_list) != NULL) { + while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } @@ -328,30 +329,35 @@ static void inetdev_destroy(struct in_device *in_dev) int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { + const struct in_ifaddr *ifa; + rcu_read_lock(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } - } endfor_ifa(in_dev); + } rcu_read_unlock(); return 0; } -static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 portid) +static void __inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa, *ifa1 = *ifap; - struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *ifa, *ifa1; + struct in_ifaddr *last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); + ifa1 = rtnl_dereference(*ifap); + last_prim = rtnl_dereference(in_dev->ifa_list); if (in_dev->dead) goto no_promotions; @@ -360,9 +366,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr **ifap1 = &ifa1->ifa_next; + struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; - while ((ifa = *ifap1) != NULL) { + while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -395,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ - for (ifa = promote; ifa; ifa = ifa->ifa_next) { + for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); @@ -421,19 +427,25 @@ no_promotions: blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { - struct in_ifaddr *next_sec = promote->ifa_next; + struct in_ifaddr *next_sec; + next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { - prev_prom->ifa_next = promote->ifa_next; - promote->ifa_next = last_prim->ifa_next; - last_prim->ifa_next = promote; + struct in_ifaddr *last_sec; + + rcu_assign_pointer(prev_prom->ifa_next, next_sec); + + last_sec = rtnl_dereference(last_prim->ifa_next); + rcu_assign_pointer(promote->ifa_next, last_sec); + rcu_assign_pointer(last_prim->ifa_next, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; + ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -445,7 +457,8 @@ no_promotions: inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -458,9 +471,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { + struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap, **last_primary; struct in_validator_info ivi; + struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); @@ -476,8 +490,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + ifap = &in_dev->ifa_list; + ifa1 = rtnl_dereference(*ifap); + + while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; @@ -493,6 +509,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + + ifap = &ifa1->ifa_next; + ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh @@ -518,8 +537,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifap = last_primary; } - ifa->ifa_next = *ifap; - *ifap = ifa; + rcu_assign_pointer(ifa->ifa_next, *ifap); + rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); @@ -584,12 +603,14 @@ EXPORT_SYMBOL(inetdev_by_index); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { + struct in_ifaddr *ifa; + ASSERT_RTNL(); - for_primary_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; - } endfor_ifa(in_dev); + } return NULL; } @@ -617,10 +638,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; - struct in_ifaddr *ifa, **ifap; + struct in_ifaddr *ifa; + int err = -EINVAL; ASSERT_RTNL(); @@ -637,7 +660,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -725,15 +748,19 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap; + struct in_ifaddr __rcu **ifap; + struct in_ifaddr *tmp; - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &(*ifap)->ifa_next) { - if (*ifap == ifa) { + ifap = &ifa->ifa_dev->ifa_list; + tmp = rtnl_dereference(*ifap); + while (tmp) { + if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } + ifap = &tmp->ifa_next; + tmp = rtnl_dereference(*ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -877,13 +904,12 @@ errout: static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap; + struct in_ifaddr *ifa1; if (!ifa->ifa_local) return NULL; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -978,8 +1004,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; + struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; - struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; @@ -1050,7 +1076,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1063,7 +1091,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; @@ -1212,7 +1241,7 @@ out: static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; @@ -1222,7 +1251,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s if (!in_dev) goto out; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (!buf) { done += size; continue; @@ -1250,18 +1279,24 @@ out: static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; - } endfor_ifa(in_dev); + } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { + const struct in_ifaddr *ifa; __be32 addr = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net = dev_net(dev); int master_idx; @@ -1271,8 +1306,13 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!in_dev) goto no_in_dev; - for_primary_ifa(in_dev) { - if (ifa->ifa_scope > scope) + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; @@ -1280,7 +1320,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) } if (!addr) addr = ifa->ifa_local; - } endfor_ifa(in_dev); + } if (addr) goto out_unlock; @@ -1325,13 +1365,20 @@ EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { - int same = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; + const struct in_ifaddr *ifa; __be32 addr = 0; + int same = 0; + + if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) + localnet_scope = RT_SCOPE_LINK; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); - for_ifa(in_dev) { if (!addr && (local == ifa->ifa_local || !local) && - ifa->ifa_scope <= scope) { + min_scope <= scope) { addr = ifa->ifa_local; if (same) break; @@ -1346,7 +1393,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ - if (ifa->ifa_scope <= scope) { + if (min_scope <= scope) { addr = ifa->ifa_local; break; } @@ -1354,7 +1401,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, same = 0; } } - } endfor_ifa(in_dev); + } return same ? addr : 0; } @@ -1428,7 +1475,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) struct in_ifaddr *ifa; int named = 0; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); @@ -1458,10 +1505,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { - struct in_ifaddr *ifa; + const struct in_ifaddr *ifa; - for (ifa = in_dev->ifa_list; ifa; - ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, @@ -1731,15 +1777,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, int ip_idx = 0; int err; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (ip_idx < s_ip_idx) { + ip_idx++; continue; - + } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; } err = 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b9ae95576084..5c967764041f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -33,8 +33,6 @@ struct esp_output_extra { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. * @@ -506,7 +504,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -788,28 +786,6 @@ out: return err; } -static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - switch (x->props.mode) { - case XFRM_MODE_TRANSPORT: - case XFRM_MODE_BEET: - net_adj = sizeof(struct iphdr); - break; - case XFRM_MODE_TUNNEL: - net_adj = 0; - break; - default: - BUG(); - } - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); @@ -1035,7 +1011,6 @@ static const struct xfrm_type esp_type = .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp_init_state, .destructor = esp_destroy, - .get_mtu = esp4_get_mtu, .input = esp_input, .output = esp_output, }; @@ -1066,8 +1041,7 @@ static void __exit esp4_fini(void) { if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp_type, AF_INET); } module_init(esp4_init); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 2e5e377f50a1..0e4a7cf6bc87 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -312,9 +312,7 @@ static int __init esp4_offload_init(void) static void __exit esp4_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e54c2bcbb465..317339cd7f03 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -39,6 +39,7 @@ #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> @@ -188,7 +189,7 @@ int fib_unmerge(struct net *net) return 0; } -static void fib_flush(struct net *net) +void fib_flush(struct net *net) { int flushed = 0; unsigned int h; @@ -230,7 +231,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { - if (!dev || dev == res.fi->fib_dev) + struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); + + if (!dev || dev == nhc->nhc_dev) ret = res.type; } } @@ -317,19 +320,19 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) #ifdef CONFIG_IP_ROUTE_MULTIPATH int ret; - for (ret = 0; ret < fi->fib_nhs; ret++) { - struct fib_nh *nh = &fi->fib_nh[ret]; + for (ret = 0; ret < fib_info_num_path(fi); ret++) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); - if (nh->fib_nh_dev == dev) { + if (nhc->nhc_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].fib_nh_dev == dev) + if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif @@ -536,14 +539,22 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { - struct in_ifaddr *ifa; - struct in_device *in_dev = __in_dev_get_rtnl(dev); + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; + *colon = ':'; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; + } + rcu_read_unlock(); + if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; @@ -641,6 +652,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, @@ -659,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, @@ -796,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (err < 0) goto errout; break; + case RTA_NH_ID: + cfg->fc_nh_id = nla_get_u32(attr); + break; + } + } + + if (cfg->fc_nh_id) { + if (cfg->fc_oif || cfg->fc_gw_family || + cfg->fc_encap || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + return -EINVAL; } } @@ -822,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + err = -EINVAL; + goto errout; + } + tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); @@ -881,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } + if (rtm->rtm_flags & RTM_F_CLONED) + filter->dump_routes = false; + else + filter->dump_exceptions = false; filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); filter->flags = rtm->rtm_flags; @@ -931,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct fib_dump_filter filter = { .dump_routes = true, + .dump_exceptions = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct fib_dump_filter filter = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; @@ -950,8 +987,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } - /* fib entries are never clones and ipv4 does not use prefix flag */ - if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED)) + /* ipv4 does not use prefix flag */ + if (filter.flags & RTM_F_PREFIX) return skb->len; if (filter.table_id) { @@ -1172,8 +1209,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) * * Scan address list to be sure that addresses are really gone. */ - - for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; @@ -1241,6 +1278,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } } + rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) @@ -1410,6 +1448,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); + struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { @@ -1424,9 +1463,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_UP: - for_ifa(in_dev) { + in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); - } endfor_ifa(in_dev); + } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 7945f0534db7..a68b5e21ec51 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/list.h> #include <net/ip_fib.h> +#include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a38e86b98e4f..b43a7ba5c6a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -27,6 +27,7 @@ #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/fib_rules.h> struct fib4_rule { @@ -141,8 +142,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; - if (result->fi) - dev = result->fi->fib_dev; + if (result->fi) { + struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); + + dev = nhc->nhc_dev; + } /* do not accept result if the route does * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index bfa49a88d03a..2db089e10ba0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -38,6 +38,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> +#include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> @@ -56,18 +57,21 @@ static unsigned int fib_info_cnt; #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +/* for_nexthops and change_nexthops only used when nexthop object + * is not set in a fib_info. The logic within can reference fib_nh. + */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ - nhsel < (fi)->fib_nhs; \ + nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -228,9 +232,13 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - change_nexthops(fi) { - fib_nh_release(fi->fib_net, nexthop_nh); - } endfor_nexthops(fi); + if (fi->nh) { + nexthop_put(fi->nh); + } else { + change_nexthops(fi) { + fib_nh_release(fi->fib_net, nexthop_nh); + } endfor_nexthops(fi); + } ip_fib_metrics_put(fi->fib_metrics); @@ -256,22 +264,34 @@ void fib_release_info(struct fib_info *fi) hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); - change_nexthops(fi) { - if (!nexthop_nh->fib_nh_dev) - continue; - hlist_del(&nexthop_nh->nh_hash); - } endfor_nexthops(fi) + if (fi->nh) { + list_del(&fi->nh_list); + } else { + change_nexthops(fi) { + if (!nexthop_nh->fib_nh_dev) + continue; + hlist_del(&nexthop_nh->nh_hash); + } endfor_nexthops(fi) + } fi->fib_dead = 1; fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); } -static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { - const struct fib_nh *onh = ofi->fib_nh; + const struct fib_nh *onh; + + if (fi->nh || ofi->nh) + return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; + + if (ofi->fib_nhs == 0) + return 0; for_nexthops(fi) { + onh = fib_info_nh(ofi, nhsel); + if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || @@ -292,8 +312,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; - - onh++; } endfor_nexthops(fi); return 0; } @@ -307,22 +325,78 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) (val >> (DEVINDEX_HASHBITS * 2))) & mask; } -static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, + u32 prefsrc, u32 priority) { - unsigned int mask = (fib_info_hash_size - 1); - unsigned int val = fi->fib_nhs; + unsigned int val = init_val; - val ^= (fi->fib_protocol << 8) | fi->fib_scope; - val ^= (__force u32)fi->fib_prefsrc; - val ^= fi->fib_priority; - for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); - } endfor_nexthops(fi) + val ^= (protocol << 8) | scope; + val ^= prefsrc; + val ^= priority; + + return val; +} + +static unsigned int fib_info_hashfn_result(unsigned int val) +{ + unsigned int mask = (fib_info_hash_size - 1); return (val ^ (val >> 7) ^ (val >> 12)) & mask; } -static struct fib_info *fib_find_info(const struct fib_info *nfi) +static inline unsigned int fib_info_hashfn(struct fib_info *fi) +{ + unsigned int val; + + val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, + fi->fib_scope, (__force u32)fi->fib_prefsrc, + fi->fib_priority); + + if (fi->nh) { + val ^= fib_devindex_hashfn(fi->nh->id); + } else { + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->fib_nh_oif); + } endfor_nexthops(fi) + } + + return fib_info_hashfn_result(val); +} + +/* no metrics, only nexthop id */ +static struct fib_info *fib_find_info_nh(struct net *net, + const struct fib_config *cfg) +{ + struct hlist_head *head; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + cfg->fc_protocol, cfg->fc_scope, + (__force u32)cfg->fc_prefsrc, + cfg->fc_priority); + hash = fib_info_hashfn_result(hash); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, head, fib_hash) { + if (!net_eq(fi->fib_net, net)) + continue; + if (!fi->nh || fi->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_protocol == fi->fib_protocol && + cfg->fc_scope == fi->fib_scope && + cfg->fc_prefsrc == fi->fib_prefsrc && + cfg->fc_priority == fi->fib_priority && + cfg->fc_type == fi->fib_type && + cfg->fc_table == fi->fib_tb_id && + !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) + return fi; + } + + return NULL; +} + +static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head; struct fib_info *fi; @@ -344,7 +418,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && - (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + nh_comp(fi, nfi) == 0) return fi; } @@ -386,34 +460,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - if (fi->fib_nhs) { + if (fi->nh) + payload += nla_total_size(4); /* RTA_NH_ID */ + + if (nhs) { size_t nh_encapsize = 0; - /* Also handles the special case fib_nhs == 1 */ + /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ - for_nexthops(fi) { - if (nh->fib_nh_lws) { + for (i = 0; i < fib_info_num_path(fi); i++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, i); + + if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->fib_nh_lws); + nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } - } endfor_nexthops(fi); + } /* all nexthops are packed in a nested attribute */ - payload += nla_total_size((fi->fib_nhs * nhsize) + - nh_encapsize); + payload += nla_total_size((nhs * nhsize) + nh_encapsize); } @@ -574,12 +654,14 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, return nhs; } +/* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; + struct fib_nh *nh; int ret; change_nexthops(fi) { @@ -642,24 +724,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, } endfor_nexthops(fi); ret = -EINVAL; - if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { + nh = fib_info_nh(fi, 0); + if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { - if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && - fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; @@ -670,12 +753,13 @@ errout: return ret; } +/* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; - if (fi->fib_nhs < 2) + if (fib_info_num_path(fi) < 2) return; total = 0; @@ -756,28 +840,36 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; + if (cfg->fc_nh_id) { + if (fi->nh && cfg->fc_nh_id == fi->nh->id) + return 0; + return 1; + } + if (cfg->fc_oif || cfg->fc_gw_family) { + struct fib_nh *nh = fib_info_nh(fi, 0); + if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, - fi->fib_nh, cfg, extack)) + nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && - cfg->fc_flow != fi->fib_nh->nh_tclassid) + cfg->fc_flow != nh->nh_tclassid) return 1; #endif - if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && - cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && - cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && - ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; @@ -1088,15 +1180,13 @@ out: return err; } -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, + struct netlink_ext_ack *extack) { - struct net *net = cfg->fc_nlinfo.nl_net; - u32 table = cfg->fc_table; int err; if (nh->fib_nh_gw_family == AF_INET) - err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else @@ -1187,11 +1277,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } -__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, + unsigned char scope) { - nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, - nh->fib_nh_gw4, - nh->nh_parent->fib_scope); + struct fib_nh *nh; + + if (nhc->nhc_family != AF_INET) + return inet_select_addr(nhc->nhc_dev, 0, scope); + + nh = container_of(nhc, struct fib_nh, nh_common); + nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; @@ -1200,16 +1295,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; - struct fib_nh *nh; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; - nh = container_of(nhc, struct fib_nh, nh_common); - if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) - return nh->nh_saddr; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + } - return fib_info_update_nh_saddr(net, nh); + return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) @@ -1241,6 +1339,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, { int err; struct fib_info *fi = NULL; + struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1260,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } + if (cfg->fc_nh_id) { + if (!cfg->fc_mx) { + fi = fib_find_info_nh(net, cfg); + if (fi) { + fi->fib_treeref++; + return fi; + } + } + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + nhs = 0; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); @@ -1295,7 +1411,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, extack); - if (unlikely(IS_ERR(fi->fib_metrics))) { + if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); @@ -1312,14 +1428,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; - change_nexthops(fi) { - nexthop_nh->nh_parent = fi; - } endfor_nexthops(fi) + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = 0; + fi->nh = nh; + } + } else { + change_nexthops(fi) { + nexthop_nh->nh_parent = fi; + } endfor_nexthops(fi) - if (cfg->fc_mp) - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - else - err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + if (cfg->fc_mp) + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, + extack); + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); + } if (err != 0) goto failure; @@ -1350,7 +1477,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - if (cfg->fc_scope == RT_SCOPE_HOST) { + if (fi->nh) { + err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); + if (err) + goto failure; + } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ @@ -1365,7 +1496,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; - nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); + nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; @@ -1373,7 +1504,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, nexthop_nh, extack); + err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, + cfg->fc_table, cfg->fc_scope, + extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) @@ -1388,13 +1521,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; } - change_nexthops(fi) { - fib_info_update_nh_saddr(net, nexthop_nh); - if (nexthop_nh->fib_nh_gw_family == AF_INET6) - fi->fib_nh_is_v6 = true; - } endfor_nexthops(fi) + if (!fi->nh) { + change_nexthops(fi) { + fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, + fi->fib_scope); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; + } endfor_nexthops(fi) - fib_rebalance(fi); + fib_rebalance(fi); + } link_it: ofi = fib_find_info(fi); @@ -1416,16 +1552,20 @@ link_it: head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; hlist_add_head(&fi->fib_lhash, head); } - change_nexthops(fi) { - struct hlist_head *head; - unsigned int hash; + if (fi->nh) { + list_add(&fi->nh_list, &nh->fi_list); + } else { + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; - if (!nexthop_nh->fib_nh_dev) - continue; - hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_add_head(&nexthop_nh->nh_hash, head); - } endfor_nexthops(fi) + if (!nexthop_nh->fib_nh_dev) + continue; + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nexthop_nh->nh_hash, head); + } endfor_nexthops(fi) + } spin_unlock_bh(&fib_info_lock); return fi; @@ -1552,6 +1692,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) if (!mp) goto nla_put_failure; + if (unlikely(fi->nh)) { + if (nexthop_mpath_fill_node(skb, fi->nh) < 0) + goto nla_put_failure; + goto mp_end; + } + for_nexthops(fi) { if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) goto nla_put_failure; @@ -1562,6 +1708,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif } endfor_nexthops(fi); +mp_end: nla_nest_end(skb, mp); return 0; @@ -1580,6 +1727,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { + unsigned int nhs = fib_info_num_path(fi); struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1615,18 +1763,31 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; - if (fi->fib_nhs == 1) { - struct fib_nh *nh = &fi->fib_nh[0]; + + if (fi->nh) { + if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) + goto nla_put_failure; + if (nexthop_is_blackhole(fi->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + } + + if (nhs == 1) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; - if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) + if (fib_nexthop_info(skb, nhc, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; + } #endif } else { if (fib_add_multipath(skb, fi) < 0) @@ -1709,7 +1870,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh, * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ -static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) +void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; @@ -1745,7 +1906,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev) - nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1754,6 +1915,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + * + * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { @@ -1835,6 +1998,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + struct fib_nh *nh; if (fa->fa_slen != slen) continue; @@ -1856,8 +2020,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].fib_nh_gw4 || - next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) + + nh = fib_info_nh(next_fi, 0); + if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1899,6 +2064,8 @@ out: /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. + * + * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { @@ -1993,6 +2160,11 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; + if (unlikely(res->fi->nh)) { + nexthop_path_fib_result(res, hash); + return; + } + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { if (!fib_good_nh(nexthop_nh)) @@ -2021,7 +2193,7 @@ void fib_select_path(struct net *net, struct fib_result *res, goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1) { + if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 868c74771fa9..2b2b3d291ab0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -338,12 +338,18 @@ static struct tnode *tnode_alloc(int bits) static inline void empty_child_inc(struct key_vector *n) { - ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; + tn_info(n)->empty_children++; + + if (!tn_info(n)->empty_children) + tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { - tn_info(n)->empty_children-- ? : tn_info(n)->full_children--; + if (!tn_info(n)->empty_children) + tn_info(n)->full_children--; + + tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) @@ -1449,6 +1455,7 @@ found: fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { +out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif @@ -1457,7 +1464,13 @@ found: } if (fi->fib_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + + if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; + } + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); if (nhc->nhc_flags & RTNH_F_DEAD) @@ -1931,6 +1944,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) return found; } +/* derived from fib_trie_free */ +static void __fib_info_notify_update(struct net *net, struct fib_table *tb, + struct nl_info *info) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct fib_alias *fa; + + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + pn = node_parent(pn); + cindex = get_index(pkey, pn); + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) + continue; + + rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, + info, NLM_F_REPLACE); + + /* call_fib_entry_notifiers will be removed when + * in-kernel notifier is implemented and supported + * for nexthop objects + */ + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + n->key, + KEYLENGTH - fa->fa_slen, fa, + NULL); + } + } +} + +void fib_info_notify_update(struct net *net, struct nl_info *info) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + __fib_info_notify_update(net, tb, info); + } +} + static void fib_leaf_notify(struct net *net, struct key_vector *l, struct fib_table *tb, struct notifier_block *nb) { @@ -2006,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); + int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; - int i, s_i; - if (filter->filter_set) + if (filter->filter_set || + !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; + s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - int err; + struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; + i_fa = 0; + if (tb->tb_id != fa->tb_id) goto next; @@ -2030,29 +2118,49 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, goto next; if ((filter->protocol && - fa->fa_info->fib_protocol != filter->protocol)) + fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && - !fib_info_nh_uses_dev(fa->fa_info, filter->dev)) + !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } - err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, KEYLENGTH - fa->fa_slen, - fa->fa_tos, fa->fa_info, flags); - if (err < 0) { - cb->args[4] = i; - return err; + if (filter->dump_routes) { + if (!s_fa) { + err = fib_dump_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, + KEYLENGTH - fa->fa_slen, + fa->fa_tos, fi, flags); + if (err < 0) + goto stop; + } + + i_fa++; } + + if (filter->dump_exceptions) { + err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, + &i_fa, s_fa); + if (err < 0) + goto stop; + } + next: i++; } cb->args[4] = i; return skb->len; + +stop: + cb->args[4] = i; + cb->args[5] = i_fa; + return err; } /* rcu_read_lock needs to be hold by caller from readside */ @@ -2634,14 +2742,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->fib_nh_gw4) - flags |= RTF_GATEWAY; + if (fi) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + + if (nhc->nhc_gw.ipv4) + flags |= RTF_GATEWAY; + } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; @@ -2672,7 +2784,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - const struct fib_info *fi = fa->fa_info; + struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); @@ -2685,26 +2797,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_setwidth(seq, 127); - if (fi) + if (fi) { + struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + __be32 gw = 0; + + if (nhc->nhc_gw_family == AF_INET) + gw = nhc->nhc_gw.ipv4; + seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->fib_nh_gw4, flags, 0, 0, + nhc->nhc_dev ? nhc->nhc_dev->name : "*", + prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); - else + } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); - + } seq_pad(seq, '\n'); } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 293acfb36376..44bfeecac33e 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { if (!skb_checksum_simple_validate(skb)) { - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } else if (csum_err) { *csum_err = true; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7c857c72aad1..1510e951f451 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -201,7 +201,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return *this_cpu_ptr(net->ipv4.icmp_sk); + return this_cpu_read(*net->ipv4.icmp_sk); } /* Called with BH disabled */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85107bf812f2..180f6896b98b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -332,14 +332,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; - } endfor_ifa(in_dev); + } return htonl(INADDR_ANY); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7fd6db3fe366..f5c163d4771b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -649,8 +649,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) EXPORT_SYMBOL(inet_rtx_syn_ack); /* return true if req was found in the ehash table */ -static bool reqsk_queue_unlink(struct request_sock_queue *queue, - struct request_sock *req) +static bool reqsk_queue_unlink(struct request_sock *req) { struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; @@ -669,7 +668,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { - if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + if (reqsk_queue_unlink(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ce6969896f5..d666756be5f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -106,48 +106,90 @@ int inet_frags_init(struct inet_frags *f) if (!f->frags_cachep) return -ENOMEM; + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - /* We must wait that all inet_frag_destroy_rcu() have completed. */ - rcu_barrier(); + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; + int count; - /* If we can not cancel the timer, it means this frag_queue - * is already disappearing, we have nothing to do. - * Otherwise, we own a refcount until the end of this function. - */ - if (!del_timer(&fq->timer)) - return; + count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; - refcount_dec(&fq->refcnt); + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } spin_unlock_bh(&fq->lock); - inet_frag_put(fq); + if (refcount_sub_and_test(count, &fq->refcnt)) + inet_frag_destroy(fq); +} + +static void fqdir_work_fn(struct work_struct *work) +{ + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + struct inet_frags *f = fqdir->f; + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? :) + */ + rcu_barrier(); + + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + kfree(fqdir); } -void inet_frags_exit_net(struct netns_frags *nf) +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { - nf->high_thresh = 0; /* prevent creation of new frags */ + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; - rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); + if (!fqdir) + return -ENOMEM; + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; + return 0; } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_init); + +void fqdir_exit(struct fqdir *fqdir) +{ + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(system_wq, &fqdir->destroy_work); +} +EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -155,11 +197,23 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - struct netns_frags *nf = fq->net; + struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; - rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); - refcount_dec(&fq->refcnt); + rcu_read_lock(); + /* The RCU read lock provides a memory barrier + * guaranteeing that if fqdir->dead is false then + * the hash table destruction will not start until + * after we unlock. Paired with inet_frags_exit_net(). + */ + if (!fqdir->dead) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + refcount_dec(&fq->refcnt); + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); @@ -168,7 +222,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); - struct inet_frags *f = q->net->f; + struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); @@ -199,7 +253,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct netns_frags *nf; + struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; struct inet_frags *f; @@ -207,18 +261,18 @@ void inet_frag_destroy(struct inet_frag_queue *q) WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ - nf = q->net; - f = nf->f; + fqdir = q->fqdir; + f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); - sub_frag_mem_limit(nf, sum); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { @@ -228,9 +282,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (!q) return NULL; - q->net = nf; + q->fqdir = fqdir; f->constructor(q, arg); - add_frag_mem_limit(nf, f->qsize); + add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); @@ -239,21 +293,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, return q; } -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { - struct inet_frags *f = nf->f; + struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; - q = inet_frag_alloc(nf, f, arg); + q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } - mod_timer(&q->timer, jiffies + nf->timeout); + mod_timer(&q->timer, jiffies + fqdir->timeout); - *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; @@ -265,18 +319,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { struct inet_frag_queue *fq = NULL, *prev; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh) return NULL; rcu_read_lock(); - prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) - fq = inet_frag_create(nf, key, &prev); + fq = inet_frag_create(fqdir, key, &prev); if (prev && !IS_ERR(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) @@ -387,7 +441,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, delta += head->truesize; if (delta) - add_frag_mem_limit(q->net, delta); + add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part @@ -409,7 +463,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(q->net, clone->truesize); + add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { @@ -462,7 +516,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); *nextp = NULL; skb_mark_not_on_list(head); @@ -490,7 +544,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) if (head == q->fragments_tail) q->fragments_tail = NULL; - sub_frag_mem_limit(q->net, head->truesize); + sub_frag_mem_limit(q->fqdir, head->truesize); return head; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c4503073248b..97824864e40d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -316,7 +316,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cf2b0a6a3337..4385eb9e781f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, - frags); - struct net *net = container_of(ipv4, struct net, ipv4); + struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->net->max_dist ? + qp->peer = q->fqdir->max_dist ? inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -142,9 +140,13 @@ static void ip_expire(struct timer_list *t) int err; qp = container_of(frag, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); + net = qp->q.fqdir->net; rcu_read_lock(); + + if (qp->q.fqdir->dead) + goto out_rcu_unlock; + spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -211,7 +213,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->ipv4.frags, &key); + q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; @@ -222,7 +224,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = qp->q.net->max_dist; + unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; @@ -236,12 +238,8 @@ static int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments_tail && (end - start) > max; - if (rc) { - struct net *net; - - net = container_of(qp->q.net, struct net, ipv4.frags); - __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - } + if (rc) + __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } @@ -250,13 +248,13 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); + sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -273,7 +271,7 @@ static int ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; @@ -352,7 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); + add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; @@ -399,7 +397,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; @@ -544,30 +542,24 @@ static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", - .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", - .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", - .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", - .data = &init_net.ipv4.frags.max_dist, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -600,13 +592,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv4.frags.high_thresh; - table[0].extra1 = &net->ipv4.frags.low_thresh; - table[1].data = &net->ipv4.frags.low_thresh; - table[1].extra2 = &net->ipv4.frags.high_thresh; - table[2].data = &net->ipv4.frags.timeout; - table[3].data = &net->ipv4.frags.max_dist; } + table[0].data = &net->ipv4.fqdir->high_thresh; + table[0].extra1 = &net->ipv4.fqdir->low_thresh; + table[1].data = &net->ipv4.fqdir->low_thresh; + table[1].extra2 = &net->ipv4.fqdir->high_thresh; + table[2].data = &net->ipv4.fqdir->timeout; + table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) @@ -654,6 +646,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) { int res; + res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); + if (res < 0) + return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -668,36 +663,38 @@ static int __net_init ipv4_frags_init_net(struct net *net) * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 4 * 1024 * 1024; - net->ipv4.frags.low_thresh = 3 * 1024 * 1024; + net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; + net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ - net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.fqdir->timeout = IP_FRAG_TIME; - net->ipv4.frags.max_dist = 64; - net->ipv4.frags.f = &ip4_frags; + net->ipv4.fqdir->max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res < 0) - return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); return res; } +static void __net_exit ipv4_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv4.fqdir); +} + static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { - .init = ipv4_frags_init_net, - .exit = ipv4_frags_exit_net, + .init = ipv4_frags_init_net, + .pre_exit = ipv4_frags_pre_exit_net, + .exit = ipv4_frags_exit_net, }; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..ddaa01ec2bce 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -473,6 +473,7 @@ error: *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8c2ec35b6512..cc7ef0d05bbd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -315,14 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; - int ret; + bool do_cn = false; + int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_CN: + do_cn = true; + /* fall through */ + case NET_XMIT_SUCCESS: + break; + default: kfree_skb(skb); return ret; } @@ -338,7 +354,8 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk, skb_dst_set(skb, &new_rt->dst); } - return dev_loopback_xmit(net, sk, skb); + err = dev_loopback_xmit(net, sk, skb); + return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -537,9 +554,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_hash(to, from); - /* Copy the flags to each fragment. */ - IPCB(to)->flags = IPCB(from)->flags; - #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif @@ -573,6 +587,175 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return ip_do_fragment(net, sk, skb, output); } +void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, + unsigned int hlen, struct ip_fraglist_iter *iter) +{ + unsigned int first_len = skb_pagelen(skb); + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->iph = iph; + iter->hlen = hlen; + + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_init); + +static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, + struct ip_fraglist_iter *iter) +{ + struct sk_buff *to = iter->frag; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(skb)->flags; + + if (iter->offset == 0) + ip_options_fragment(to); +} + +void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) +{ + unsigned int hlen = iter->hlen; + struct iphdr *iph = iter->iph; + struct sk_buff *frag; + + frag = iter->frag; + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iter->iph = ip_hdr(frag); + iph = iter->iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + iter->offset += skb->len - hlen; + iph->frag_off = htons(iter->offset >> 3); + if (frag->next) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); +} +EXPORT_SYMBOL(ip_fraglist_prepare); + +void ip_frag_init(struct sk_buff *skb, unsigned int hlen, + unsigned int ll_rs, unsigned int mtu, + struct ip_frag_state *state) +{ + struct iphdr *iph = ip_hdr(skb); + + state->hlen = hlen; + state->ll_rs = ll_rs; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + state->not_last_frag = iph->frag_off & htons(IP_MF); +} +EXPORT_SYMBOL(ip_frag_init); + +static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, + bool first_frag, struct ip_frag_state *state) +{ + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + + if (IPCB(from)->flags & IPSKB_FRAG_PMTU) + state->iph->frag_off |= htons(IP_DF); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. + */ + if (first_frag) + ip_options_fragment(from); +} + +struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) +{ + unsigned int len = state->left; + struct sk_buff *skb2; + struct iphdr *iph; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) { + len &= ~7; + } + + /* Allocate buffer */ + skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); + if (!skb2) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, state->ll_rs); + skb_put(skb2, len + state->hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + state->hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) + BUG(); + state->left -= len; + + /* + * Fill in the new header fields. + */ + iph = ip_hdr(skb2); + iph->frag_off = htons((state->offset >> 3)); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (state->left > 0 || state->not_last_frag) + iph->frag_off |= htons(IP_MF); + state->ptr += len; + state->offset += len; + + iph->tot_len = htons(len + state->hlen); + + ip_send_check(iph); + + return skb2; +} +EXPORT_SYMBOL(ip_frag_next); + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -584,12 +767,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; - int ptr; struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs; - int offset; - __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); + unsigned int mtu, hlen, ll_rs; + struct ip_fraglist_iter iter; + struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ @@ -654,49 +836,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } /* Everything is OK. Generate! */ - - err = 0; - offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - iph->tot_len = htons(first_len); - iph->frag_off = htons(IP_MF); - ip_send_check(iph); + ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), iph, hlen); - iph = ip_hdr(frag); - iph->tot_len = htons(frag->len); - ip_copy_metadata(frag, skb); - if (offset == 0) - ip_options_fragment(frag); - offset += skb->len - hlen; - iph->frag_off = htons(offset>>3); - if (frag->next) - iph->frag_off |= htons(IP_MF); - /* Ready, complete checksum */ - ip_send_check(iph); + if (iter.frag) { + ip_fraglist_ipcb_prepare(skb, &iter); + ip_fraglist_prepare(skb, &iter); } err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip_fraglist_next(&iter); } if (err == 0) { @@ -704,7 +861,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; @@ -720,105 +877,29 @@ slow_path_clean: } slow_path: - iph = ip_hdr(skb); - - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; - not_last_frag = iph->frag_off & htons(IP_MF); + ip_frag_init(skb, hlen, ll_rs, mtu, &state); /* * Keep copying data until we run out. */ - while (left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } + while (state.left > 0) { + bool first_frag = (state.offset == 0); - /* Allocate buffer */ - skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); - if (!skb2) { - err = -ENOMEM; + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); goto fail; } - - /* - * Set up data on packet - */ - - ip_copy_metadata(skb2, skb); - skb_reserve(skb2, ll_rs); - skb_put(skb2, len + hlen); - skb_reset_network_header(skb2); - skb2->transport_header = skb2->network_header + hlen; - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - - skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); - - /* - * Copy a block of the IP datagram. - */ - if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) - BUG(); - left -= len; - - /* - * Fill in the new header fields. - */ - iph = ip_hdr(skb2); - iph->frag_off = htons((offset >> 3)); - - if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) - iph->frag_off |= htons(IP_DF); - - /* ANK: dirty, but effective trick. Upgrade options only if - * the segment to be fragmented was THE FIRST (otherwise, - * options are already fixed) and make it ONCE - * on the initial skb, so that all the following fragments - * will inherit fixed options. - */ - if (offset == 0) - ip_options_fragment(skb); - - /* - * Added AC : If we are fragmenting a fragment that's not the - * last fragment then keep MF on each bit - */ - if (left > 0 || not_last_frag) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; + ip_frag_ipcb(skb, skb2, first_frag, &state); /* * Put this fragment into the sending queue. */ - iph->tot_len = htons(len + hlen); - - ip_send_check(iph); - err = output(net, sk, skb2); if (err) goto fail; @@ -1568,7 +1649,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len) + unsigned int len, u64 transmit_time) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1584,6 +1665,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipcm_init(&ipc); ipc.addr = daddr; + ipc.sockc.transmit_time = transmit_time; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2f4cdcc13d53..59bfa3825810 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -186,8 +186,7 @@ static void __exit ipcomp4_fini(void) { if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp_type, AF_INET); } module_init(ipcomp4_init); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 87ca2c42359b..a4e07e5e9c11 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par) unsigned char *arpptr; int pln, hln; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 5f116c3749b4..5930d3b02555 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -29,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; iph = ip_hdr(skb); oldtos = iph->tos; @@ -58,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr)) return true; - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) return false; tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 64d9563c0218..8e7f84ec783d 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -3,258 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct iphdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, - __be32 daddr) -{ - struct iphdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - iph->version = 4; - iph->ihl = sizeof(*iph) / 4; - iph->tos = 0; - iph->id = 0; - iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; - iph->protocol = IPPROTO_TCP; - iph->check = 0; - iph->saddr = saddr; - iph->daddr = daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct iphdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - skb_dst_set_noref(nskb, skb_dst(skb)); - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) - goto free_nskb; - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) @@ -306,135 +59,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - unsigned int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb) || - ip_hdr(skb)->protocol != IPPROTO_TCP) - return NF_ACCEPT; - - thoff = ip_hdrlen(skb); - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv4_synproxy_ops[] = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -449,16 +73,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref4 == 0) { - err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv4_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref4++; return err; } @@ -466,10 +86,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref4--; - if (snet->hook_ref4 == 0) - nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); + nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6eefde5bc468..69697eb4bfc6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -2,7 +2,7 @@ /* * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index dfea10f13878..87b711fd5a44 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> */ #include <linux/module.h> @@ -58,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } - /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index 657d2dcec3cc..717b726504fe 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b6dd39636bea..b2bae0b0e42a 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { + const struct in_ifaddr *ifa; struct in_device *indev; __be32 laddr; @@ -57,10 +58,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { + + in_dev_for_each_ifa_rcu(ifa, indev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + laddr = ifa->ifa_local; break; - } endfor_ifa(indev); + } return laddr ? laddr : daddr; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c new file mode 100644 index 000000000000..5fe5a3981d43 --- /dev/null +++ b/net/ipv4/nexthop.c @@ -0,0 +1,1828 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Generic nexthop implementation + * + * Copyright (c) 2017-19 Cumulus Networks + * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> + */ + +#include <linux/nexthop.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <net/arp.h> +#include <net/ipv6_stubs.h> +#include <net/lwtunnel.h> +#include <net/ndisc.h> +#include <net/nexthop.h> +#include <net/route.h> +#include <net/sock.h> + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo); + +#define NH_DEV_HASHBITS 8 +#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) + +static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { + [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, + [NHA_ID] = { .type = NLA_U32 }, + [NHA_GROUP] = { .type = NLA_BINARY }, + [NHA_GROUP_TYPE] = { .type = NLA_U16 }, + [NHA_BLACKHOLE] = { .type = NLA_FLAG }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_GATEWAY] = { .type = NLA_BINARY }, + [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, + [NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_GROUPS] = { .type = NLA_FLAG }, + [NHA_MASTER] = { .type = NLA_U32 }, +}; + +static unsigned int nh_dev_hashfn(unsigned int val) +{ + unsigned int mask = NH_DEV_HASHSIZE - 1; + + return (val ^ + (val >> NH_DEV_HASHBITS) ^ + (val >> (NH_DEV_HASHBITS * 2))) & mask; +} + +static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) +{ + struct net_device *dev = nhi->fib_nhc.nhc_dev; + struct hlist_head *head; + unsigned int hash; + + WARN_ON(!dev); + + hash = nh_dev_hashfn(dev->ifindex); + head = &net->nexthop.devhash[hash]; + hlist_add_head(&nhi->dev_hash, head); +} + +static void nexthop_free_mpath(struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) + WARN_ON(nhg->nh_entries[i].nh); + + kfree(nhg); +} + +static void nexthop_free_single(struct nexthop *nh) +{ + struct nh_info *nhi; + + nhi = rcu_dereference_raw(nh->nh_info); + switch (nhi->family) { + case AF_INET: + fib_nh_release(nh->net, &nhi->fib_nh); + break; + case AF_INET6: + ipv6_stub->fib6_nh_release(&nhi->fib6_nh); + break; + } + kfree(nhi); +} + +void nexthop_free_rcu(struct rcu_head *head) +{ + struct nexthop *nh = container_of(head, struct nexthop, rcu); + + if (nh->is_group) + nexthop_free_mpath(nh); + else + nexthop_free_single(nh); + + kfree(nh); +} +EXPORT_SYMBOL_GPL(nexthop_free_rcu); + +static struct nexthop *nexthop_alloc(void) +{ + struct nexthop *nh; + + nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + if (nh) { + INIT_LIST_HEAD(&nh->fi_list); + INIT_LIST_HEAD(&nh->f6i_list); + INIT_LIST_HEAD(&nh->grp_list); + } + return nh; +} + +static struct nh_group *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nhg; + + nhg = kzalloc(sz, GFP_KERNEL); + if (nhg) + nhg->num_nh = num_nh; + + return nhg; +} + +static void nh_base_seq_inc(struct net *net) +{ + while (++net->nexthop.seq == 0) + ; +} + +/* no reference taken; rcu lock or rtnl must be held */ +struct nexthop *nexthop_find_by_id(struct net *net, u32 id) +{ + struct rb_node **pp, *parent = NULL, *next; + + pp = &net->nexthop.rb_root.rb_node; + while (1) { + struct nexthop *nh; + + next = rcu_dereference_raw(*pp); + if (!next) + break; + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (id < nh->id) + pp = &next->rb_left; + else if (id > nh->id) + pp = &next->rb_right; + else + return nh; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nexthop_find_by_id); + +/* used for auto id allocation; called with rtnl held */ +static u32 nh_find_unused_id(struct net *net) +{ + u32 id_start = net->nexthop.last_id_allocated; + + while (1) { + net->nexthop.last_id_allocated++; + if (net->nexthop.last_id_allocated == id_start) + break; + + if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) + return net->nexthop.last_id_allocated; + } + return 0; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nexthop_grp *p; + size_t len = nhg->num_nh * sizeof(*p); + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nhg->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nhg->num_nh; ++i) { + p->id = nhg->nh_entries[i].nh->id; + p->weight = nhg->nh_entries[i].weight - 1; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, + int event, u32 portid, u32 seq, unsigned int nlflags) +{ + struct fib6_nh *fib6_nh; + struct fib_nh *fib_nh; + struct nlmsghdr *nlh; + struct nh_info *nhi; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = nh->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nhg)) + goto nla_put_failure; + goto out; + } + + nhi = rtnl_dereference(nh->nh_info); + nhm->nh_family = nhi->family; + if (nhi->reject_nh) { + if (nla_put_flag(skb, NHA_BLACKHOLE)) + goto nla_put_failure; + goto out; + } else { + const struct net_device *dev; + + dev = nhi->fib_nhc.nhc_dev; + if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) + goto nla_put_failure; + } + + nhm->nh_scope = nhi->fib_nhc.nhc_scope; + switch (nhi->family) { + case AF_INET: + fib_nh = &nhi->fib_nh; + if (fib_nh->fib_nh_gw_family && + nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + goto nla_put_failure; + break; + + case AF_INET6: + fib6_nh = &nhi->fib6_nh; + if (fib6_nh->fib_nh_gw_family && + nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) + goto nla_put_failure; + break; + } + + if (nhi->fib_nhc.nhc_lwtstate && + lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, + NHA_ENCAP, NHA_ENCAP_TYPE) < 0) + goto nla_put_failure; + +out: + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t nh_nlmsg_size_grp(struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + + return nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ +} + +static size_t nh_nlmsg_size_single(struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + size_t sz; + + /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE + * are mutually exclusive + */ + sz = nla_total_size(4); /* NHA_OIF */ + + switch (nhi->family) { + case AF_INET: + if (nhi->fib_nh.fib_nh_gw_family) + sz += nla_total_size(4); /* NHA_GATEWAY */ + break; + + case AF_INET6: + /* NHA_GATEWAY */ + if (nhi->fib6_nh.fib_nh_gw_family) + sz += nla_total_size(sizeof(const struct in6_addr)); + break; + } + + if (nhi->fib_nhc.nhc_lwtstate) { + sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); + sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ + } + + return sz; +} + +static size_t nh_nlmsg_size(struct nexthop *nh) +{ + size_t sz = nla_total_size(4); /* NHA_ID */ + + if (nh->is_group) + sz += nh_nlmsg_size_grp(nh); + else + sz += nh_nlmsg_size_single(nh); + + return sz; +} + +static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) +{ + unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); + if (!skb) + goto errout; + + err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); + if (err < 0) { + /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); +} + +static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, + struct netlink_ext_ack *extack) +{ + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + /* nested multipath (group within a group) is not + * supported + */ + if (nhg->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh && npaths > 1) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group with more than 1 path"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + unsigned int i, j; + + if (len & (sizeof(struct nexthop_grp) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + if (nhg[i].resvd1 || nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + return -EINVAL; + } + if (nhg[i].weight > 254) { + NL_SET_ERR_MSG(extack, "Invalid value for weight"); + return -EINVAL; + } + for (j = i + 1; j < len; ++j) { + if (nhg[i].id == nhg[j].id) { + NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); + return -EINVAL; + } + } + } + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg[i].id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, len, extack)) + return -EINVAL; + } + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static bool ipv6_good_nh(const struct fib6_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +static bool ipv4_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nexthop *rc = NULL; + struct nh_group *nhg; + int i; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + nhi = rcu_dereference(nhge->nh->nh_info); + switch (nhi->family) { + case AF_INET: + if (ipv4_good_nh(&nhi->fib_nh)) + return nhge->nh; + break; + case AF_INET6: + if (ipv6_good_nh(&nhi->fib6_nh)) + return nhge->nh; + break; + } + + if (!rc) + rc = nhge->nh; + } + + return rc; +} +EXPORT_SYMBOL_GPL(nexthop_select_path); + +int nexthop_for_each_fib6_nh(struct nexthop *nh, + int (*cb)(struct fib6_nh *nh, void *arg), + void *arg) +{ + struct nh_info *nhi; + int err; + + if (nh->is_group) { + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_rtnl(nh->nh_grp); + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + nhi = rcu_dereference_rtnl(nhge->nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + } else { + nhi = rcu_dereference_rtnl(nh->nh_info); + err = cb(&nhi->fib6_nh, arg); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); + +static int check_src_addr(const struct in6_addr *saddr, + struct netlink_ext_ack *extack) +{ + if (!ipv6_addr_any(saddr)) { + NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); + return -EINVAL; + } + return 0; +} + +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + /* fib6_src is unique to a fib6_info and limits the ability to cache + * routes in fib6_nh within a nexthop that is potentially shared + * across multiple fib entries. If the config wants to use source + * routing it can not use nexthop objects. mlxsw also does not allow + * fib6_src on routes. + */ + if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) + return -EINVAL; + + if (nh->is_group) { + struct nh_group *nhg; + + nhg = rtnl_dereference(nh->nh_grp); + if (nhg->has_v4) + goto no_v4_nh; + } else { + nhi = rtnl_dereference(nh->nh_info); + if (nhi->family == AF_INET) + goto no_v4_nh; + } + + return 0; +no_v4_nh: + NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(fib6_check_nexthop); + +/* if existing nexthop has ipv6 routes linked to it, need + * to verify this new spec works with ipv6 + */ +static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib6_info *f6i; + + if (list_empty(&old->f6i_list)) + return 0; + + list_for_each_entry(f6i, &old->f6i_list, nh_list) { + if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) + return -EINVAL; + } + + return fib6_check_nexthop(new, NULL, extack); +} + +static int nexthop_check_scope(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); + return -EINVAL; + } + + if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); + return -EINVAL; + } + + return 0; +} + +/* Invoked by fib add code to verify nexthop by id is ok with + * config for prefix; parts of fib_check_nh not done when nexthop + * object is used. + */ +int fib_check_nexthop(struct nexthop *nh, u8 scope, + struct netlink_ext_ack *extack) +{ + int err = 0; + + if (nh->is_group) { + struct nh_group *nhg; + + if (scope == RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); + err = -EINVAL; + goto out; + } + + nhg = rtnl_dereference(nh->nh_grp); + /* all nexthops in a group have the same scope */ + err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + } else { + err = nexthop_check_scope(nh, scope, extack); + } +out: + return err; +} + +static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct fib_info *fi; + + list_for_each_entry(fi, &old->fi_list, nh_list) { + int err; + + err = fib_check_nexthop(new, fi->fib_scope, extack); + if (err) + return err; + } + return 0; +} + +static void nh_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, + struct nh_group *nhg, + struct nl_info *nlinfo) +{ + struct nexthop *nh = nhge->nh; + struct nh_grp_entry *nhges; + bool found = false; + int i; + + WARN_ON(!nh); + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; ++i) { + if (found) { + nhges[i-1].nh = nhges[i].nh; + nhges[i-1].weight = nhges[i].weight; + list_del(&nhges[i].nh_list); + list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); + } else if (nhg->nh_entries[i].nh == nh) { + found = true; + } + } + + if (WARN_ON(!found)) + return; + + nhg->num_nh--; + nhg->nh_entries[nhg->num_nh].nh = NULL; + + nh_group_rebalance(nhg); + + nexthop_put(nh); + + if (nlinfo) + nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nhg; + + list_del(&nhge->nh_list); + nhg = rtnl_dereference(nhge->nh_parent->nh_grp); + remove_nh_grp_entry(nhge, nhg, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhge->nh_parent, nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) +{ + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + int i, num_nh = nhg->num_nh; + + for (i = 0; i < num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (WARN_ON(!nhge->nh)) + continue; + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + nhg->num_nh--; + } +} + +/* not called for nexthop replace */ +static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i, *tmp; + bool do_flush = false; + struct fib_info *fi; + + list_for_each_entry(fi, &nh->fi_list, nh_list) { + fi->fib_flags |= RTNH_F_DEAD; + do_flush = true; + } + if (do_flush) + fib_flush(net); + + /* ip6_del_rt removes the entry from this list hence the _safe */ + list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + /* __ip6_del_rt does a release, so do a hold here */ + fib6_info_hold(f6i); + ipv6_stub->ip6_del_rt(net, f6i); + } +} + +static void __remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + __remove_nexthop_fib(net, nh); + + if (nh->is_group) { + remove_nexthop_group(nh, nlinfo); + } else { + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fib_nhc.nhc_dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, nlinfo); + } +} + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + /* remove from the tree */ + rb_erase(&nh->rb_node, &net->nexthop.rb_root); + + if (nlinfo) + nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); + + __remove_nexthop(net, nh, nlinfo); + nh_base_seq_inc(net); + + nexthop_put(nh); +} + +/* if any FIB entries reference this nexthop, any dst entries + * need to be regenerated + */ +static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) + rt_cache_flush(net); + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_update_sernum(net, f6i); +} + +static int replace_nexthop_grp(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_group *oldg, *newg; + int i; + + if (!new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); + return -EINVAL; + } + + oldg = rtnl_dereference(old->nh_grp); + newg = rtnl_dereference(new->nh_grp); + + /* update parents - used by nexthop code for cleanup */ + for (i = 0; i < newg->num_nh; i++) + newg->nh_entries[i].nh_parent = old; + + rcu_assign_pointer(old->nh_grp, newg); + + for (i = 0; i < oldg->num_nh; i++) + oldg->nh_entries[i].nh_parent = new; + + rcu_assign_pointer(new->nh_grp, oldg); + + return 0; +} + +static int replace_nexthop_single(struct net *net, struct nexthop *old, + struct nexthop *new, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi, *newi; + + if (new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); + return -EINVAL; + } + + oldi = rtnl_dereference(old->nh_info); + newi = rtnl_dereference(new->nh_info); + + newi->nh_parent = old; + oldi->nh_parent = new; + + old->protocol = new->protocol; + old->nh_flags = new->nh_flags; + + rcu_assign_pointer(old->nh_info, newi); + rcu_assign_pointer(new->nh_info, oldi); + + return 0; +} + +static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct fib6_info *f6i; + + if (!list_empty(&nh->fi_list)) { + struct fib_info *fi; + + /* expectation is a few fib_info per nexthop and then + * a lot of routes per fib_info. So mark the fib_info + * and then walk the fib tables once + */ + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = true; + + fib_info_notify_update(net, info); + + list_for_each_entry(fi, &nh->fi_list, nh_list) + fi->nh_updated = false; + } + + list_for_each_entry(f6i, &nh->f6i_list, nh_list) + ipv6_stub->fib6_rt_update(net, f6i, info); +} + +/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries + * linked to this nexthop and for all groups that the nexthop + * is a member of + */ +static void nexthop_replace_notify(struct net *net, struct nexthop *nh, + struct nl_info *info) +{ + struct nh_grp_entry *nhge; + + __nexthop_replace_notify(net, nh, info); + + list_for_each_entry(nhge, &nh->grp_list, nh_list) + __nexthop_replace_notify(net, nhge->nh_parent, info); +} + +static int replace_nexthop(struct net *net, struct nexthop *old, + struct nexthop *new, struct netlink_ext_ack *extack) +{ + bool new_is_reject = false; + struct nh_grp_entry *nhge; + int err; + + /* check that existing FIB entries are ok with the + * new nexthop definition + */ + err = fib_check_nh_list(old, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(old, new, extack); + if (err) + return err; + + if (!new->is_group) { + struct nh_info *nhi = rtnl_dereference(new->nh_info); + + new_is_reject = nhi->reject_nh; + } + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + /* if new nexthop is a blackhole, any groups using this + * nexthop cannot have more than 1 path + */ + if (new_is_reject && + nexthop_num_path(nhge->nh_parent) > 1) { + NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); + return -EINVAL; + } + + err = fib_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + + err = fib6_check_nh_list(nhge->nh_parent, new, extack); + if (err) + return err; + } + + if (old->is_group) + err = replace_nexthop_grp(net, old, new, extack); + else + err = replace_nexthop_single(net, old, new, extack); + + if (!err) { + nh_rt_cache_flush(net, old); + + __remove_nexthop(net, new, NULL); + nexthop_put(new); + } + + return err; +} + +/* called with rtnl_lock held */ +static int insert_nexthop(struct net *net, struct nexthop *new_nh, + struct nh_config *cfg, struct netlink_ext_ack *extack) +{ + struct rb_node **pp, *parent = NULL, *next; + struct rb_root *root = &net->nexthop.rb_root; + bool replace = !!(cfg->nlflags & NLM_F_REPLACE); + bool create = !!(cfg->nlflags & NLM_F_CREATE); + u32 new_id = new_nh->id; + int replace_notify = 0; + int rc = -EEXIST; + + pp = &root->rb_node; + while (1) { + struct nexthop *nh; + + next = rtnl_dereference(*pp); + if (!next) + break; + + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (new_id < nh->id) { + pp = &next->rb_left; + } else if (new_id > nh->id) { + pp = &next->rb_right; + } else if (replace) { + rc = replace_nexthop(net, nh, new_nh, extack); + if (!rc) { + new_nh = nh; /* send notification with old nh */ + replace_notify = 1; + } + goto out; + } else { + /* id already exists and not a replace */ + goto out; + } + } + + if (replace && !create) { + NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); + rc = -ENOENT; + goto out; + } + + rb_link_node_rcu(&new_nh->rb_node, parent, pp); + rb_insert_color(&new_nh->rb_node, root); + rc = 0; +out: + if (!rc) { + nh_base_seq_inc(net); + nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); + if (replace_notify) + nexthop_replace_notify(net, new_nh, &cfg->nlinfo); + } + + return rc; +} + +/* rtnl */ +/* remove all nexthops tied to a device being deleted */ +static void nexthop_flush_dev(struct net_device *dev) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev != dev) + continue; + + remove_nexthop(net, nhi->nh_parent, NULL); + } +} + +/* rtnl; called when net namespace is deleted */ +static void flush_all_nexthops(struct net *net) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + struct nexthop *nh; + + while ((node = rb_first(root))) { + nh = rb_entry(node, struct nexthop, rb_node); + remove_nexthop(net, nh, NULL); + cond_resched(); + } +} + +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nhg; + struct nexthop *nh; + int i; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh->is_group = 1; + + nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nhg) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nhg->num_nh; ++i) { + struct nexthop *nhe; + struct nh_info *nhi; + + nhe = nexthop_find_by_id(net, entry[i].id); + if (!nexthop_get(nhe)) + goto out_no_nh; + + nhi = rtnl_dereference(nhe->nh_info); + if (nhi->family == AF_INET) + nhg->has_v4 = true; + + nhg->nh_entries[i].nh = nhe; + nhg->nh_entries[i].weight = entry[i].weight + 1; + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); + nhg->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->mpath = 1; + nh_group_rebalance(nhg); + } + + rcu_assign_pointer(nh->nh_grp, nhg); + + return nh; + +out_no_nh: + for (; i >= 0; --i) + nexthop_put(nhg->nh_entries[i].nh); + + kfree(nhg); + kfree(nh); + + return ERR_PTR(-ENOENT); +} + +static int nh_create_ipv4(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib_nh *fib_nh = &nhi->fib_nh; + struct fib_config fib_cfg = { + .fc_oif = cfg->nh_ifindex, + .fc_gw4 = cfg->gw.ipv4, + .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + u32 tb_id = l3mdev_fib_table(cfg->dev); + int err = -EINVAL; + + err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); + if (err) { + fib_nh_release(net, fib_nh); + goto out; + } + + /* sets nh_dev if successful */ + err = fib_check_nh(net, fib_nh, tb_id, 0, extack); + if (!err) { + nh->nh_flags = fib_nh->fib_nh_flags; + fib_info_update_nhc_saddr(net, &fib_nh->nh_common, + fib_nh->fib_nh_scope); + } else { + fib_nh_release(net, fib_nh); + } +out: + return err; +} + +static int nh_create_ipv6(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib6_nh *fib6_nh = &nhi->fib6_nh; + struct fib6_config fib6_cfg = { + .fc_table = l3mdev_fib_table(cfg->dev), + .fc_ifindex = cfg->nh_ifindex, + .fc_gateway = cfg->gw.ipv6, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + int err; + + if (!ipv6_addr_any(&cfg->gw.ipv6)) + fib6_cfg.fc_flags |= RTF_GATEWAY; + + /* sets nh_dev if successful */ + err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, + extack); + if (err) + ipv6_stub->fib6_nh_release(fib6_nh); + else + nh->nh_flags = fib6_nh->fib_nh_flags; + + return err; +} + +static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + struct nexthop *nh; + int err = 0; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); + if (!nhi) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + nh->nh_flags = cfg->nh_flags; + nh->net = net; + + nhi->nh_parent = nh; + nhi->family = cfg->nh_family; + nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + + if (cfg->nh_blackhole) { + nhi->reject_nh = 1; + cfg->nh_ifindex = net->loopback_dev->ifindex; + } + + switch (cfg->nh_family) { + case AF_INET: + err = nh_create_ipv4(net, nh, nhi, cfg, extack); + break; + case AF_INET6: + err = nh_create_ipv6(net, nh, nhi, cfg, extack); + break; + } + + if (err) { + kfree(nhi); + kfree(nh); + return ERR_PTR(err); + } + + /* add the entry to the device based hash */ + nexthop_devhash_add(net, nhi); + + rcu_assign_pointer(nh->nh_info, nhi); + + return nh; +} + +/* called with rtnl lock held */ +static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nexthop *nh; + int err; + + if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + return ERR_PTR(-EINVAL); + } + + if (!cfg->nh_id) { + cfg->nh_id = nh_find_unused_id(net); + if (!cfg->nh_id) { + NL_SET_ERR_MSG(extack, "No unused id"); + return ERR_PTR(-EINVAL); + } + } + + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + + if (IS_ERR(nh)) + return nh; + + refcount_set(&nh->refcnt, 1); + nh->id = cfg->nh_id; + nh->protocol = cfg->nh_protocol; + nh->net = net; + + err = insert_nexthop(net, nh, cfg, extack); + if (err) { + __remove_nexthop(net, nh, NULL); + nexthop_put(nh); + nh = ERR_PTR(err); + } + + return nh; +} + +static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + if (nhm->resvd || nhm->nh_scope) { + NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); + goto out; + } + if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); + goto out; + } + + switch (nhm->nh_family) { + case AF_INET: + case AF_INET6: + break; + case AF_UNSPEC: + if (tb[NHA_GROUP]) + break; + /* fallthrough */ + default: + NL_SET_ERR_MSG(extack, "Invalid address family"); + goto out; + } + + if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { + NL_SET_ERR_MSG(extack, "Invalid attributes in request"); + goto out; + } + + memset(cfg, 0, sizeof(*cfg)); + cfg->nlflags = nlh->nlmsg_flags; + cfg->nlinfo.portid = NETLINK_CB(skb).portid; + cfg->nlinfo.nlh = nlh; + cfg->nlinfo.nl_net = net; + + cfg->nh_family = nhm->nh_family; + cfg->nh_protocol = nhm->nh_protocol; + cfg->nh_flags = nhm->nh_flags; + + if (tb[NHA_ID]) + cfg->nh_id = nla_get_u32(tb[NHA_ID]); + + if (tb[NHA_GROUP]) { + if (nhm->nh_family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid family for group"); + goto out; + } + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + err = nh_check_attr_group(net, tb, extack); + + /* no other attributes should be set */ + goto out; + } + + if (tb[NHA_BLACKHOLE]) { + if (tb[NHA_GATEWAY] || tb[NHA_OIF] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + goto out; + } + + cfg->nh_blackhole = 1; + err = 0; + goto out; + } + + if (!tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + goto out; + } + + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } + + err = -EINVAL; + if (tb[NHA_GATEWAY]) { + struct nlattr *gwa = tb[NHA_GATEWAY]; + + switch (cfg->nh_family) { + case AF_INET: + if (nla_len(gwa) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv4 = nla_get_be32(gwa); + break; + case AF_INET6: + if (nla_len(gwa) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv6 = nla_get_in6_addr(gwa); + break; + default: + NL_SET_ERR_MSG(extack, + "Unknown address family for gateway"); + goto out; + } + } else { + /* device only nexthop (no gateway) */ + if (cfg->nh_flags & RTNH_F_ONLINK) { + NL_SET_ERR_MSG(extack, + "ONLINK flag can not be set for nexthop without a gateway"); + goto out; + } + } + + if (tb[NHA_ENCAP]) { + cfg->nh_encap = tb[NHA_ENCAP]; + + if (!tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); + goto out; + } + + cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + if (err < 0) + goto out; + + } else if (tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); + goto out; + } + + + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nh_config cfg; + struct nexthop *nh; + int err; + + err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); + if (!err) { + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + } + + return err; +} + +static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err, i; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + for (i = 0; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_ID: + break; + default: + NL_SET_ERR_MSG_ATTR(extack, tb[i], + "Unexpected attribute in request"); + goto out; + } + } + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header"); + goto out; + } + + if (!tb[NHA_ID]) { + NL_SET_ERR_MSG(extack, "Nexthop id is missing"); + goto out; + } + + *id = nla_get_u32(tb[NHA_ID]); + if (!(*id)) + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + else + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nl_info nlinfo = { + .nlh = nlh, + .nl_net = net, + .portid = NETLINK_CB(skb).portid, + }; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return -ENOENT; + + remove_nexthop(net, nh, &nlinfo); + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *skb = NULL; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + err = -ENOBUFS; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + err = -ENOENT; + nh = nexthop_find_by_id(net, id); + if (!nh) + goto errout_free; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + return err; +errout_free: + kfree_skb(skb); + goto out; +} + +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, + bool group_filter, u8 family) +{ + const struct net_device *dev; + const struct nh_info *nhi; + + if (group_filter && !nh->is_group) + return true; + + if (!dev_idx && !master_idx && !family) + return false; + + if (nh->is_group) + return true; + + nhi = rtnl_dereference(nh->nh_info); + if (family && nhi->family != family) + return true; + + dev = nhi->fib_nhc.nhc_dev; + if (dev_idx && (!dev || dev->ifindex != dev_idx)) + return true; + + if (master_idx) { + struct net_device *master; + + if (!dev) + return true; + + master = netdev_master_upper_dev_get((struct net_device *)dev); + if (!master || master->ifindex != master_idx) + return true; + } + + return false; +} + +static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, + int *master_idx, bool *group_filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NHA_MAX + 1]; + struct nhmsg *nhm; + int err, i; + u32 idx; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + NULL); + if (err < 0) + return err; + + for (i = 0; i <= NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_OIF: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + *dev_idx = idx; + break; + case NHA_MASTER: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); + return -EINVAL; + } + *master_idx = idx; + break; + case NHA_GROUPS: + *group_filter = true; + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + nhm = nlmsg_data(nlh); + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); + return -EINVAL; + } + + return 0; +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + int dev_filter_idx = 0, master_idx = 0; + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + bool group_filter = false; + struct rb_node *node; + int idx = 0, s_idx; + int err; + + err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, + &group_filter, cb); + if (err < 0) + return err; + + s_idx = cb->args[0]; + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + if (idx < s_idx) + goto cont; + + nh = rb_entry(node, struct nexthop, rb_node); + if (nh_dump_filtered(nh, dev_filter_idx, master_idx, + group_filter, nhm->nh_family)) + goto cont; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } +cont: + idx++; + } + +out: + err = skb->len; +out_err: + cb->args[0] = idx; + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return err; +} + +static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev == dev) { + if (nhi->family == AF_INET) + fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, + orig_mtu); + } + } +} + +/* rtnl */ +static int nh_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_info_ext *info_ext; + + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGE: + if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGEMTU: + info_ext = ptr; + nexthop_sync_mtu(dev, info_ext->ext.mtu); + rt_cache_flush(dev_net(dev)); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nh_netdev_notifier = { + .notifier_call = nh_netdev_event, +}; + +static void __net_exit nexthop_net_exit(struct net *net) +{ + rtnl_lock(); + flush_all_nexthops(net); + rtnl_unlock(); + kfree(net->nexthop.devhash); +} + +static int __net_init nexthop_net_init(struct net *net) +{ + size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; + + net->nexthop.rb_root = RB_ROOT; + net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); + if (!net->nexthop.devhash) + return -ENOMEM; + + return 0; +} + +static struct pernet_operations nexthop_net_ops = { + .init = nexthop_net_init, + .exit = nexthop_net_exit, +}; + +static int __init nexthop_init(void) +{ + register_pernet_subsys(&nexthop_net_ops); + + register_netdevice_notifier(&nh_netdev_notifier); + + rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, + rtm_dump_nexthop, 0); + + rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + return 0; +} +subsys_initcall(nexthop_init); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 073273b751f8..cc90243ccf76 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -68,8 +68,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", - atomic_read(&net->ipv4.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv4.frags)); + atomic_read(&net->ipv4.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv4.fqdir)); return 0; } @@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), + SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b2b35b38724d..517300d587a7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -95,6 +95,7 @@ #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> @@ -1531,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1542,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev) list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } @@ -1580,7 +1580,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID - { + if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); @@ -1962,6 +1962,36 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.basic.ip_proto = fl4->flowi4_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + /* skb is currently provided only when forwarding */ + if (skb) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, 0); + /* Inner can be v4 or v6 */ + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + hash_keys.tags.flow_label = keys.tags.flow_label; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + ip_multipath_l3_keys(skb, &hash_keys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1979,7 +2009,7 @@ static int ip_mkroute_input(struct sk_buff *skb, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); @@ -2714,7 +2744,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = fl4->flowi4_tos; + r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -2742,7 +2772,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (!rt_is_input_route(rt) && + if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; @@ -2782,36 +2812,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (fl4->flowi4_mark && - nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) - goto nla_put_failure; - - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; + if (fl4) { + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - error = rt->dst.error; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), + fl4->flowi4_uid))) + goto nla_put_failure; - if (rt_is_input_route(rt)) { + if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, - fl4->saddr, fl4->daddr, - r, portid); - - if (err <= 0) { - if (err == 0) - return 0; - goto nla_put_failure; - } - } else + if (ipv4_is_multicast(dst) && + !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, portid); + + if (err <= 0) { + if (err == 0) + return 0; + goto nla_put_failure; + } + } else #endif - if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) - goto nla_put_failure; + if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) + goto nla_put_failure; + } } + error = rt->dst.error; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; @@ -2823,6 +2857,80 @@ nla_put_failure: return -EMSGSIZE; } +static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, u32 table_id, + struct fnhe_hash_bucket *bucket, int genid, + int *fa_index, int fa_start) +{ + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference(bucket[i].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + struct rtable *rt; + int err; + + if (*fa_index < fa_start) + goto next; + + if (fnhe->fnhe_genid != genid) + goto next; + + if (fnhe->fnhe_expires && + time_after(jiffies, fnhe->fnhe_expires)) + goto next; + + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (!rt) + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (!rt) + goto next; + + err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, + table_id, NULL, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err) + return err; +next: + (*fa_index)++; + } + } + + return 0; +} + +int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, + u32 table_id, struct fib_info *fi, + int *fa_index, int fa_start) +{ + struct net *net = sock_net(cb->skb->sk); + int nhsel, genid = fnhe_genid(net); + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); + struct fnhe_hash_bucket *bucket; + int err; + + if (nhc->nhc_flags & RTNH_F_DEAD) + continue; + + rcu_read_lock(); + bucket = rcu_dereference(nhc->nhc_exceptions); + err = 0; + if (bucket) + err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, + genid, fa_index, fa_start); + rcu_read_unlock(); + if (err) + return err; + } + + return 0; +} + static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) @@ -3230,9 +3338,11 @@ static struct ctl_table ipv4_route_table[] = { { } }; +static const char ipv4_route_flush_procname[] = "flush"; + static struct ctl_table ipv4_route_flush_table[] = { { - .procname = "flush", + .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, @@ -3250,9 +3360,11 @@ static __net_init int sysctl_route_net_init(struct net *net) if (!tbl) goto err_dup; - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - tbl[0].procname = NULL; + /* Don't export non-whitelisted sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) { + if (tbl[0].procname != ipv4_route_flush_procname) + tbl[0].procname = NULL; + } } tbl[0].extra1 = net; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b6f14af926fa..7d66306b5f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -279,55 +279,96 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } +static int sscanf_key(char *buf, __le32 *key) +{ + u32 user_key[4]; + int i, ret = 0; + + if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, + user_key + 2, user_key + 3) != 4) { + ret = -EINVAL; + } else { + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + } + pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", + user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); + + return ret; +} + static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); - struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; - struct tcp_fastopen_context *ctxt; - u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ - __le32 key[4]; - int ret, i; + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. + */ + struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)) }; + struct tcp_fastopen_context *ctx; + u32 user_key[TCP_FASTOPEN_KEY_MAX * 4]; + __le32 key[TCP_FASTOPEN_KEY_MAX * 4]; + char *backup_data; + int ret, i = 0, off = 0, n_keys = 0; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); - if (ctxt) - memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); - else - memset(key, 0, sizeof(key)); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctx) { + n_keys = tcp_fastopen_context_len(ctx); + memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys); + } rcu_read_unlock(); - for (i = 0; i < ARRAY_SIZE(key); i++) + if (!n_keys) { + memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); + n_keys = 1; + } + + for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); - snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", - user_key[0], user_key[1], user_key[2], user_key[3]); + for (i = 0; i < n_keys; i++) { + off += snprintf(tbl.data + off, tbl.maxlen - off, + "%08x-%08x-%08x-%08x", + user_key[i * 4], + user_key[i * 4 + 1], + user_key[i * 4 + 2], + user_key[i * 4 + 3]); + if (i + 1 < n_keys) + off += snprintf(tbl.data + off, tbl.maxlen - off, ","); + } + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { - if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, - user_key + 2, user_key + 3) != 4) { + backup_data = strchr(tbl.data, ','); + if (backup_data) { + *backup_data = '\0'; + backup_data++; + } + if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } - - for (i = 0; i < ARRAY_SIZE(user_key); i++) - key[i] = cpu_to_le32(user_key[i]); - + if (backup_data) { + if (sscanf_key(backup_data, key + 4)) { + ret = -EINVAL; + goto bad_key; + } + } tcp_fastopen_reset_cipher(net, NULL, key, - TCP_FASTOPEN_KEY_LENGTH); + backup_data ? key + 4 : NULL); } bad_key: - pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], - (char *)tbl.data, ret); kfree(tbl.data); return ret; } @@ -956,7 +997,12 @@ static struct ctl_table ipv4_net_table[] = { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + /* maxlen to print the list of keys in hex (*2), with dashes + * separating doublewords and a comma in between keys. + */ + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * + 2 * TCP_FASTOPEN_KEY_MAX) + + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { @@ -984,7 +1030,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, - .extra2 = &one, + .extra2 = &two, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2eebd092c3c1..7846afacdf0b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2743,6 +2743,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -2793,15 +2808,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; + __u8 *backup_key = NULL; - if (optlen != sizeof(key)) + /* Allow a backup key as well to facilitate key rotation + * First key is the active one. + */ + if (optlen != TCP_FASTOPEN_KEY_LENGTH && + optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_user(key, optval, optlen)) return -EFAULT; - return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) + backup_key = key + TCP_FASTOPEN_KEY_LENGTH; + + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ @@ -3085,6 +3108,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3455,21 +3483,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; struct tcp_fastopen_context *ctx; + unsigned int key_len = 0; if (get_user(len, optlen)) return -EFAULT; rcu_read_lock(); ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); - if (ctx) - memcpy(key, ctx->key, sizeof(key)); - else - len = 0; + if (ctx) { + key_len = tcp_fastopen_context_len(ctx) * + TCP_FASTOPEN_KEY_LENGTH; + memcpy(&key[0], &ctx->key[0], key_len); + } rcu_read_unlock(); - len = min_t(unsigned int, len, sizeof(key)); + len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, key, len)) @@ -3542,6 +3572,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f5a45e1e1182..3fd451271a70 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -30,15 +30,15 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); - crypto_free_cipher(ctx->tfm); - kfree(ctx); + + kzfree(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) @@ -67,31 +67,27 @@ void tcp_fastopen_ctx_destroy(struct net *net) } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, - void *key, unsigned int len) + void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; - int err; + int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->tfm = crypto_alloc_cipher("aes", 0, 0); - - if (IS_ERR(ctx->tfm)) { - err = PTR_ERR(ctx->tfm); -error: kfree(ctx); - pr_err("TCP: TFO aes cipher alloc error: %d\n", err); - return err; + if (!ctx) { + err = -ENOMEM; + goto out; } - err = crypto_cipher_setkey(ctx->tfm, key, len); - if (err) { - pr_err("TCP: TFO cipher key error: %d\n", err); - crypto_free_cipher(ctx->tfm); - goto error; - } - memcpy(ctx->key, key, len); + ctx->key[0].key[0] = get_unaligned_le64(primary_key); + ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); + if (backup_key) { + ctx->key[1].key[0] = get_unaligned_le64(backup_key); + ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); + ctx->num = 2; + } else { + ctx->num = 1; + } spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { @@ -108,66 +104,58 @@ error: kfree(ctx); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); +out: return err; } -static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, + struct sk_buff *syn, + const siphash_key_t *key, + struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_context *ctx; - bool ok = false; - - rcu_read_lock(); - - ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); - if (!ctx) - ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); - if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); - foc->len = TCP_FASTOPEN_COOKIE_SIZE; - ok = true; - } - rcu_read_unlock(); - return ok; -} - -/* Generate the fastopen cookie by doing aes128 encryption on both - * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 - * addresses. For the longer IPv6 addresses use CBC-MAC. - * - * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. - */ -static bool tcp_fastopen_cookie_gen(struct sock *sk, - struct request_sock *req, - struct sk_buff *syn, - struct tcp_fastopen_cookie *foc) -{ if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); - __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(sk, path, foc); + foc->val[0] = cpu_to_le64(siphash(&iph->saddr, + sizeof(iph->saddr) + + sizeof(iph->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } - #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); - struct tcp_fastopen_cookie tmp; - - if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { - struct in6_addr *buf = &tmp.addr; - int i; - for (i = 0; i < 4; i++) - buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(sk, buf, foc); - } + foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, + sizeof(ip6h->saddr) + + sizeof(ip6h->daddr), + key)); + foc->len = TCP_FASTOPEN_COOKIE_SIZE; + return true; } #endif return false; } +/* Generate the fastopen cookie by applying SipHash to both the source and + * destination addresses. + */ +static void tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + struct tcp_fastopen_context *ctx; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (ctx) + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); + rcu_read_unlock(); +} /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. @@ -212,6 +200,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_fin(sk); } +/* returns 0 - no key match, 1 for primary, 2 for backup */ +static int tcp_fastopen_cookie_gen_check(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *orig, + struct tcp_fastopen_cookie *valid_foc) +{ + struct tcp_fastopen_cookie search_foc = { .len = -1 }; + struct tcp_fastopen_cookie *foc = valid_foc; + struct tcp_fastopen_context *ctx; + int i, ret = 0; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (!ctx) + goto out; + for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { + __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); + if (tcp_fastopen_cookie_match(foc, orig)) { + ret = i + 1; + goto out; + } + foc = &search_foc; + } +out: + rcu_read_unlock(); + return ret; +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -327,6 +344,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; + int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -342,31 +360,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; - if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && - foc->len == TCP_FASTOPEN_COOKIE_SIZE && - foc->len == valid_foc.len && - !memcmp(foc->val, valid_foc.val, foc->len)) { - /* Cookie is valid. Create a (full) child socket to accept - * the data in SYN before returning a SYN-ACK to ack the - * data. If we fail to create the socket, fall back and - * ack the ISN only but includes the same cookie. - * - * Note: Data-less SYN with valid cookie is allowed to send - * data in SYN_RECV state. - */ + if (foc->len == 0) { + /* Client requests a cookie. */ + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); + } else if (foc->len > 0) { + ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, + &valid_foc); + if (!ret) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + } else { + /* Cookie is valid. Create a (full) child socket to + * accept the data in SYN before returning a SYN-ACK to + * ack the data. If we fail to create the socket, fall + * back and ack the ISN only but includes the same + * cookie. + * + * Note: Data-less SYN with valid cookie is allowed to + * send data in SYN_RECV state. + */ fastopen: - child = tcp_fastopen_create_child(sk, skb, req); - if (child) { - foc->len = -1; + child = tcp_fastopen_create_child(sk, skb, req); + if (child) { + if (ret == 2) { + valid_foc.exp = foc->exp; + *foc = valid_foc; + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); + } else { + foc->len = -1; + } + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return child; + } NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVE); - return child; + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - + } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d95ee40df6c2..c21e8a22fb3b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -119,7 +119,7 @@ void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { icsk->icsk_clean_acked = cad; - static_branch_inc(&clean_acked_data_enabled.key); + static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); @@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); + + tcp_bpf_rtt(sk); } } else { /* no previous measure. */ @@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; + + tcp_bpf_rtt(sk); } tp->srtt_us = max(1U, srtt); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cfa81190a1b1..d57641cb3477 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - struct net *net; + u64 transmit_time = 0; struct sock *ctl_sk; + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -766,14 +767,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -808,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; + u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -858,14 +863,15 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - &arg, arg.iov[0].iov_len); + &arg, arg.iov[0].iov_len, + transmit_time); ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c35731816e2..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); + tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0ebc33d1c9e5..4af1f5dae9d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -2239,6 +2241,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if (static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, @@ -3217,6 +3231,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3248,13 +3263,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3297,8 +3313,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eed59c847722..c21862ba9c02 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,17 +125,6 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) -/* IPCB reference means this can not be used from early demux */ -static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -364,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -420,7 +409,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, + int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -432,7 +421,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, @@ -460,7 +449,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - bool exact_dif = udp_lib_exact_dif_match(net, skb); hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -468,7 +456,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); if (!result) { hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -476,9 +464,9 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, - exact_dif, hslot2, skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -2236,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - inet_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 9763464a75d7..a3908e55ed89 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -208,7 +208,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, gso_skb->destructor = NULL; segs = skb_segment(gso_skb, features); - if (unlikely(IS_ERR_OR_NULL(segs))) { + if (IS_ERR_OR_NULL(segs)) { if (copy_dtor) gso_skb->destructor = sock_wfree; return segs; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 80c40b4981bb..f8ed3c3bb928 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -15,46 +15,6 @@ #include <linux/netfilter_ipv4.h> #include <linux/export.h> -static int xfrm4_init_flags(struct xfrm_state *x) -{ - if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) - x->props.flags |= XFRM_STATE_NOPMTUDISC; - return 0; -} - -static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi4 *fl4 = &fl->u.ip4; - - sel->daddr.a4 = fl4->daddr; - sel->saddr.a4 = fl4->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl4->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl4->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET; - sel->prefixlen_d = 32; - sel->prefixlen_s = 32; - sel->proto = fl4->flowi4_proto; - sel->ifindex = fl4->flowi4_oif; -} - -static void -xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (x->id.daddr.a4 == 0) - x->id.daddr.a4 = daddr->a4; - x->props.saddr = tmpl->saddr; - if (x->props.saddr.a4 == 0) - x->props.saddr.a4 = saddr->a4; - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET; -} - int xfrm4_extract_header(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); @@ -74,11 +34,6 @@ int xfrm4_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, .proto = IPPROTO_IPIP, - .eth_proto = htons(ETH_P_IP), - .owner = THIS_MODULE, - .init_flags = xfrm4_init_flags, - .init_tempsel = __xfrm4_init_tempsel, - .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 5d00e54cd319..dc19aff7c2e0 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -108,8 +108,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) pr_info("%s: can't remove xfrm handler for AF_INET\n", __func__); - if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipip_type, AF_INET); } module_init(ipip_init); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 081bb517e40d..521e3203e83a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + /* prefix routes only use builtin fib6_nh */ + if (rt->nh) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr *ifa; - int flag = scope; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - + in_dev_for_each_ifa_rtnl(ifa, in_dev) { addr.s6_addr32[3] = ifa->ifa_local; if (ifa->ifa_scope == RT_SCOPE_LINK) @@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + /* host routes only use builtin fib6_nh */ + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5352708b7b2d..ef37e0574f54 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -208,7 +208,7 @@ lookup_protocol: np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect; + np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, + size_t)); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + sk, msg, size); +} + +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); + + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif @@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, @@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 68b9e92e469e..25e1172fd1c3 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -793,9 +793,7 @@ static void __exit ah6_fini(void) if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); - + xfrm_unregister_type(&ah6_type, AF_INET6); } module_init(ah6_init); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ae6a739c5f52..a3b403ba8f8f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -41,8 +41,6 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. * @@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -687,21 +685,6 @@ out: return ret; } -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - if (x->props.mode != XFRM_MODE_TUNNEL) - net_adj = sizeof(struct ipv6hdr); - else - net_adj = 0; - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = { .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, - .get_mtu = esp6_get_mtu, .input = esp6_input, .output = esp6_output, .hdr_offset = xfrm6_find_1stfragopt, @@ -951,8 +933,7 @@ static void __exit esp6_fini(void) { if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp6_type, AF_INET6); } module_init(esp6_init); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d0d8528b294a..e31626ffccd1 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -336,9 +336,7 @@ static int __init esp6_offload_init(void) static void __exit esp6_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375b4b4f9bf5..62c997201970 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -75,9 +75,9 @@ * * On SMP we have one ICMP socket per-cpu. */ -static inline struct sock *icmpv6_sk(struct net *net) +static struct sock *icmpv6_sk(struct net *net) { - return *this_cpu_ptr(net->ipv6.icmp_sk); + return this_cpu_read(*net->ipv6.icmp_sk); } static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; memset(&fl6, 0, sizeof(fl6)); + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); + fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b2a55f300318..cf60fae9533b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, &in6addr_any, hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9180c8b6f764..49884f96232b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - + /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; @@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; @@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w) struct fib6_info *rt; for_each_fib6_walker_rt(w) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } + w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the @@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); @@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; @@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, + .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; @@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); - arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; } - /* fib entries are never clones */ - if (arg.filter.flags & RTM_F_CLONED) - goto out; - w = (void *)cb->args[2]; if (!w) { /* New dump: @@ -895,16 +897,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +struct fib6_nh_pcpu_arg { + struct fib6_info *from; + const struct fib6_table *table; +}; + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_pcpu_arg *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg->from, arg->table); + return 0; +} + +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + struct fib6_nh_pcpu_arg arg = { + .from = f6i, + .table = table + }; + + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, + &arg); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); + + if (rt->nh && !list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1101,11 +1147,13 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); @@ -1130,11 +1178,13 @@ add: return -ENOENT; } - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_REPLACE, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); @@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { + if (rt->nh) + list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, if (plen == fn->fn_bit) return fn; - prev = fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; next: /* @@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + fib6_info_release(rt); } @@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; @@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 545e339b8c4f..ad284b1fd308 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> +#include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock); static DEFINE_SPINLOCK(ip6_sk_fl_lock); +DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); +EXPORT_SYMBOL(ipv6_flowlabel_exclusive); + #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ @@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static bool fl_shared_exclusive(struct ip6_flowlabel *fl) +{ + return fl->share == IPV6_FL_S_EXCL || + fl->share == IPV6_FL_S_PROCESS || + fl->share == IPV6_FL_S_USER; +} + static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); @@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head) static void fl_free(struct ip6_flowlabel *fl) { - if (fl) - call_rcu(&fl->rcu, fl_free_rcu); + if (!fl) + return; + + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); + + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) @@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(fl6_sock_lookup); +EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { @@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_deferred_inc(&ipv6_flowlabel_exclusive); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: @@ -854,6 +872,7 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { + static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 21efcd02f337..8e49fd62eea9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -766,91 +891,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. */ err = output(net, sk, frag); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 51fd33294c7c..3752bd3e92ce 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void) { if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); } module_init(ipcomp6_init); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 91801432878c..878fcec14949 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -499,10 +499,8 @@ static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); - if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); - if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(destopt)\n", __func__); + xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); + xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09dd2edfb868..083cc1c94cd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - + /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); - if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..61819ed858b1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. + */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, +#if IS_ENABLED(CONFIG_SYN_COOKIES) + .cookie_init_sequence = __cookie_v6_init_sequence, + .cookie_v6_check = __cookie_v6_check, +#endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + .br_defrag = nf_ct_frag6_gather, +#endif +#if IS_MODULE(CONFIG_IPV6) + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 41325d517478..e77ea1ed5edd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -3,272 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 84322ce81d70..398e1df41406 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -54,26 +54,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } +static void nf_ct_net_pre_exit(struct net *net) +{ + fqdir_pre_exit(net->nf_frag.fqdir); +} + static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { - .init = nf_ct_net_init, - .exit = nf_ct_net_exit, + .init = nf_ct_net_init, + .pre_exit = nf_ct_net_pre_exit, + .exit = nf_ct_net_exit, }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..bbff3e02e302 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 70693bc7ad9d..8a6131991e38 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2b2c0c38b87..ca05b16f1bb9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -250,7 +248,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } +static void __net_exit ipv6_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv6.fqdir); +} + static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { - .init = ipv6_frags_init_net, - .exit = ipv6_frags_exit_net, + .init = ipv6_frags_init_net, + .pre_exit = ipv6_frags_pre_exit_net, + .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { @@ -587,8 +585,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a843cf164c..4d2e6b31a8d6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); -static size_t rt6_nlmsg_size(struct fib6_info *rt); +static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } @@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if (!match->fib6_nsiblings || have_oif_match) + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ - if (!fl6->mp_hash) + if (!fl6->mp_hash && + (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (unlikely(match->nh)) { + nexthop_path_fib6_result(res, fl6->mp_hash); + return; + } + + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, return false; } +struct fib6_nh_dm_arg { + struct net *net; + const struct in6_addr *saddr; + int oif; + int flags; + struct fib6_nh *nh; +}; + +static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_dm_arg *arg = _arg; + + arg->nh = nh; + return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, + arg->flags); +} + +/* returns fib6_nh from nexthop or NULL */ +static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, + struct fib6_result *res, + const struct in6_addr *saddr, + int oif, int flags) +{ + struct fib6_nh_dm_arg arg = { + .net = net, + .saddr = saddr, + .oif = oif, + .flags = flags, + }; + + if (nexthop_is_blackhole(nh)) + return NULL; + + if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) + return arg.nh; + + return NULL; +} + static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { @@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; - if (__rt6_device_match(net, nh, saddr, oif, flags)) { + bool matched = false; + + if (unlikely(spf6i->nh)) { + nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, + oif, flags); + if (nh) + matched = true; + } else { + nh = spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) + matched = true; + } + if (matched) { res->f6i = spf6i; goto out; } @@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } + if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; + return; + +out_blackhole: + res->fib6_flags |= RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -691,6 +766,24 @@ out: return rc; } +struct fib6_nh_frl_arg { + u32 flags; + int oif; + int strict; + int *mpri; + bool *do_rr; + struct fib6_nh *nh; +}; + +static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_frl_arg *arg = _arg; + + arg->nh = nh; + return find_match(nh, arg->flags, arg->oif, arg->strict, + arg->mpri, arg->do_rr); +} + static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, @@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { + bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { @@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; - if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + if (unlikely(f6i->nh)) { + struct fib6_nh_frl_arg arg = { + .flags = f6i->fib6_flags, + .oif = oif, + .strict = strict, + .mpri = mpri, + .do_rr = do_rr + }; + + if (nexthop_is_blackhole(f6i->nh)) { + res->fib6_flags = RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->f6i = f6i; + res->nh = nexthop_fib6_nh(f6i->nh); + return; + } + if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, + &arg)) { + matched = true; + nh = arg.nh; + } + } else { + nh = f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, + mpri, do_rr)) + matched = true; + } + if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; @@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1114,6 +1234,8 @@ restart: rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; + } else if (res.fib6_flags & RTF_REJECT) { + goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, @@ -1125,6 +1247,7 @@ restart: if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { +do_create: rt = ip6_create_rt_rcu(&res); } @@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { - struct rt6_info *pcpu_rt, **p; - - p = this_cpu_ptr(res->f6i->rt6i_pcpu); - pcpu_rt = *p; + struct rt6_info *pcpu_rt; - if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt); + pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); return pcpu_rt; } @@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1539,7 +1707,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_info *f6i = arg; + + fib6_nh_flush_exceptions(nh, f6i); + + return 0; +} + +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + if (f6i->nh) + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, + f6i); + else + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1612,25 +1801,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +struct fib6_nh_excptn_arg { + struct rt6_info *rt; + int plen; +}; + +static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_excptn_arg *arg = _arg; + int err; + + err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); + if (err == 0) + return 1; + + return 0; +} + +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + if (from->nh) { + struct fib6_nh_excptn_arg arg = { + .rt = rt, + .plen = from->fib6_src.plen + }; + int rc; + + /* rc = 1 means an entry was found */ + rc = nexthop_for_each_fib6_nh(from->nh, + rt6_nh_remove_exception_rt, + &arg); + return rc ? 0 : -ENOENT; + } + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} + +struct fib6_nh_match_arg { + const struct net_device *dev; + const struct in6_addr *gw; + struct fib6_nh *match; +}; + +/* determine if fib6_nh has given device and gateway */ +static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_match_arg *arg = _arg; + + if (arg->dev != nh->fib_nh_dev || + (arg->gw && !nh->fib_nh_gw_family) || + (!arg->gw && nh->fib_nh_gw_family) || + (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) + return 0; + + arg->match = nh; + + /* found a match, break the loop */ + return 1; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + struct fib6_nh *fib6_nh; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + if (from->nh) { + struct fib6_nh_match_arg arg = { + .dev = rt->dst.dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); + + if (!arg.match) + return; + fib6_nh = arg.match; + } else { + fib6_nh = from->fib6_nh; + } + fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +struct fib6_nh_age_excptn_arg { + struct fib6_gc_args *gc_args; + unsigned long now; +}; + +static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_age_excptn_arg *arg = _arg; + + fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); + return 0; +} + +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + if (f6i->nh) { + struct fib6_nh_age_excptn_arg arg = { + .gc_args = gc_args, + .now = now + }; + + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, + &arg); + } else { + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); + } +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - rcu_read_unlock(); - - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). + * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.basic.ip_proto = fl6->flowi6_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + struct flow_keys keys; + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, 0); + flkeys = &keys; + } + + /* Inner can be v4 or v6 */ + if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.tags.flow_label = flkeys->tags.flow_label; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2138,6 +2462,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt6->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev + gw. Should be impossible. + */ + if (!arg.match) { + rcu_read_unlock(); + return; + } + + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res, return true; } +struct fib6_nh_rd_arg { + struct fib6_result *res; + struct flowi6 *fl6; + const struct in6_addr *gw; + struct rt6_info **ret; +}; + +static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_rd_arg *arg = _arg; + + arg->res->nh = nh; + return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; + struct fib6_nh_rd_arg arg = { + .res = &res, + .fl6 = fl6, + .gw = &rdfl->gateway, + .ret = &ret + }; struct fib6_info *rt; struct fib6_node *fn; @@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; - if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) - goto out; + if (unlikely(rt->nh)) { + if (nexthop_is_blackhole(rt->nh)) + continue; + /* on match, res->nh is filled in and potentially ret */ + if (nexthop_for_each_fib6_nh(rt->nh, + fib6_nh_redirect_match, + &arg)) + goto out; + } else { + res.nh = rt->fib6_nh; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, + &ret)) + goto out; + } } if (!rt) @@ -2554,7 +2954,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -2781,10 +3181,9 @@ out: return entries > rt_max_size; } -static struct rt6_info *ip6_nh_lookup_table(struct net *net, - struct fib6_config *cfg, - const struct in6_addr *gw_addr, - u32 tbid, int flags) +static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, + const struct in6_addr *gw_addr, u32 tbid, + int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; - struct rt6_info *rt; + int err; table = fib6_get_table(net, tbid); if (!table) - return NULL; + return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); - /* if table lookup failed, fall back to full lookup */ - if (rt == net->ipv6.ip6_null_entry) { - ip6_rt_put(rt); - rt = NULL; - } + err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) + fib6_select_path(net, res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); - return rt; + return err; } static int ip6_route_check_nh_onlink(struct net *net, @@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net, const struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; - u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; - struct fib6_info *from; - struct rt6_info *grt; + struct fib6_result res = {}; int err; - err = 0; - grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); - if (grt) { - rcu_read_lock(); - from = rcu_dereference(grt->from); - if (!grt->dst.error && - /* ignore match if it is the default route */ - from && !ipv6_addr_any(&from->fib6_dst.addr) && - (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); - err = -EINVAL; - } - rcu_read_unlock(); - - ip6_rt_put(grt); + err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); + if (!err && !(res.fib6_flags & RTF_REJECT) && + /* ignore match if it is the default route */ + !ipv6_addr_any(&res.f6i->fib6_dst.addr) && + (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); + err = -EINVAL; } return err; @@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net, { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; - struct rt6_info *grt = NULL; + int flags = RT6_LOOKUP_F_IFACE; + struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { - int flags = RT6_LOOKUP_F_IFACE; - - grt = ip6_nh_lookup_table(net, cfg, gw_addr, - cfg->fc_table, flags); - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } + err = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags, &res); + /* gw_addr can not require a gateway or resolve to a reject + * route. If a device is given, it must match the result. + */ + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family || + (dev && dev != res.nh->fib_nh_dev)) + err = -EHOSTUNREACH; } - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); + if (err < 0) { + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + }; - if (!grt) - goto out; + err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family) + err = -EHOSTUNREACH; + if (err) + return err; + + fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); + } + + err = 0; if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (dev != res.nh->fib_nh_dev) + err = -EHOSTUNREACH; } else { - *_dev = dev = grt->dst.dev; - *idev = grt->rt6i_idev; + *_dev = dev = res.nh->fib_nh_dev; dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + *idev = in6_dev_get(dev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - - ip6_rt_put(grt); - -out: return err; } @@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, goto out; } + rcu_read_lock(); + if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, idev); + rcu_read_unlock(); + if (err) goto out; } @@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; } } - goto set_dev; + goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { @@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; -set_dev: + +pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; @@ -3095,6 +3496,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; + struct nexthop *nh = NULL; struct fib6_table *table; + struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; @@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif + if (cfg->fc_nh_id) { + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out; + } + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out; + } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; @@ -3197,19 +3642,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); - if (err) - goto out; + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + goto out; + } + if (rt->fib6_src.plen) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto out; + } + rt->nh = nh; + fib6_nh = nexthop_fib6_nh(rt->nh); + } else { + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes - */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + fib6_nh = rt->fib6_nh; + + /* We cannot add true routes via loopback here, they would + * result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, + addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + info->skip_notify_kernel = 1; + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, + rt->fib6_nsiblings, + NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -3323,7 +3790,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3339,10 +3806,49 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + +struct fib6_nh_del_cached_rt_arg { + struct fib6_config *cfg; + struct fib6_info *f6i; +}; + +static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_del_cached_rt_arg *arg = _arg; + int rc; + + rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); + return rc != -ESRCH ? rc : 0; +} + +static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) +{ + struct fib6_nh_del_cached_rt_arg arg = { + .cfg = cfg, + .f6i = f6i + }; + + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + if (rt->nh && cfg->fc_nh_id && + rt->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; - int rc; - - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + int rc = 0; + + if (rt->nh) { + rc = ip6_del_cached_rt_nh(cfg, rt); + } else if (cfg->fc_nh_id) { + continue; + } else { + nh = rt->fib6_nh; + rc = ip6_del_cached_rt(cfg, rt, nh); + } + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) + continue; + if (cfg->fc_protocol && + cfg->fc_protocol != rt->fib6_protocol) + continue; + + if (rt->nh) { + if (!fib6_info_hold_safe(rt)) + continue; + rcu_read_unlock(); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + if (cfg->fc_nh_id) + continue; + + nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) - continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) - continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); @@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev. Should be impossible + */ + if (!arg.match) + goto out; + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + /* these routes do not use nexthops */ + if (rt->nh) + continue; + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh; + + /* RA routes do not use nexthops */ + if (rt->nh) + continue; + nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) @@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (!rt->nh && + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh; + /* RA routes do not use nexthops */ + if (rt->nh) + return 0; + + nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) return NULL; } +/* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && !rt->nh && + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +/* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.fib6_null_entry) + if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); + arg->f6i = f6i; + if (f6i->nh) { + /* fib6_nh_mtu_change only returns 0, so this is safe */ + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, + arg); } - return 0; + + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + if (tb[RTA_NH_ID]) { + if (tb[RTA_GATEWAY] || tb[RTA_OIF] || + tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + goto errout; + } + cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); + } + if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } + if (list_empty(&rt6_nh_list)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; + /* For add and replace, send one notification with all nexthops. For + * append, send one notification with all appended nexthops. + */ + info->skip_notify_kernel = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); @@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; + err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, + rt_notif, nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; @@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_nh_id && + !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } + if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { @@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct fib6_info *rt) +/* add the overhead of this fib6_nh to nexthop_len */ +static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { - int nexthop_len = 0; + int *nexthop_len = arg; - if (rt->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16); /* RTA_GATEWAY */ - nexthop_len *= rt->fib6_nsiblings; + if (nh->fib_nh_lws) { + /* RTA_ENCAP_TYPE */ + *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); + /* RTA_ENCAP */ + *nexthop_len += nla_total_size(2); + } + + return 0; +} + +static size_t rt6_nlmsg_size(struct fib6_info *f6i) +{ + int nexthop_len; + + if (f6i->nh) { + nexthop_len = nla_total_size(4); /* RTA_NH_ID */ + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, + &nexthop_len); + } else { + struct fib6_nh *nh = f6i->fib6_nh; + + nexthop_len = 0; + if (f6i->fib6_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + lwtunnel_get_encap_size(nh->fib_nh_lws); + + nexthop_len *= f6i->fib6_nsiblings; + } + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, + unsigned char *flags) +{ + if (nexthop_is_multipath(nh)) { + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (nexthop_mpath_fill_node(skb, nh)) + goto nla_put_failure; + + nla_nest_end(skb, mp); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = nexthop_fib6_nh(nh); + if (fib_nexthop_info(skb, &fib6_nh->nh_common, + flags, false) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt6 = (struct rt6_info *)dst; struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; + unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; @@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); - } else { - unsigned char nh_flags = 0; + } else if (rt->nh) { + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) + goto nla_put_failure; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (nexthop_is_blackhole(rt->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; + } else { + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4836,10 +5516,28 @@ nla_put_failure: return -EMSGSIZE; } +static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + if (nh->fib_nh_dev == dev) + return 1; + + return 0; +} + static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->nh) { + struct net_device *_dev = (struct net_device *)dev; + + return !!nexthop_for_each_fib6_nh(f6i->nh, + fib6_info_nh_uses_dev, + _dev); + } + + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, return false; } -int rt6_dump_route(struct fib6_info *rt, void *p_arg) +struct fib6_nh_exception_dump_walker { + struct rt6_rtnl_dump_arg *dump; + struct fib6_info *rt; + unsigned int flags; + unsigned int skip; + unsigned int count; +}; + +static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_nh_exception_dump_walker *w = arg; + struct rt6_rtnl_dump_arg *dump = w->dump; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i, err; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); + if (!bucket) + return 0; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (w->skip) { + w->skip--; + continue; + } + + /* Expiration of entries doesn't bump sernum, insertion + * does. Removal is triggered by insertion, so we can + * rely on the fact that if entries change between two + * partial dumps, this node is scanned again completely, + * see rt6_insert_exception() and fib6_dump_table(). + * + * Count expired entries we go through as handled + * entries that we'll skip next time, in case of partial + * node dump. Otherwise, if entries expire meanwhile, + * we'll skip the wrong amount. + */ + if (rt6_check_expired(rt6_ex->rt6i)) { + w->count++; + continue; + } + + err = rt6_fill_node(dump->net, dump->skb, w->rt, + &rt6_ex->rt6i->dst, NULL, NULL, 0, + RTM_NEWROUTE, + NETLINK_CB(dump->cb->skb).portid, + dump->cb->nlh->nlmsg_seq, w->flags); + if (err) + return err; + + w->count++; + } + bucket++; + } + + return 0; +} + +/* Return -1 if done with node, number of handled routes on partial dump */ +int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; + int count = 0; if (rt == net->ipv6.fib6_null_entry) - return 0; + return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ - return 1; + return -1; } - if (filter->filter_set) { - if ((filter->rt_type && rt->fib6_type != filter->rt_type) || - (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || - (filter->protocol && rt->fib6_protocol != filter->protocol)) { - return 1; - } + if (filter->filter_set && + ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol))) { + return -1; + } + + if (filter->filter_set || + !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } - return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, - RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, flags); + if (filter->dump_routes) { + if (skip) { + skip--; + } else { + if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, + 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, flags)) { + return 0; + } + count++; + } + } + + if (filter->dump_exceptions) { + struct fib6_nh_exception_dump_walker w = { .dump = arg, + .rt = rt, + .flags = flags, + .skip = skip, + .count = 0 }; + int err; + + rcu_read_lock(); + if (rt->nh) { + err = nexthop_for_each_fib6_nh(rt->nh, + rt6_nh_dump_exceptions, + &w); + } else { + err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); + } + rcu_read_unlock(); + + if (err) + return count += w.count; + } + + return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, @@ -5126,6 +5922,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; @@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index e15cd37024fd..dc4c91e0bfb8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ static int zero; static int one = 1; +static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a14ea37d2df..d56a9019a0fe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } @@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - if (sk) - mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) { + mark = inet_twsk(sk)->tw_mark; + /* autoflowlabel relies on buff->hash */ + skb_set_hash(buff, inet_twsk(sk)->tw_txhash, + PKT_HASH_TYPE_L4); + } else { + mark = sk->sk_mark; + } + buff->tstamp = tcp_transmit_time(sk); + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif + __be32 label = 0; + struct net *net; int oif = 0; if (th->rst) @@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); @@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = inet6_lookup_listener(net, &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, @@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) trace_tcp_send_reset(sk, skb); + if (sk->sk_state == TCP_TIME_WAIT) + label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + } else { + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, + label); #ifdef CONFIG_TCP_MD5SIG out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70b01bd95022..827fe7385078 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,16 +54,6 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, - struct udp_hslot *hslot2, struct sk_buff *skb) + int dif, int sdif, struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness; @@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, @@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result; - bool exact_dif = udp6_lib_exact_dif_match(net, skb); hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, + daddr, hnum, dif, sdif, hslot2, skb); if (!result) { hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); @@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, - exact_dif, hslot2, - skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); @@ -1332,7 +1319,7 @@ do_udp_sendmsg: fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -1384,7 +1371,7 @@ do_udp_sendmsg: } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 5bdca3d5d6b7..78daadecbdef 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -21,137 +21,6 @@ #include <net/ipv6.h> #include <net/addrconf.h> -static void -__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi6 *fl6 = &fl->u.ip6; - - /* Initialize temporary selector matching only - * to current session. */ - *(struct in6_addr *)&sel->daddr = fl6->daddr; - *(struct in6_addr *)&sel->saddr = fl6->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl6->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl6->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET6; - sel->prefixlen_d = 128; - sel->prefixlen_s = 128; - sel->proto = fl6->flowi6_proto; - sel->ifindex = fl6->flowi6_oif; -} - -static void -xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) - memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); - memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) - memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET6; -} - -/* distribution counting sort function for xfrm_state and xfrm_tmpl */ -static int -__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) -{ - int count[XFRM_MAX_DEPTH] = { }; - int class[XFRM_MAX_DEPTH]; - int i; - - for (i = 0; i < n; i++) { - int c; - class[i] = c = cmp(src[i]); - count[c]++; - } - - for (i = 2; i < maxclass; i++) - count[i] += count[i - 1]; - - for (i = 0; i < n; i++) { - dst[count[class[i] - 1]++] = src[i]; - src[i] = NULL; - } - - return 0; -} - -/* - * Rule for xfrm_state: - * - * rule 1: select IPsec transport except AH - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec transport AH - * rule 4: select IPsec tunnel - * rule 5: others - */ -static int __xfrm6_state_sort_cmp(void *p) -{ - struct xfrm_state *v = p; - - switch (v->props.mode) { - case XFRM_MODE_TRANSPORT: - if (v->id.proto != IPPROTO_AH) - return 1; - else - return 3; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 4; - } - return 5; -} - -static int -__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_state_sort_cmp, 6); -} - -/* - * Rule for xfrm_tmpl: - * - * rule 1: select IPsec transport - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec tunnel - * rule 4: others - */ -static int __xfrm6_tmpl_sort_cmp(void *p) -{ - struct xfrm_tmpl *v = p; - switch (v->mode) { - case XFRM_MODE_TRANSPORT: - return 1; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 3; - } - return 4; -} - -static int -__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_tmpl_sort_cmp, 5); -} - int xfrm6_extract_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, - .eth_proto = htons(ETH_P_IPV6), - .owner = THIS_MODULE, - .init_tempsel = __xfrm6_init_tempsel, - .init_temprop = xfrm6_init_temprop, - .tmpl_sort = __xfrm6_tmpl_sort, - .state_sort = __xfrm6_state_sort, .output = xfrm6_output, .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, diff --git a/net/key/af_key.c b/net/key/af_key.c index fe5fc4bab7ee..b67ed3a8486c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -928,8 +928,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); @@ -944,8 +943,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); - if (!addr->sadb_address_prefixlen) - BUG(); + BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 6e2b4b9267e1..35bb4f3bdbe0 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -31,7 +31,6 @@ #include "l2tp_core.h" static struct dentry *rootdir; -static struct dentry *tunnels; struct l2tp_dfs_seq_data { struct net *net; @@ -326,32 +325,18 @@ static const struct file_operations l2tp_dfs_fops = { static int __init l2tp_debugfs_init(void) { - int rc = 0; - rootdir = debugfs_create_dir("l2tp", NULL); - if (IS_ERR(rootdir)) { - rc = PTR_ERR(rootdir); - rootdir = NULL; - goto out; - } - tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); - if (tunnels == NULL) - rc = -EIO; + debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); pr_info("L2TP debugfs support\n"); -out: - if (rc) - pr_warn("unable to init\n"); - - return rc; + return 0; } static void __exit l2tp_debugfs_exit(void) { - debugfs_remove(tunnels); - debugfs_remove(rootdir); + debugfs_remove_recursive(rootdir); } module_init(l2tp_debugfs_init); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 1a76a0a4e3ab..687e23a8b326 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -536,7 +536,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -577,7 +577,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index cfc9fcb97465..f35899d45a9a 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup + * This function does not hold refcnt on the returned dst. + * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, @@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct dst_entry *dst = NULL; struct net_device *dev; + WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); @@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); - - rcu_read_unlock(); } return dst; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 5d2d1f746b91..3c03f6512c5f 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -68,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb) lapb_put(lapb); } } -EXPORT_SYMBOL(lapb_register); /* * Add a socket to the bound sockets list. @@ -115,7 +114,6 @@ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); - if (!lapb) goto out; @@ -167,6 +165,7 @@ out: write_unlock_bh(&lapb_list_lock); return rc; } +EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a1973a26c7fc..4f12d042c89c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,6 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018-2019 Intel Corporation * Copyright (C) 2018 Intel Corporation */ @@ -974,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_BEACON | BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_TWT; int err; int prev_beacon_int; @@ -1044,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.dtim_period = params->dtim_period; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; + sdata->vif.bss_conf.twt_responder = params->twt_responder; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1465,7 +1468,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, return ret; } - if (params->supported_rates) { + if (params->supported_rates && params->supported_rates_len) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, sband, params->supported_rates, params->supported_rates_len, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 271bc2b676a4..2e7f75938c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -272,6 +272,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(EXT_KEY_ID_NATIVE), + FLAG(NO_AMPDU_KEYBORDER_SUPPORT), #undef FLAG }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 3509ce0daea3..7b8735ced2a1 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -339,9 +339,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); - if (!key->debugfs.dir) - return; - sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f1f2e1c7ac0c..b1438fd4d876 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -815,9 +815,8 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) sprintf(buf, "netdev:%s", sdata->name); sdata->vif.debugfs_dir = debugfs_create_dir(buf, sdata->local->hw.wiphy->debugfsdir); - if (sdata->vif.debugfs_dir) - sdata->debugfs.subdir_stations = debugfs_create_dir("stations", - sdata->vif.debugfs_dir); + sdata->debugfs.subdir_stations = debugfs_create_dir("stations", + sdata->vif.debugfs_dir); add_files(sdata); } @@ -842,8 +841,5 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) return; sprintf(buf, "netdev:%s", sdata->name); - if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf)) - sdata_err(sdata, - "debugfs: failed to rename debugfs dir to %s\n", - buf); + debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3fd79ccb293b..c8ad20c28c43 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -957,8 +957,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) * dir might still be around. */ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); - if (!sta->debugfs_dir) - return; DEBUGFS_ADD(flags); DEBUGFS_ADD(aid); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 157ff5f890d2..dd60f6428049 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -269,50 +269,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key) assert_key_lock(local); sta->ptk_idx = key->conf.keyidx; + + if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT)) + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } -static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, - struct ieee80211_key *new_key, - bool pairwise) +static void ieee80211_pairwise_rekey(struct ieee80211_key *old, + struct ieee80211_key *new) { - struct ieee80211_sub_if_data *sdata; - struct ieee80211_local *local; - struct sta_info *sta; - int ret; - - /* Aggregation sessions are OK when running on SW crypto. - * A broken remote STA may cause issues not observed with HW - * crypto, though. - */ - if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) - return 0; + struct ieee80211_local *local = new->local; + struct sta_info *sta = new->sta; + int i; - assert_key_lock(old_key->local); - sta = old_key->sta; + assert_key_lock(local); - /* Unicast rekey without Extended Key ID needs special handling */ - if (new_key && sta && pairwise && - rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) { - local = old_key->local; - sdata = old_key->sdata; + if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { + /* Extended Key ID key install, initial one or rekey */ + + if (sta->ptk_idx != INVALID_PTK_KEYIDX && + ieee80211_hw_check(&local->hw, + NO_AMPDU_KEYBORDER_SUPPORT)) { + /* Aggregation Sessions with Extended Key ID must not + * mix MPDUs with different keyIDs within one A-MPDU. + * Tear down any running Tx aggregation and all new + * Rx/Tx aggregation request during rekey if the driver + * asks us to do so. (Blocking Tx only would be + * sufficient but WLAN_STA_BLOCK_BA gets the job done + * for the few ms we need it.) + */ + set_sta_flag(sta, WLAN_STA_BLOCK_BA); + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + ___ieee80211_stop_tx_ba_session(sta, i, + AGG_STOP_LOCAL_REQUEST); + mutex_unlock(&sta->ampdu_mlme.mtx); + } + } else if (old) { + /* Rekey without Extended Key ID. + * Aggregation sessions are OK when running on SW crypto. + * A broken remote STA may cause issues not observed with HW + * crypto, though. + */ + if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + return; - /* Stop TX till we are on the new key */ - old_key->flags |= KEY_FLAG_TAINTED; + /* Stop Tx till we are on the new key */ + old->flags |= KEY_FLAG_TAINTED; ieee80211_clear_fast_xmit(sta); - - /* Aggregation sessions during rekey are complicated due to the - * reorder buffer and retransmits. Side step that by blocking - * aggregation during rekey and tear down running sessions. - */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_LOCAL_REQUEST); } - if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", @@ -320,18 +331,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, /* Flushing the driver queues *may* help prevent * the clear text leaks and freezes. */ - ieee80211_flush_queues(local, sdata, false); + ieee80211_flush_queues(local, old->sdata, false); } } - - ieee80211_key_disable_hw_accel(old_key); - - if (new_key) - ret = ieee80211_key_enable_hw_accel(new_key); - else - ret = 0; - - return ret; } static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, @@ -389,7 +391,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, mutex_unlock(&sdata->local->key_mtx); } - static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, bool pairwise, @@ -397,7 +398,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *new) { int idx; - int ret; + int ret = 0; bool defunikey, defmultikey, defmgmtkey; /* caller must provide at least one old/new */ @@ -409,16 +410,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); + if (new && sta && pairwise) { + /* Unicast rekey needs special handling. With Extended Key ID + * old is still NULL for the first rekey. + */ + ieee80211_pairwise_rekey(old, new); + } + if (old) { idx = old->conf.keyidx; - ret = ieee80211_hw_key_replace(old, new, pairwise); + + if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + ieee80211_key_disable_hw_accel(old); + + if (new) + ret = ieee80211_key_enable_hw_accel(new); + } } else { /* new must be provided in case old is not */ idx = new->conf.keyidx; if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); - else - ret = 0; } if (ret) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 55583b71ffaf..85e416248753 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -351,11 +351,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata_lock(sdata); /* Copy the addresses to the bss_conf list */ - ifa = idev->ifa_list; + ifa = rtnl_dereference(idev->ifa_list); while (ifa) { if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN) bss_conf->arp_addr_list[c] = ifa->ifa_address; - ifa = ifa->ifa_next; + ifa = rtnl_dereference(ifa->ifa_next); c++; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 379d2ab6d327..96014f459a2b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,6 +3155,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta, IEEE80211_HE_MAC_CAP0_TWT_RES; } +static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee802_11_elems *elems) +{ + bool twt = ieee80211_twt_req_supported(sta, elems); + + if (sdata->vif.bss_conf.twt_requester != twt) { + sdata->vif.bss_conf.twt_requester = twt; + return BSS_CHANGED_TWT; + } + return 0; +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -3337,8 +3350,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; - bss_conf->twt_requester = - ieee80211_twt_req_supported(sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; @@ -3998,6 +4010,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); + changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, @@ -4948,7 +4962,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, basic_rates = BIT(min_rate_index); } - new_sta->sta.supp_rates[cbss->channel->band] = rates; + if (rates) + new_sta->sta.supp_rates[cbss->channel->band] = rates; + else + sdata_info(sdata, + "No rates found, keeping mandatory only\n"); + sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 6e5961d7f639..60ef8972b254 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -199,6 +199,10 @@ static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); + else + cfg80211_tx_mgmt_expired(&roc->sdata->wdev, + roc->mgmt_tx_cookie, + roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 47ee36677c2b..a1e9fc7878aa 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -354,8 +354,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, break; } WARN_ONCE(i == sband->n_bitrates, - "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n", + "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", + sta ? sta->addr : NULL, sta ? sta->supp_rates[sband->band] : -1, + sband->band, rate_mask, rate_flags); info->control.rates[0].count = @@ -366,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, } -bool rate_control_send_low(struct ieee80211_sta *pubsta, - void *priv_sta, - struct ieee80211_tx_rate_control *txrc) +static bool rate_control_send_low(struct ieee80211_sta *pubsta, + struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; @@ -376,7 +377,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) { + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -402,7 +403,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, } return false; } -EXPORT_SYMBOL(rate_control_send_low); static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { @@ -885,26 +885,29 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; - if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { - ista = &sta->sta; - priv_sta = sta->rate_ctrl_priv; - } - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } + if (rate_control_send_low(sta ? &sta->sta : NULL, txrc)) + return; + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; + if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { + ista = &sta->sta; + priv_sta = sta->rate_ctrl_priv; + } + if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { - ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index a34e9c2ca626..ee86c3333999 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, int delta; int sampling_ratio; - /* management/no-ack frames do not use rate control */ - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - /* check multi-rate-retry capabilities & adjust lookaround_rate */ mrr_capable = mp->has_mrr && !txrc->rts && diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 298a1acb3ce5..5a882da82f0e 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1093,9 +1093,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct minstrel_priv *mp = priv; int sample_idx; - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - if (!msp->is_ht) return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 187f62a48b2b..95eb8220e2e4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation */ #include <linux/module.h> @@ -401,6 +401,47 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); + for (i = 0; i < NUM_NL80211_BANDS; i++) { + u32 mandatory = 0; + int r; + + if (!hw->wiphy->bands[i]) + continue; + + switch (i) { + case NL80211_BAND_2GHZ: + /* + * We use both here, even if we cannot really know for + * sure the station will support both, but the only use + * for this is when we don't know anything yet and send + * management frames, and then we'll pick the lowest + * possible rate anyway. + * If we don't include _G here, we cannot find a rate + * in P2P, and thus trigger the WARN_ONCE() in rate.c + */ + mandatory = IEEE80211_RATE_MANDATORY_B | + IEEE80211_RATE_MANDATORY_G; + break; + case NL80211_BAND_5GHZ: + mandatory = IEEE80211_RATE_MANDATORY_A; + break; + case NL80211_BAND_60GHZ: + WARN_ON(1); + mandatory = 0; + break; + } + + for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { + struct ieee80211_rate *rate; + + rate = &hw->wiphy->bands[i]->bitrates[r]; + + if (!(rate->flags & mandatory)) + continue; + sta->sta.supp_rates[i] |= BIT(r); + } + } + sta->sta.smps_mode = IEEE80211_SMPS_OFF; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 21025c2c605b..d59742408d9b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -651,6 +651,17 @@ config NFT_TPROXY help This makes transparent proxy support available in nftables. +config NFT_SYNPROXY + tristate "Netfilter nf_tables SYNPROXY expression support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY expression allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 72cca6b48960..deada20975ff 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o +obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b96fd3f54705..5d5bdf450091 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -520,7 +520,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, ret = -EPERM; return ret; case NF_QUEUE: - ret = nf_queue(skb, state, e, s, verdict); + ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; @@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); - -int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) -{ - if (writable_len > skb->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (!skb_cloned(skb)) { - if (writable_len <= skb_headlen(skb)) - return 1; - } else if (skb_clone_writable(skb, writable_len)) - return 1; - - if (writable_len <= skb_headlen(skb)) - writable_len = 0; - else - writable_len -= skb_headlen(skb); - - return !!__pskb_pull_tail(skb, writable_len); -} -EXPORT_SYMBOL(skb_make_writable); - /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8acc4e173167..063df74b4647 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e3884b0cca91..11ff9d4a7006 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:ip type */ @@ -28,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index b73c37b3a791..ca7ac4a25ada 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -2,7 +2,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ @@ -28,7 +27,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index d8c140553379..704a0dda1609 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:port type */ @@ -23,7 +22,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3cdf171cd468..2e151856ad99 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ @@ -48,7 +48,7 @@ static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); @@ -1290,11 +1290,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; + int ret; - /* Second pass, so parser can't fail */ - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, ip_set_setname_policy, - NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_setname_policy, NULL); + if (ret) + return ret; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { @@ -1541,10 +1543,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, + ip_set_adt_policy, NULL); + if (ret) { + nlmsg_free(skb2); + return ret; + } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; @@ -1558,10 +1564,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_ad(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + enum ipset_adt adt, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1590,18 +1598,17 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1610,56 +1617,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, return ret; } -static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int ip_set_uadd(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { - struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; - const struct nlattr *nla; - u32 flags = flag_exist(nlh); - bool use_lineno; - int ret = 0; - - if (unlikely(protocol_min_failed(attr) || - !attr[IPSET_ATTR_SETNAME] || - !((attr[IPSET_ATTR_DATA] != NULL) ^ - (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] && - !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] && - (!flag_nested(attr[IPSET_ATTR_ADT]) || - !attr[IPSET_ATTR_LINENO])))) - return -IPSET_ERR_PROTOCOL; - - set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (!set) - return -ENOENT; - - use_lineno = !!attr[IPSET_ATTR_LINENO]; - if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, - use_lineno); - } else { - int nla_rem; + return ip_set_ad(net, ctnl, skb, + IPSET_ADD, nlh, attr, extack); +} - nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(*tb)); - if (nla_type(nla) != IPSET_ATTR_DATA || - !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, - flags, use_lineno); - if (ret < 0) - return ret; - } - } - return ret; +static int ip_set_udel(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + return ip_set_ad(net, ctnl, skb, + IPSET_DEL, nlh, attr, extack); } static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2384e36aef5c..2b8f959574b4 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ /* Get Layer-4 data from the packets */ diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 10f619625abd..0feb77fa9edc 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef _IP_SET_HASH_GEN_H #define _IP_SET_HASH_GEN_H @@ -622,7 +621,7 @@ retry: goto cleanup; } m->size = AHASH_INIT_SIZE; - extsize = ext_size(AHASH_INIT_SIZE, dsize); + extsize += ext_size(AHASH_INIT_SIZE, dsize); RCU_INIT_POINTER(hbucket(t, key), m); } else if (m->pos >= m->size) { struct hbucket *ht; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 69d7576be2e6..f4432d9fcad0 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip type */ @@ -27,7 +26,7 @@ #define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 6fe1ec0d2154..7a1734aad0c5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 74ec7e097e34..32e240658334 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ced57d63b01f..15d419353179 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 905f6cf0f55e..7a4d7afd4121 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ @@ -31,7 +30,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 853e772ab4d9..d94c585d33c5 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ @@ -20,7 +19,7 @@ #define IPSET_TYPE_REV_MAX 0 MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 06c91e49bf25..c259cbc3ef45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net type */ @@ -28,7 +27,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 0a8cbcdfb42b..87b29f971226 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,iface type */ @@ -29,7 +28,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 832e4f5491cb..a3ae69bfee66 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> */ diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index a4f3f15b874a..799f2272cc65 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,port type */ @@ -30,7 +29,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index e54d415405f3..a82b70e8b9a6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8ada318bf09d..6f9ead6319e0 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the list:set type */ @@ -19,7 +18,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index bfd4365a8d73..4515056ef1c2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -358,7 +358,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); @@ -435,7 +435,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d5103a9eb302..46f06f92ab8f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -34,6 +34,8 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ +#include <net/gue.h> +#include <net/gre.h> #include <net/route.h> #include <net/ip6_checksum.h> #include <net/netns/generic.h> /* net_generic() */ @@ -892,7 +894,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 @@ -1282,7 +1284,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); - if (!skb_make_writable(skb, iph->len)) + if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ @@ -1574,6 +1576,73 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 1; } +/* Check the UDP tunnel and return its header length */ +static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct udphdr _udph, *udph; + struct ip_vs_dest *dest; + + udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!udph) + goto unk; + offset += sizeof(struct udphdr); + dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + struct guehdr _gueh, *gueh; + + gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); + if (!gueh) + goto unk; + if (gueh->control != 0 || gueh->version != 0) + goto unk; + /* Later we can support also IPPROTO_IPV6 */ + if (gueh->proto_ctype != IPPROTO_IPIP) + goto unk; + *proto = gueh->proto_ctype; + return sizeof(struct udphdr) + sizeof(struct guehdr) + + (gueh->hlen << 2); + } + +unk: + return 0; +} + +/* Check the GRE tunnel and return its header length */ +static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct gre_base_hdr _greh, *greh; + struct ip_vs_dest *dest; + + greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); + if (!greh) + goto unk; + dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 type; + + /* Only support version 0 and C (csum) */ + if ((greh->flags & ~GRE_CSUM) != 0) + goto unk; + type = greh->protocol; + /* Later we can support also IPPROTO_IPV6 */ + if (type != htons(ETH_P_IP)) + goto unk; + *proto = IPPROTO_IPIP; + return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + } + +unk: + return 0; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1593,6 +1662,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool ipip, new_cp = false; + union nf_inet_addr *raddr; *related = 1; @@ -1631,20 +1701,56 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { + struct ip_vs_dest *dest; + if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; + dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); + /* Only for known tunnel */ + if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) + return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ ipip = true; + } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ + cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ + /* Error for our tunnel must arrive at LOCAL_IN */ + (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { + __u8 iproto; + int ulen; + + /* Non-first fragment has no UDP header */ + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + offset2 = offset + cih->ihl * 4; + if (cih->protocol == IPPROTO_UDP) + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + else + ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + if (ulen > 0) { + /* Skip IP and UDP/GRE tunnel headers */ + offset = offset2 + ulen; + /* Now we should be at the original IP header */ + cih = skb_header_pointer(skb, offset, sizeof(_ciph), + &_ciph); + if (cih && cih->version == 4 && cih->ihl >= 5 && + iproto == IPPROTO_IPIP) + ipip = true; + else + return NF_ACCEPT; + } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 741d91aa4a8d..07e0967bf129 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -510,15 +510,37 @@ static inline unsigned int ip_vs_rs_hashkey(int af, static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; + __be16 port; if (dest->in_rs_table) return; + switch (IP_VS_DFWD_METHOD(dest)) { + case IP_VS_CONN_F_MASQ: + port = dest->port; + break; + case IP_VS_CONN_F_TUNNEL: + switch (dest->tun_type) { + case IP_VS_CONN_F_TUNNEL_TYPE_GUE: + port = dest->tun_port; + break; + case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + case IP_VS_CONN_F_TUNNEL_TYPE_GRE: + port = 0; + break; + default: + return; + } + break; + default: + return; + } + /* * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; @@ -550,7 +572,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } @@ -580,7 +603,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* Find real service record by <af,addr,tun_port>. + * In case of multiple records with the same <af,addr,tun_port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port) +{ + struct ip_vs_dest *dest; + unsigned int hash; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, tun_port); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->tun_port == tun_port && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } @@ -826,24 +879,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* Need to rehash? */ + if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != + IP_VS_DFWD_METHOD(dest) || + udest->tun_type != dest->tun_type || + udest->tun_port != dest->tun_port) + ip_vs_rs_unhash(dest); + /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { - /* - * Put the real service in rs_table if not present. - * For now only for NAT! - */ - ip_vs_rs_hash(ipvs, dest); /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); + /* Put the real service in rs_table if not present. */ + ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); @@ -2904,6 +2962,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3210,6 +3269,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3330,7 +3391,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh, *nla_tun_type, *nla_tun_port; + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; @@ -3338,6 +3400,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3353,6 +3416,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index c244b2545e24..cf925906f59b 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -267,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { @@ -433,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index b58ddb7dffd1..a0921adc31a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { @@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 915ac8206076..000d961b97e4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -159,7 +159,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { @@ -237,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 379140075e95..153d89647c87 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -148,7 +148,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { @@ -231,7 +231,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index e101eda05d55..9c464d24beec 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -29,6 +29,7 @@ #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/gue.h> +#include <net/gre.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -36,6 +37,7 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> @@ -275,7 +277,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return false; ipv6_hdr(skb)->hop_limit--; @@ -290,7 +292,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; /* Decrease ttl */ @@ -381,8 +383,19 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -536,8 +549,19 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -792,7 +816,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -881,7 +905,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1002,17 +1026,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); struct udphdr *udph; /* Our new UDP header */ struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } - skb_push(skb, sizeof(struct guehdr)); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); gueh = (struct guehdr *)skb->data; gueh->control = 0; gueh->version = 0; - gueh->hlen = 0; + gueh->hlen = optlen >> 2; gueh->flags = 0; gueh->proto_ctype = *next_protocol; + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); @@ -1029,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, return 0; } +static void +ipvs_gre_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 proto = *next_protocol == IPPROTO_IPIP ? + htons(ETH_P_IP) : htons(ETH_P_IPV6); + __be16 tflags = 0; + size_t hdrlen; + + if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + + hdrlen = gre_calc_hlen(tflags); + gre_build_header(skb, hdrlen, tflags, proto, 0, 0); + + *next_protocol = IPPROTO_GRE; +} + /* * IP Tunneling transmitter * @@ -1066,6 +1147,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1088,9 +1170,28 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; + } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; @@ -1101,8 +1202,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1111,8 +1226,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1170,6 +1296,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1193,9 +1320,28 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + max_headroom += gre_hdrlen; + } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, @@ -1204,8 +1350,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1214,8 +1374,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -1400,7 +1571,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1489,7 +1660,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index e52fcb9c9a96..921a7b95be68 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -37,12 +37,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (ifa->ifa_broadcast == iph->daddr) { mask = ifa->ifa_mask; break; } - } endfor_ifa(in_dev); + } } if (mask == 0) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f4f9b8344a32..bdfeacee0817 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -749,9 +749,6 @@ begin: continue; } - if (nf_ct_is_dying(ct)) - continue; - if (nf_ct_key_equal(h, tuple, zone, net)) return h; } @@ -777,20 +774,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct; rcu_read_lock(); -begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { + /* We have a candidate that matches the tuple we're interested + * in, try to obtain a reference and re-check tuple + */ ct = nf_ct_tuplehash_to_ctrack(h); - if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) - h = NULL; - else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { - nf_ct_put(ct); - goto begin; - } + if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(nf_ct_key_equal(h, tuple, zone, net))) + goto found; + + /* TYPESAFE_BY_RCU recycled the candidate */ + nf_ct_put(ct); } + + h = NULL; } +found: rcu_read_unlock(); return h; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index fac6986d37a8..6497e5fc0871 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Jozsef Kadlecsik <kadlec@netfilter.org> * * For more information, please see http://nath323.sourceforge.net/ */ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 37bb530d848f..a0560d175a7f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_bridge.h> #include <net/netfilter/nf_log.h> #include <linux/ip.h> @@ -120,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -static unsigned int nf_confirm(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; @@ -154,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb, /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } +EXPORT_SYMBOL_GPL(nf_confirm); static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, @@ -442,12 +442,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) return 0; } +static struct nf_ct_bridge_info *nf_ct_bridge_info; + static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); - bool fixup_needed = false; + bool fixup_needed = false, retry = true; int err = 0; - +retry: mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { @@ -487,6 +489,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) fixup_needed = true; break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) { + if (!retry) { + err = -EPROTO; + goto out_unlock; + } + mutex_unlock(&nf_ct_proto_mutex); + request_module("nf_conntrack_bridge"); + retry = false; + goto retry; + } + if (!try_module_get(nf_ct_bridge_info->me)) { + err = -EPROTO; + goto out_unlock; + } + cnet->users_bridge++; + if (cnet->users_bridge > 1) + goto out_unlock; + + err = nf_register_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + if (err) + cnet->users_bridge = 0; + else + fixup_needed = true; + break; default: err = -EPROTO; break; @@ -519,47 +547,99 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) ARRAY_SIZE(ipv6_conntrack_ops)); break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) + break; + if (cnet->users_bridge && (--cnet->users_bridge == 0)) + nf_unregister_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + + module_put(nf_ct_bridge_info->me); + break; } - mutex_unlock(&nf_ct_proto_mutex); } -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_inet_get(struct net *net) { int err; - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + return err; err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: return err; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + switch (nfproto) { + case NFPROTO_INET: + err = nf_ct_netns_inet_get(net); + break; + case NFPROTO_BRIDGE: + err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE); + if (err < 0) + return err; + + err = nf_ct_netns_inet_get(net); + if (err < 0) { + nf_ct_netns_put(net, NFPROTO_BRIDGE); + return err; + } + break; + default: + err = nf_ct_netns_do_get(net, nfproto); + break; + } + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); void nf_ct_netns_put(struct net *net, uint8_t nfproto) { - if (nfproto == NFPROTO_INET) { + switch (nfproto) { + case NFPROTO_BRIDGE: + nf_ct_netns_do_put(net, NFPROTO_BRIDGE); + /* fall through */ + case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else { + break; + default: nf_ct_netns_do_put(net, nfproto); + break; } } EXPORT_SYMBOL_GPL(nf_ct_netns_put); +void nf_ct_bridge_register(struct nf_ct_bridge_info *info) +{ + WARN_ON(nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = info; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_register); + +void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info) +{ + WARN_ON(!nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = NULL; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister); + int nf_conntrack_proto_init(void) { int ret; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 522c08c23600..fce3d93f1541 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -336,7 +336,7 @@ static bool sctp_error(struct sk_buff *skb, if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1e2cc83ff5da..d5fdfa00d683 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dc21a43cd145..3066449f8bd8 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; @@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 948b4ebbe3fb..e3d797252a98 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->dst_port = ctt->dst.u.tcp.port; ft->iifidx = other_dst->dev->ifindex; - ft->oifidx = dst->dev->ifindex; ft->dst_cache = dst; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 3574a212bdc2..bb25d4c794c7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -374,7 +374,7 @@ static int seq_show(struct seq_file *s, void *v) continue; logger = nft_log_dereference(loggers[*pos][i]); - seq_printf(s, "%s", logger->name); + seq_puts(s, logger->name); if (i == 0 && loggers[*pos][i + 1] != NULL) seq_puts(s, ","); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 98bf543e9891..a263505455fc 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -95,7 +95,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct tcphdr *tcph; int oldlen, datalen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && @@ -145,7 +145,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, struct udphdr *udph; int datalen, oldlen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 83a24cc5753b..7ac733ebd060 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -70,7 +70,7 @@ static bool udp_manip_pkt(struct sk_buff *skb, struct udphdr *hdr; bool do_csum; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -88,7 +88,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -114,7 +114,7 @@ sctp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -155,7 +155,7 @@ tcp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); @@ -195,7 +195,7 @@ dccp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); @@ -229,7 +229,7 @@ icmp_manip_pkt(struct sk_buff *skb, { struct icmphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); @@ -247,7 +247,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, { struct icmp6hdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); @@ -275,7 +275,7 @@ gre_manip_pkt(struct sk_buff *skb, /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; @@ -347,7 +347,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, struct iphdr *iph; unsigned int hdroff; - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; @@ -378,7 +378,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, int hdroff; u8 nexthdr; - if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; @@ -562,7 +562,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; @@ -784,7 +784,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 4ffe5e5e65ba..f91579c821e9 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -44,15 +44,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, if (hooknum == NF_INET_LOCAL_OUT) { newdst = htonl(0x7F000001); } else { - struct in_device *indev; - struct in_ifaddr *ifa; + const struct in_device *indev; newdst = 0; indev = __in_dev_get_rcu(skb->dev); - if (indev && indev->ifa_list) { - ifa = indev->ifa_list; - newdst = ifa->ifa_local; + if (indev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(indev->ifa_list); + if (ifa) + newdst = ifa->ifa_local; } if (!newdst) diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 7de28fa0f14a..e338d91980d8 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -282,7 +282,7 @@ next: if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { struct udphdr *uh; - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 2c440015ff0c..a2b58de82600 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -156,7 +156,6 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, unsigned int queuenum) { int status = -ENOENT; @@ -229,12 +228,11 @@ err: /* Packets leaving via this function must come back through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict) + unsigned int index, unsigned int verdict) { int ret; - ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS); + ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -340,7 +338,7 @@ next_hook: local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, &entry->state, hooks, i, verdict); + err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8ce74ed985c0..b101f187eda8 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -10,16 +10,16 @@ #include <net/netns/generic.h> #include <linux/proc_fs.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_tcpudp.h> -#include <linux/netfilter/xt_SYNPROXY.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nf_synproxy.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_synproxy.h> unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -57,7 +57,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { opts->mss = get_unaligned_be16(ptr); - opts->options |= XT_SYNPROXY_OPT_MSS; + opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: @@ -65,19 +65,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); - opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) - opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } @@ -89,36 +89,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, } EXPORT_SYMBOL_GPL(synproxy_parse_options); -unsigned int synproxy_options_size(const struct synproxy_options *opts) +static unsigned int +synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; - if (opts->options & XT_SYNPROXY_OPT_MSS) + if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; - else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) + if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } -EXPORT_SYMBOL_GPL(synproxy_options_size); -void +static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; - if (options & XT_SYNPROXY_OPT_MSS) + if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); - if (options & XT_SYNPROXY_OPT_TIMESTAMP) { - if (options & XT_SYNPROXY_OPT_SACK_PERM) + if (options & NF_SYNPROXY_OPT_TIMESTAMP) { + if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -131,58 +131,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); - } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (options & XT_SYNPROXY_OPT_WSCALE) + if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } -EXPORT_SYMBOL_GPL(synproxy_build_options); -void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_time_stamp_raw() & ~0x3f; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; - if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; - if (opts->options & XT_SYNPROXY_OPT_ECN) + if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); -void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +static void +synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; - opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; - opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; + opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0; } -EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); -unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *th, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_conn_synproxy *synproxy) +static unsigned int +synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, + struct tcphdr *th, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; @@ -193,7 +191,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { @@ -232,7 +230,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, } return 1; } -EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { .len = sizeof(struct nf_conn_synproxy), @@ -413,5 +410,830 @@ static void __exit synproxy_core_exit(void) module_init(synproxy_core_init); module_exit(synproxy_core_exit); +static struct iphdr * +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack); + +static void +synproxy_send_server_syn(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(struct net *net, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} + +bool +synproxy_recv_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); + +unsigned int +ipv4_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); + +static const struct nf_hook_ops ipv4_synproxy_ops[] = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref4++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); + +void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); + +#if IS_ENABLED(CONFIG_IPV6) +static struct ipv6hdr * +synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp_ipv6(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct dst_entry *dst; + struct flowi6 fl6; + int err; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, + flowi6_to_flowi(&fl6)); + err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (err) { + goto free_nskb; + } + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip6_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); + +static void +synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, + IP_CT_NEW, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, + tcp_hdr_size); +} + +static void +synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} + +bool +synproxy_recv_client_ack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); + +unsigned int +ipv6_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0 || nexthdr != IPPROTO_TCP) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack_ipv6(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); + +static const struct nf_hook_ops ipv6_synproxy_ops[] = { + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref6 == 0) { + err = nf_register_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref6++; + return 0; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); + +void +nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref6--; + if (snet->hook_ref6 == 0) + nf_unregister_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); +#endif /* CONFIG_IPV6 */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bcf17fb46d96..d22d00ca78c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1446,25 +1446,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct net *net, - struct nft_base_chain *chain, - struct nft_stats __percpu *newstats) +static void nft_chain_stats_replace(struct nft_trans *trans) { - struct nft_stats __percpu *oldstats; + struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); - if (newstats == NULL) + if (!nft_trans_chain_stats(trans)) return; - if (rcu_access_pointer(chain->stats)) { - oldstats = rcu_dereference_protected(chain->stats, - lockdep_commit_lock_is_held(net)); - rcu_assign_pointer(chain->stats, newstats); - synchronize_rcu(); - free_percpu(oldstats); - } else { - rcu_assign_pointer(chain->stats, newstats); + rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); + + if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); - } } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) @@ -2016,15 +2009,31 @@ EXPORT_SYMBOL_GPL(nft_unregister_expr); static const struct nft_expr_type *__nft_expr_type_get(u8 family, struct nlattr *nla) { - const struct nft_expr_type *type; + const struct nft_expr_type *type, *candidate = NULL; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name) && - (!type->family || type->family == family)) - return type; + if (!nla_strcmp(nla, type->name)) { + if (!type->family && !candidate) + candidate = type; + else if (type->family == family) + candidate = type; + } } - return NULL; + return candidate; +} + +#ifdef CONFIG_MODULES +static int nft_expr_type_request_module(struct net *net, u8 family, + struct nlattr *nla) +{ + nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + if (__nft_expr_type_get(family, nla)) + return -EAGAIN; + + return 0; } +#endif static const struct nft_expr_type *nft_expr_type_get(struct net *net, u8 family, @@ -2042,9 +2051,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); nft_request_module(net, "nft-expr-%.*s", @@ -2137,6 +2144,12 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); +#ifdef CONFIG_MODULES + if (err == -EAGAIN) + nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]); +#endif goto err1; } } else @@ -3877,6 +3890,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, @@ -4330,7 +4344,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, - u64 timeout, gfp_t gfp) + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4345,9 +4359,11 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) - *nft_set_ext_expiration(ext) = - get_jiffies_64() + timeout; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (expiration == 0) + *nft_set_ext_expiration(ext) += timeout; + } if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -4412,6 +4428,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_trans *trans; u32 flags = 0; u64 timeout; + u64 expiration; u8 ulen; int err; @@ -4455,6 +4472,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } + expiration = 0; + if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], + &expiration); + if (err) + return err; + } + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -4537,7 +4564,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, - timeout, GFP_KERNEL); + timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4739,7 +4766,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - GFP_KERNEL); + 0, GFP_KERNEL); if (elem.priv == NULL) goto err2; @@ -6359,9 +6386,9 @@ static void nft_chain_commit_update(struct nft_trans *trans) if (!nft_is_base_chain(trans->ctx.chain)) return; + nft_chain_stats_replace(trans); + basechain = nft_base_chain(trans->ctx.chain); - nft_chain_stats_replace(trans->ctx.net, basechain, - nft_trans_chain_stats(trans)); switch (nft_trans_chain_policy(trans)) { case NF_DROP: @@ -6378,6 +6405,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: + free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index b950cd31348b..96c74c4c7176 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -19,6 +19,7 @@ #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> +#include <net/netfilter/nft_meta.h> static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f42326b40d6f..9f5dea0064ea 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); + const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) @@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, else if (ip->ttl <= f_ttl) return 1; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } - endfor_ifa(in_dev); - return ret; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 89750f74e3a2..b6a7ce622c72 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -859,7 +859,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) } skb_put(e->skb, diff); } - if (!skb_make_writable(e->skb, data_len)) + if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index dfcdea6619f1..827ab6196df9 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -21,6 +21,7 @@ #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_expect.h> struct nft_ct { enum nft_ct_keys key:8; @@ -1153,6 +1154,135 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .owner = THIS_MODULE, }; +struct nft_ct_expect_obj { + u16 l3num; + __be16 dport; + u8 l4proto; + u8 size; + u32 timeout; +}; + +static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (!tb[NFTA_CT_EXPECT_L4PROTO] || + !tb[NFTA_CT_EXPECT_DPORT] || + !tb[NFTA_CT_EXPECT_TIMEOUT] || + !tb[NFTA_CT_EXPECT_SIZE]) + return -EINVAL; + + priv->l3num = ctx->family; + if (tb[NFTA_CT_EXPECT_L3PROTO]) + priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); + priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); + priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_ct_expect_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || + nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || + nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || + nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || + nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) + return -1; + + return 0; +} + +static void nft_ct_expect_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + struct nf_conntrack_expect *exp; + enum ip_conntrack_info ctinfo; + struct nf_conn_help *help; + enum ip_conntrack_dir dir; + u16 l3num = priv->l3num; + struct nf_conn *ct; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct || ctinfo == IP_CT_UNTRACKED) { + regs->verdict.code = NFT_BREAK; + return; + } + dir = CTINFO2DIR(ctinfo); + + help = nfct_help(ct); + if (!help) + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (!help) { + regs->verdict.code = NF_DROP; + return; + } + + if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { + regs->verdict.code = NFT_BREAK; + return; + } + if (l3num == NFPROTO_INET) + l3num = nf_ct_l3num(ct); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + regs->verdict.code = NF_DROP; + return; + } + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + priv->l4proto, NULL, &priv->dport); + exp->timeout.expires = jiffies + priv->timeout * HZ; + + if (nf_ct_expect_related(exp) != 0) + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { + [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, + [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_expect_obj_type; + +static const struct nft_object_ops nft_ct_expect_obj_ops = { + .type = &nft_ct_expect_obj_type, + .size = sizeof(struct nft_ct_expect_obj), + .eval = nft_ct_expect_obj_eval, + .init = nft_ct_expect_obj_init, + .destroy = nft_ct_expect_obj_destroy, + .dump = nft_ct_expect_obj_dump, +}; + +static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_EXPECT, + .ops = &nft_ct_expect_obj_ops, + .maxattr = NFTA_CT_EXPECT_MAX, + .policy = nft_ct_expect_policy, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -1170,17 +1300,23 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; + + err = nft_register_obj(&nft_ct_expect_obj_type); + if (err < 0) + goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) - goto err3; + goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err4: + nft_unregister_obj(&nft_ct_expect_obj_type); +#endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); -#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -1193,6 +1329,7 @@ static void __exit nft_ct_module_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif + nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -1207,3 +1344,4 @@ MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 505bdfc66801..33833a0cb989 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -56,7 +56,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, elem = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], - timeout, GFP_ATOMIC); + timeout, 0, GFP_ATOMIC); if (elem == NULL) goto err1; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a7aa6c5250a4..a5e8469859e3 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,6 +59,103 @@ err: regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. + */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, + unsigned int *offset, int target) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + bool found = false; + __be32 info; + int optlen; + + iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!iph) + return -EBADMSG; + start = sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies + * the options. + */ + if (skb_copy_bits(skb, start, opt->__data, optlen)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (!opt->srr) + break; + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; + if (found) + *offset = opt->srr + start; + break; + case IPOPT_RR: + if (!opt->rr) + break; + *offset = opt->rr + start; + found = true; + break; + case IPOPT_RA: + if (!opt->router_alert) + break; + *offset = opt->router_alert + start; + found = true; + break; + default: + return -EOPNOTSUPP; + } + return found ? target : -ENOENT; +} + +static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + unsigned int offset; + int err; + + if (skb->protocol != htons(ETH_P_IP)) + goto err; + + err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { + goto err; + } + offset += priv->offset; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + goto err; + return; +err: + regs->verdict.code = NFT_BREAK; +} + static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) @@ -153,7 +250,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) return; - if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + if (skb_ensure_writable(pkt->skb, + pkt->xt.thoff + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -311,6 +409,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg, priv->len); } +static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + switch (priv->type) { + case IPOPT_SSRR: + case IPOPT_LSRR: + case IPOPT_RR: + case IPOPT_RA: + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -357,6 +477,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_ipv4_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv4_eval, + .init = nft_exthdr_ipv4_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -397,6 +525,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; + case NFT_EXTHDR_OP_IPV4: + if (ctx->family != NFPROTO_IPV6) { + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv4_ops; + } + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a54329b8634a..417f8d32e9a3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -21,23 +21,12 @@ #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nft_meta.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -#ifdef CONFIG_NF_TABLES_BRIDGE -#include "../bridge/br_private.h" -#endif - void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -47,9 +36,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; -#ifdef CONFIG_NF_TABLES_BRIDGE - const struct net_bridge_port *p; -#endif switch (priv->key) { case NFT_META_LEN: @@ -229,18 +215,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, secpath_exists(skb)); break; #endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -#endif case NFT_META_IIFKIND: if (in == NULL || in->rtnl_link_ops == NULL) goto err; @@ -260,10 +234,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -300,16 +275,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } +EXPORT_SYMBOL_GPL(nft_meta_set_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -360,14 +337,6 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - if (ctx->family != NFPROTO_BRIDGE) - return -EOPNOTSUPP; - len = IFNAMSIZ; - break; -#endif default: return -EOPNOTSUPP; } @@ -376,6 +345,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } +EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -409,9 +379,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -static int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -437,10 +407,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); -static int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -475,9 +446,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_meta_set_init); -static int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -490,8 +462,9 @@ static int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_get_dump); -static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -505,15 +478,17 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); -static void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } +EXPORT_SYMBOL_GPL(nft_meta_set_destroy); static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, @@ -544,6 +519,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); +#ifdef CONFIG_NF_TABLES_BRIDGE + if (ctx->family == NFPROTO_BRIDGE) + return ERR_PTR(-EAGAIN); +#endif if (tb[NFTA_META_DREG]) return &nft_meta_get_ops; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 680bd9f38a81..1260f78a034d 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -240,7 +240,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, tsum)); } - if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) || skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -256,7 +256,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return -1; nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -309,7 +309,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, goto err; } - if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + if (skb_ensure_writable(skb, max(offset + priv->len, 0)) || skb_store_bits(skb, offset, src, priv->len) < 0) goto err; diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c new file mode 100644 index 000000000000..80060ade8a5b --- /dev/null +++ b/net/netfilter/nft_synproxy.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/netlink.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_synproxy.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_synproxy.h> + +struct nft_synproxy { + struct nf_synproxy_info info; +}; + +static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { + [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, + [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, + [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, +}; + +static void nft_synproxy_tcp_options(struct synproxy_options *opts, + const struct tcphdr *tcp, + struct synproxy_net *snet, + struct nf_synproxy_info *info, + struct nft_synproxy *priv) +{ + this_cpu_inc(snet->stats->syn_received); + if (tcp->ece && tcp->cwr) + opts->options |= NF_SYNPROXY_OPT_ECN; + + opts->options &= priv->info.options; + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, opts); + else + opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM | + NF_SYNPROXY_OPT_ECN); +} + +static void nft_synproxy_eval_v4(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} + +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +static void nft_synproxy_eval_v6(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack_ipv6(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} +#endif /* CONFIG_NF_TABLES_IPV6*/ + +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct synproxy_options opts = {}; + struct sk_buff *skb = pkt->skb; + int thoff = pkt->xt.thoff; + const struct tcphdr *tcp; + struct tcphdr _tcph; + + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + + if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { + regs->verdict.code = NF_DROP; + return; + } + + tcp = skb_header_pointer(skb, pkt->xt.thoff, + sizeof(struct tcphdr), + &_tcph); + if (!tcp) { + regs->verdict.code = NF_DROP; + return; + } + + if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { + regs->verdict.code = NF_DROP; + return; + } + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case htons(ETH_P_IPV6): + nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#endif + } + regs->verdict.code = NFT_BREAK; +} + +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + struct nft_synproxy *priv = nft_expr_priv(expr); + u32 flags; + int err; + + if (tb[NFTA_SYNPROXY_MSS]) + priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); + if (tb[NFTA_SYNPROXY_WSCALE]) + priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); + if (tb[NFTA_SYNPROXY_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); + if (flags & ~NF_SYNPROXY_OPT_MASK) + return -EOPNOTSUPP; + priv->info.options = flags; + } + + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err) + return err; + + switch (ctx->family) { + case NFPROTO_IPV4: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; + } + + return 0; + +nf_ct_failure: + nf_ct_netns_put(ctx->net, ctx->family); + return err; +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + + switch (ctx->family) { + case NFPROTO_IPV4: + nf_synproxy_ipv4_fini(snet, ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; + } + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || + nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || + nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); +} + +static struct nft_expr_type nft_synproxy_type; +static const struct nft_expr_ops nft_synproxy_ops = { + .eval = nft_synproxy_eval, + .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), + .init = nft_synproxy_init, + .destroy = nft_synproxy_destroy, + .dump = nft_synproxy_dump, + .type = &nft_synproxy_type, + .validate = nft_synproxy_validate, +}; + +static struct nft_expr_type nft_synproxy_type __read_mostly = { + .ops = &nft_synproxy_ops, + .name = "synproxy", + .owner = THIS_MODULE, + .policy = nft_synproxy_policy, + .maxattr = NFTA_SYNPROXY_MAX, +}; + +static int __init nft_synproxy_module_init(void) +{ + return nft_register_expr(&nft_synproxy_type); +} + +static void __exit nft_synproxy_module_exit(void) +{ + return nft_unregister_expr(&nft_synproxy_type); +} + +module_init(nft_synproxy_module_init); +module_exit(nft_synproxy_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); +MODULE_ALIAS_NFT_EXPR("synproxy"); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index b1054a3d18c5..eababc354ff1 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -31,7 +31,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), @@ -49,7 +49,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), @@ -79,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 8221a5ce44bf..7873b834c300 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -29,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ipt_TTL_info *info = par->targinfo; int new_ttl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); @@ -69,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) const struct ip6t_HL_info *info = par->targinfo; int new_hl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 0b3a1b291c91..122db9fbb9f4 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (par->fragoff != 0) return 0; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 666f4ca9b15f..30e99464171b 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -28,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, - unsigned int tcphoff, unsigned int minlen) + unsigned int tcphoff) { const struct xt_tcpoptstrip_target_info *info = par->targinfo; + struct tcphdr *tcph, _th; unsigned int optl, i, j; - struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len, tcp_hdrlen; + int tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return XT_CONTINUE; - if (!skb_make_writable(skb, skb->len)) + tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th); + if (!tcph) return NF_DROP; - len = skb->len - tcphoff; - if (len < (int)sizeof(struct tcphdr)) - return NF_DROP; - - tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; + if (tcp_hdrlen < sizeof(struct tcphdr)) + return NF_DROP; - if (len < tcp_hdrlen) + if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen)) return NF_DROP; - opt = (u_int8_t *)tcph; + /* must reload tcph, might have been moved */ + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u8 *)tcph; /* * Walk through all TCP options - if we find some option to remove, @@ -88,8 +88,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), - sizeof(struct iphdr) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } #if IS_ENABLED(CONFIG_IP6_NF_MANGLE) @@ -106,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par, tcphoff, - sizeof(*ipv6h) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, tcphoff); } #endif diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index 140ce6be639a..0c9e014e30b4 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -2,7 +2,7 @@ /* * xt_iprange - Netfilter module to match IP address ranges * - * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) CC Computer Consultants GmbH, 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -130,7 +130,7 @@ static void __exit iprange_mt_exit(void) module_init(iprange_mt_init); module_exit(iprange_mt_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); MODULE_ALIAS("ipt_iprange"); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 95f64a99e425..e85ce69924ae 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -22,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par) struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; + if (info->match & ~XT_OWNER_MASK) + return -EINVAL; + /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. */ @@ -88,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index f099228cb9c4..ecbfa291fb70 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -2,7 +2,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module which implements the set match and SET target @@ -18,7 +18,7 @@ #include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("Xtables: IP set match and target module"); MODULE_ALIAS("xt_SET"); MODULE_ALIAS("ipt_set"); @@ -436,6 +436,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; ip_set_id_t index; + int ret = 0; if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, @@ -453,17 +454,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find del_set index %u as target\n", info->del_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_add; } } if (info->map_set.index != IPSET_INVALID_ID) { if (strncmp(par->table, "mangle", 7)) { pr_info_ratelimited("--map-set only usable from mangle table\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && @@ -471,20 +471,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find map_set index %u as target\n", info->map_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->del_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_del; } } @@ -492,16 +488,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { pr_info_ratelimited("SET target dimension over the limit!\n"); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->del_set.index); - if (info->map_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->map_set.index); - return -ERANGE; + ret = -ERANGE; + goto cleanup_mark; } return 0; +cleanup_mark: + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +cleanup_del: + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +cleanup_add: + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return ret; } static void diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e9ddfd782d16..90b2ab9dd449 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -241,13 +241,8 @@ static __net_init int netlink_tap_init_net(struct net *net) return 0; } -static void __net_exit netlink_tap_exit_net(struct net *net) -{ -} - static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, - .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; @@ -2544,12 +2539,10 @@ struct nl_seq_iter { int link; }; -static int netlink_walk_start(struct nl_seq_iter *iter) +static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); - - return 0; } static void netlink_walk_stop(struct nl_seq_iter *iter) @@ -2565,8 +2558,6 @@ static void *__netlink_seq_next(struct seq_file *seq) do { for (;;) { - int err; - nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { @@ -2583,9 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq) if (++iter->link >= MAX_LINKS) return NULL; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); @@ -2597,13 +2586,10 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; - int err; iter->link = 0; - err = netlink_walk_start(iter); - if (err) - return ERR_PTR(err); + netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6747bc57b6fa..33b388103741 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1334,7 +1334,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { - if (likely(!IS_ERR(reply))) { + if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 53cf07d141b4..7af0cde8b293 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -48,7 +48,7 @@ void ovs_dp_notify_wq(struct work_struct *work) if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; - if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 52a1ed9633ec..57d6436e6f6a 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -156,7 +156,7 @@ void ovs_netdev_detach_dev(struct vport *vport) static void netdev_destroy(struct vport *vport) { rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); @@ -166,7 +166,7 @@ static void netdev_destroy(struct vport *vport) void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and @@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { - if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f927de9bda0a..3fc38d16c456 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -248,8 +248,6 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) */ void ovs_vport_del(struct vport *vport) { - ASSERT_OVSL(); - hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5f78df080573..8d54f3047768 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) smp_wmb(); } -static int __packet_get_status(struct packet_sock *po, void *frame) +static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; @@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, return ts_status; } -static void *packet_lookup_frame(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int position, - int status) +static void *packet_lookup_frame(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int position, + int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; @@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; - if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -1003,7 +1003,6 @@ static void prb_fill_curr_block(char *curr, /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, - int status, unsigned int len ) { @@ -1075,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po, po->rx_ring.head, status); return curr; case TPACKET_V3: - return __packet_lookup_frame_in_block(po, skb, status, len); + return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); @@ -1083,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po, } } -static void *prb_lookup_block(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int idx, - int status) +static void *prb_lookup_block(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int idx, + int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); @@ -1199,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po) #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 -static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.frame_max + 1; - idx = po->rx_ring.head; + len = READ_ONCE(po->rx_ring.frame_max) + 1; + idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1212,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off) return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.prb_bdqc.knum_blocks; - idx = po->rx_ring.prb_bdqc.kactive_blk_num; + len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); + idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) @@ -1225,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(const struct packet_sock *po, + const struct sk_buff *skb) { - struct sock *sk = &po->sk; + const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { - int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - (skb ? skb->truesize : 0); - if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + + if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; @@ -1258,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int ret; - bool has_room; + int pressure, ret; - spin_lock_bh(&po->sk.sk_receive_queue.lock); ret = __packet_rcv_has_room(po, skb); - has_room = ret == ROOM_NORMAL; - if (po->pressure == has_room) - po->pressure = !has_room; - spin_unlock_bh(&po->sk.sk_receive_queue.lock); + pressure = ret != ROOM_NORMAL; + + if (READ_ONCE(po->pressure) != pressure) + WRITE_ONCE(po->pressure, pressure); return ret; } +static void packet_rcv_try_clear_pressure(struct packet_sock *po) +{ + if (READ_ONCE(po->pressure) && + __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); +} + static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); @@ -1351,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(f->arr[i]); - if (po_next != po_skip && !po_next->pressure && + if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -2126,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2193,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!res) goto drop_n_restore; + /* If we are flooded, just give up */ + if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { + atomic_inc(&po->tp_drops); + goto drop_n_restore; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && @@ -2263,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } @@ -2379,9 +2390,9 @@ drop: return 0; drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -3318,8 +3329,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; - if (pkt_sk(sk)->pressure) - packet_rcv_has_room(pkt_sk(sk), NULL); + packet_rcv_try_clear_pressure(pkt_sk(sk)); if (pkt_sk(sk)->has_vnet_hdr) { err = packet_rcv_vnet(msg, skb, &len); @@ -3891,6 +3901,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3907,14 +3918,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; } @@ -4133,8 +4147,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - po->pressure = 0; + packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { diff --git a/net/packet/internal.h b/net/packet/internal.h index c70a2794456f..82fb2b10f790 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -132,6 +132,7 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; + atomic_t tp_drops ____cacheline_aligned_in_smp; }; static struct packet_sock *pkt_sk(struct sock *sk) diff --git a/net/rds/ib.c b/net/rds/ib.c index b8d581b779b2..ec05d91aa9a2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); + iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } @@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); + iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a0b6abfbd277..948e3fe249ec 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -519,6 +519,9 @@ send_fragmentable: } break; #endif + + default: + BUG(); } if (ret < 0) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2c72d95c3050..360fdd3eaa77 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -877,6 +877,23 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_CTINFO + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help + Say Y here to allow transfer of a connmark stored information. + Current actions transfer connmark stored DSCP into + ipv4/v6 diffserv and/or to transfer connmark to packet + mark. Both are useful for restoring egress based marks + back onto ingress connections for qdisc priority mapping + purposes. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ctinfo. + config NET_ACT_SKBMOD tristate "skb data modification action" depends on NET_CLS_ACT @@ -924,14 +941,6 @@ config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE -config NET_CLS_IND - bool "Incoming device classification" - depends on NET_CLS_U32 || NET_CLS_FW - ---help--- - Say Y here to extend the u32 and fw classifier to support - classification based on the incoming device. This option is - likely to disappear in favour of the metadata ematch. - endif # NET_SCHED config NET_SCH_FIFO diff --git a/net/sched/Makefile b/net/sched/Makefile index 8a40431d7b5c..d54bfcbd7981 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c new file mode 100644 index 000000000000..10eb2bb99861 --- /dev/null +++ b/net/sched/act_ctinfo.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions + * + * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/pkt_cls.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> +#include <uapi/linux/tc_act/tc_ctinfo.h> +#include <net/tc_act/tc_ctinfo.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> + +static struct tc_action_ops act_ctinfo_ops; +static unsigned int ctinfo_net_id; + +static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb, int wlen, int proto) +{ + u8 dscp, newdscp; + + newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + ~INET_ECN_MASK; + + switch (proto) { + case NFPROTO_IPV4: + dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv4_change_dsfield(ip_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + case NFPROTO_IPV6: + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv6_change_dsfield(ipv6_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + default: + break; + } +} + +static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb) +{ + ca->stats_cpmark_set++; + skb->mark = ct->mark & cp->cpmarkmask; +} + +static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + const struct nf_conntrack_tuple_hash *thash = NULL; + struct tcf_ctinfo *ca = to_ctinfo(a); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; + enum ip_conntrack_info ctinfo; + struct tcf_ctinfo_params *cp; + struct nf_conn *ct; + int proto, wlen; + int action; + + cp = rcu_dereference_bh(ca->params); + + tcf_lastuse_update(&ca->tcf_tm); + bstats_update(&ca->tcf_bstats, skb); + action = READ_ONCE(ca->tcf_action); + + wlen = skb_network_offset(skb); + if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV4; + } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV6; + } else { + goto out; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { /* look harder, usually ingress */ + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + proto, cp->net, &tuple)) + goto out; + zone.id = cp->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(cp->net, &zone, &tuple); + if (!thash) + goto out; + + ct = nf_ct_tuplehash_to_ctrack(thash); + } + + if (cp->mode & CTINFO_MODE_DSCP) + if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); + + if (cp->mode & CTINFO_MODE_CPMARK) + tcf_ctinfo_cpmark_set(ct, ca, cp, skb); + + if (thash) + nf_ct_put(ct); +out: + return action; +} + +static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { + [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct + tc_ctinfo) }, + [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, + [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, +}; + +static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct nlattr *tb[TCA_CTINFO_MAX + 1]; + struct tcf_ctinfo_params *cp_new; + struct tcf_chain *goto_ch = NULL; + u32 dscpmask = 0, dscpstatemask; + struct tc_ctinfo *actparm; + struct tcf_ctinfo *ci; + u8 dscpmaskshift; + int ret = 0, err; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_CTINFO_ACT]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing required TCA_CTINFO_ACT attribute"); + return -EINVAL; + } + actparm = nla_data(tb[TCA_CTINFO_ACT]); + + /* do some basic validation here before dynamically allocating things */ + /* that we would otherwise have to clean up. */ + if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { + dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); + /* need contiguous 6 bit mask */ + dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; + if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_MASK], + "dscp mask must be 6 contiguous bits"); + return -EINVAL; + } + dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? + nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + /* mask & statemask must not overlap */ + if (dscpmask & dscpstatemask) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], + "dscp statemask must not overlap dscp mask"); + return -EINVAL; + } + } + + /* done the validation:now to the actual action allocation */ + err = tcf_idr_check_alloc(tn, &actparm->index, a, bind); + if (!err) { + ret = tcf_idr_create(tn, actparm->index, est, a, + &act_ctinfo_ops, bind, false); + if (ret) { + tcf_idr_cleanup(tn, actparm->index); + return ret; + } + ret = ACT_P_CREATED; + } else if (err > 0) { + if (bind) /* don't override defaults */ + return 0; + if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } else { + return err; + } + + err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + ci = to_ctinfo(*a); + + cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); + if (unlikely(!cp_new)) { + err = -ENOMEM; + goto put_chain; + } + + cp_new->net = net; + cp_new->zone = tb[TCA_CTINFO_ZONE] ? + nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + if (dscpmask) { + cp_new->dscpmask = dscpmask; + cp_new->dscpmaskshift = dscpmaskshift; + cp_new->dscpstatemask = dscpstatemask; + cp_new->mode |= CTINFO_MODE_DSCP; + } + + if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { + cp_new->cpmarkmask = + nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); + cp_new->mode |= CTINFO_MODE_CPMARK; + } + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); + rcu_swap_protected(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (cp_new) + kfree_rcu(cp_new, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tc_ctinfo opt = { + .index = ci->tcf_index, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, + }; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ctinfo_params *cp; + struct tcf_t t; + + spin_lock_bh(&ci->tcf_lock); + cp = rcu_dereference_protected(ci->params, + lockdep_is_held(&ci->tcf_lock)); + + tcf_tm_dump(&t, &ci->tcf_tm); + if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) + goto nla_put_failure; + + opt.action = ci->tcf_action; + if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) + goto nla_put_failure; + + if (cp->mode & CTINFO_MODE_DSCP) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, + cp->dscpmask)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, + cp->dscpstatemask)) + goto nla_put_failure; + } + + if (cp->mode & CTINFO_MODE_CPMARK) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, + cp->cpmarkmask)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, + ci->stats_dscp_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, + ci->stats_dscp_error, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, + ci->stats_cpmark_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + spin_unlock_bh(&ci->tcf_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(&ci->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_ctinfo_ops = { + .kind = "ctinfo", + .id = TCA_ID_CTINFO, + .owner = THIS_MODULE, + .act = tcf_ctinfo_act, + .dump = tcf_ctinfo_dump, + .init = tcf_ctinfo_init, + .walk = tcf_ctinfo_walker, + .lookup = tcf_ctinfo_search, + .size = sizeof(struct tcf_ctinfo), +}; + +static __net_init int ctinfo_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tc_action_net_init(tn, &act_ctinfo_ops); +} + +static void __net_exit ctinfo_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, ctinfo_net_id); +} + +static struct pernet_operations ctinfo_net_ops = { + .init = ctinfo_init_net, + .exit_batch = ctinfo_exit_net, + .id = &ctinfo_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ctinfo_init_module(void) +{ + return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +static void __exit ctinfo_cleanup_module(void) +{ + tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +module_init(ctinfo_init_module); +module_exit(ctinfo_cleanup_module); +MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); +MODULE_DESCRIPTION("Connection tracking mark actions"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 58e7573dded4..055faa298c8e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -27,6 +27,9 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); +#define MIRRED_RECURSION_LIMIT 4 +static DEFINE_PER_CPU(unsigned int, mirred_rec_level); + static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; @@ -210,6 +213,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; + unsigned int rec_level; int retval, err = 0; bool use_reinsert; bool want_ingress; @@ -217,6 +221,14 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, int m_eaction; int mac_len; + rec_level = __this_cpu_inc_return(mirred_rec_level); + if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { + net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", + netdev_name(skb->dev)); + __this_cpu_dec(mirred_rec_level); + return TC_ACT_SHOT; + } + tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -277,7 +289,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (use_reinsert) { res->ingress = want_ingress; res->qstats = this_cpu_ptr(m->common.cpu_qstats); - return TC_ACT_REINSERT; + skb_tc_reinsert(skb, res); + __this_cpu_dec(mirred_rec_level); + return TC_ACT_CONSUMED; } } @@ -292,6 +306,7 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } + __this_cpu_dec(mirred_rec_level); return retval; } @@ -411,6 +426,11 @@ static void tcf_mirred_put_dev(struct net_device *dev) dev_put(dev); } +static size_t tcf_mirred_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_mirred)); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .id = TCA_ID_MIRRED, @@ -422,6 +442,7 @@ static struct tc_action_ops act_mirred_ops = { .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, + .get_fill_size = tcf_mirred_get_fill_size, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, .put_dev = tcf_mirred_put_dev, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fdeede3af72e..5d4935b51e6f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -27,7 +27,7 @@ #include <net/dst_metadata.h> struct fl_flow_key { - int indev_ifindex; + struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; @@ -284,7 +284,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, list_for_each_entry_rcu(mask, &head->masks, list) { fl_clear_masked_range(&skb_key, mask); - skb_key.indev_ifindex = skb->skb_iif; + skb_flow_dissect_meta(skb, &mask->dissector, &skb_key); /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. */ @@ -1003,15 +1003,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, { __be16 ethertype; int ret = 0; -#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) return err; - key->indev_ifindex = err; - mask->indev_ifindex = 0xffffffff; + key->meta.ingress_ifindex = err; + mask->meta.ingress_ifindex = 0xffffffff; } -#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, @@ -1264,6 +1263,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_META, meta); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -2110,10 +2111,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb, static int fl_dump_key(struct sk_buff *skb, struct net *net, struct fl_flow_key *key, struct fl_flow_key *mask) { - if (mask->indev_ifindex) { + if (mask->meta.ingress_ifindex) { struct net_device *dev; - dev = __dev_get_by_index(net, key->indev_ifindex); + dev = __dev_get_by_index(net, key->meta.ingress_ifindex); if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) goto nla_put_failure; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4dab833f66cb..c9496c920d6f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -8,9 +8,6 @@ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension - * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available */ #include <linux/module.h> @@ -37,9 +34,7 @@ struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; @@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, f->ifindex)) continue; -#endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; @@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, return ret; f->ifindex = ret; } -#endif /* CONFIG_NET_CLS_IND */ err = -EINVAL; if (tb[TCA_FW_MASK]) { @@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, fnew->id = f->id; fnew->res = f->res; -#ifdef CONFIG_NET_CLS_IND fnew->ifindex = f->ifindex; -#endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, @@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; } -#endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 38c0a9f0f296..a30d2f8feb32 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { unsigned int in_hw_count; struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; + bool deleting; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -258,7 +259,11 @@ err_exts_init: static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { - return -EOPNOTSUPP; + struct cls_mall_head *head = rtnl_dereference(tp->root); + + head->deleting = true; + *last = true; + return 0; } static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, @@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg, if (arg->count < arg->skip) goto skip; - if (!head) + if (!head || head->deleting) return; if (arg->fn(tp, head, arg) < 0) arg->stop = 1; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7727de5e073..be9e46c77e8b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -20,9 +20,6 @@ * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available - * * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ @@ -48,9 +45,7 @@ struct tc_u_knode { u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; @@ -176,12 +171,10 @@ check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } -#endif #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif @@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &n->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); @@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; n->ifindex = ret; } -#endif return 0; } @@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); -#ifdef CONFIG_NET_CLS_IND new->ifindex = n->ifindex; -#endif new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; @@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } -#endif #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(sizeof(struct tc_u32_pcnt) + n->sel.nkeys * sizeof(u64), @@ -1422,9 +1409,7 @@ static int __init init_u32(void) #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif -#ifdef CONFIG_NET_CLS_IND pr_info(" input device check on\n"); -#endif #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 243fd22f2248..9fff6480acc6 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -21,6 +21,7 @@ struct em_ipt_match { const struct xt_match *match; u32 hook; + u8 nfproto; u8 match_data[0] __aligned(8); }; @@ -71,11 +72,25 @@ static int policy_validate_match_data(struct nlattr **tb, u8 mrev) return 0; } +static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev) +{ + if (mrev != 1) { + pr_err("only addrtype match revision 1 supported"); + return -EINVAL; + } + + return 0; +} + static const struct em_ipt_xt_match em_ipt_xt_matches[] = { { .match_name = "policy", .validate_match_data = policy_validate_match_data }, + { + .match_name = "addrtype", + .validate_match_data = addrtype_validate_match_data + }, {} }; @@ -115,6 +130,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len, struct em_ipt_match *im = NULL; struct xt_match *match; int mdata_len, ret; + u8 nfproto; ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, NULL); @@ -125,6 +141,15 @@ static int em_ipt_change(struct net *net, void *data, int data_len, !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO]) return -EINVAL; + nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]); + switch (nfproto) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + break; + default: + return -EINVAL; + } + match = get_xt_match(tb); if (IS_ERR(match)) { pr_err("unable to load match\n"); @@ -140,6 +165,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len, im->match = match; im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]); + im->nfproto = nfproto; nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len); ret = check_match(net, im, mdata_len); @@ -182,15 +208,33 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, const struct em_ipt_match *im = (const void *)em->data; struct xt_action_param acpar = {}; struct net_device *indev = NULL; + u8 nfproto = im->match->family; struct nf_hook_state state; int ret; + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + if (nfproto == NFPROTO_UNSPEC) + nfproto = NFPROTO_IPV4; + break; + case htons(ETH_P_IPV6): + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + if (nfproto == NFPROTO_UNSPEC) + nfproto = NFPROTO_IPV6; + break; + default: + return 0; + } + rcu_read_lock(); if (skb->skb_iif) indev = dev_get_by_index_rcu(em->net, skb->skb_iif); - nf_hook_state_init(&state, im->hook, im->match->family, + nf_hook_state_init(&state, im->hook, nfproto, indev ?: skb->dev, skb->dev, NULL, em->net, NULL); acpar.match = im->match; @@ -213,7 +257,7 @@ static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em) return -EMSGSIZE; if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0) return -EMSGSIZE; - if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0) + if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0) return -EMSGSIZE; if (nla_put(skb, TCA_EM_IPT_MATCH_DATA, im->match->usersize ?: im->match->matchsize, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index db0c2ba1d156..cebfb65d8556 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -22,10 +22,12 @@ #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) +#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; + bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ @@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) struct sock *sk = nskb->sk; ktime_t now; + if (q->skip_sock_check) + goto skip; + if (!sk) return false; @@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; +skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; @@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: @@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; + if (q->skip_sock_check) + opt.flags |= TC_ETF_SKIP_SOCK_CHECK; + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 0f65f617756b..599730f804d7 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -114,6 +114,7 @@ nla_put_failure: } static const struct Qdisc_class_ops ingress_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = ingress_find, .walk = ingress_walk, @@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch) } static const struct Qdisc_class_ops clsact_class_ops = { + .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = clsact_find, .walk = ingress_walk, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ecfb8f5902a..388750ddc57a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -21,12 +21,17 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> +#include <net/sock.h> +#include <net/tcp.h> static LIST_HEAD(taprio_list); static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 +#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) +#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) + struct sched_entry { struct list_head list; @@ -35,6 +40,7 @@ struct sched_entry { * packet leaves after this time. */ ktime_t close_time; + ktime_t next_txtime; atomic_t budget; int index; u32 gate_mask; @@ -55,6 +61,8 @@ struct sched_gate_list { struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; + u32 flags; + enum tk_offsets tk_offset; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte @@ -65,9 +73,9 @@ struct taprio_sched { struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; - ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; + int txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -78,6 +86,20 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } +static ktime_t taprio_get_time(struct taprio_sched *q) +{ + ktime_t mono = ktime_get(); + + switch (q->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, q->tk_offset); + } + + return KTIME_MAX; +} + static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); @@ -108,20 +130,263 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } -static ktime_t get_cycle_time(struct sched_gate_list *sched) +/* Get how much time has been already elapsed in the current cycle. */ +static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) +{ + ktime_t time_since_sched_start; + s32 time_elapsed; + + time_since_sched_start = ktime_sub(time, sched->base_time); + div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); + + return time_elapsed; +} + +static ktime_t get_interval_end_time(struct sched_gate_list *sched, + struct sched_gate_list *admin, + struct sched_entry *entry, + ktime_t intv_start) +{ + s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); + ktime_t intv_end, cycle_ext_end, cycle_end; + + cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); + intv_end = ktime_add_ns(intv_start, entry->interval); + cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); + + if (ktime_before(intv_end, cycle_end)) + return intv_end; + else if (admin && admin != sched && + ktime_after(admin->base_time, cycle_end) && + ktime_before(admin->base_time, cycle_ext_end)) + return admin->base_time; + else + return cycle_end; +} + +static int length_to_duration(struct taprio_sched *q, int len) +{ + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +/* Returns the entry corresponding to next available interval. If + * validate_interval is set, it only validates whether the timestamp occurs + * when the gate corresponding to the skb's traffic class is open. + */ +static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, + struct Qdisc *sch, + struct sched_gate_list *sched, + struct sched_gate_list *admin, + ktime_t time, + ktime_t *interval_start, + ktime_t *interval_end, + bool validate_interval) +{ + ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; + ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; + struct sched_entry *entry = NULL, *entry_found = NULL; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool entry_available = false; + s32 cycle_elapsed; + int tc, n; + + tc = netdev_get_prio_tc_map(dev, skb->priority); + packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); + + *interval_start = 0; + *interval_end = 0; + + if (!sched) + return NULL; + + cycle = sched->cycle_time; + cycle_elapsed = get_cycle_time_elapsed(sched, time); + curr_intv_end = ktime_sub_ns(time, cycle_elapsed); + cycle_end = ktime_add_ns(curr_intv_end, cycle); + + list_for_each_entry(entry, &sched->entries, list) { + curr_intv_start = curr_intv_end; + curr_intv_end = get_interval_end_time(sched, admin, entry, + curr_intv_start); + + if (ktime_after(curr_intv_start, cycle_end)) + break; + + if (!(entry->gate_mask & BIT(tc)) || + packet_transmit_time > entry->interval) + continue; + + txtime = entry->next_txtime; + + if (ktime_before(txtime, time) || validate_interval) { + transmit_end_time = ktime_add_ns(time, packet_transmit_time); + if ((ktime_before(curr_intv_start, time) && + ktime_before(transmit_end_time, curr_intv_end)) || + (ktime_after(curr_intv_start, time) && !validate_interval)) { + entry_found = entry; + *interval_start = curr_intv_start; + *interval_end = curr_intv_end; + break; + } else if (!entry_available && !validate_interval) { + /* Here, we are just trying to find out the + * first available interval in the next cycle. + */ + entry_available = 1; + entry_found = entry; + *interval_start = ktime_add_ns(curr_intv_start, cycle); + *interval_end = ktime_add_ns(curr_intv_end, cycle); + } + } else if (ktime_before(txtime, earliest_txtime) && + !entry_available) { + earliest_txtime = txtime; + entry_found = entry; + n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); + *interval_start = ktime_add(curr_intv_start, n * cycle); + *interval_end = ktime_add(curr_intv_end, n * cycle); + } + } + + return entry_found; +} + +static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t interval_start, interval_end; struct sched_entry *entry; - ktime_t cycle = 0; - if (sched->cycle_time != 0) - return sched->cycle_time; + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + + entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, + &interval_start, &interval_end, true); + rcu_read_unlock(); - list_for_each_entry(entry, &sched->entries, list) - cycle = ktime_add_ns(cycle, entry->interval); + return entry; +} - sched->cycle_time = cycle; +/* This returns the tstamp value set by TCP in terms of the set clock. */ +static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; - return cycle; + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + if (!ipv6h) + return 0; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return 0; + } else if (iph->protocol != IPPROTO_TCP) { + return 0; + } + } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { + return 0; + } + + return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); +} + +/* There are a few scenarios where we will have to modify the txtime from + * what is read from next_txtime in sched_entry. They are: + * 1. If txtime is in the past, + * a. The gate for the traffic class is currently open and packet can be + * transmitted before it closes, schedule the packet right away. + * b. If the gate corresponding to the traffic class is going to open later + * in the cycle, set the txtime of packet to the interval start. + * 2. If txtime is in the future, there are packets corresponding to the + * current traffic class waiting to be transmitted. So, the following + * possibilities exist: + * a. We can transmit the packet before the window containing the txtime + * closes. + * b. The window might close before the transmission can be completed + * successfully. So, schedule the packet in the next open window. + */ +static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) +{ + ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t minimum_time, now, txtime; + int len, packet_transmit_time; + struct sched_entry *entry; + bool sched_changed; + + now = taprio_get_time(q); + minimum_time = ktime_add_ns(now, q->txtime_delay); + + tcp_tstamp = get_tcp_tstamp(q, skb); + minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); + + rcu_read_lock(); + admin = rcu_dereference(q->admin_sched); + sched = rcu_dereference(q->oper_sched); + if (admin && ktime_after(minimum_time, admin->base_time)) + switch_schedules(q, &admin, &sched); + + /* Until the schedule starts, all the queues are open */ + if (!sched || ktime_before(minimum_time, sched->base_time)) { + txtime = minimum_time; + goto done; + } + + len = qdisc_pkt_len(skb); + packet_transmit_time = length_to_duration(q, len); + + do { + sched_changed = 0; + + entry = find_entry_to_transmit(skb, sch, sched, admin, + minimum_time, + &interval_start, &interval_end, + false); + if (!entry) { + txtime = 0; + goto done; + } + + txtime = entry->next_txtime; + txtime = max_t(ktime_t, txtime, minimum_time); + txtime = max_t(ktime_t, txtime, interval_start); + + if (admin && admin != sched && + ktime_after(txtime, admin->base_time)) { + sched = admin; + sched_changed = 1; + continue; + } + + transmit_end_time = ktime_add(txtime, packet_transmit_time); + minimum_time = transmit_end_time; + + /* Update the txtime of current entry to the next time it's + * interval starts. + */ + if (ktime_after(transmit_end_time, interval_end)) + entry->next_txtime = ktime_add(interval_start, sched->cycle_time); + } while (sched_changed || ktime_after(transmit_end_time, interval_end)); + + entry->next_txtime = transmit_end_time; + +done: + rcu_read_unlock(); + return txtime; } static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -137,6 +402,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); + if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { + if (!is_valid_interval(skb, sch)) + return qdisc_drop(skb, sch, to_free); + } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb->tstamp = get_packet_txtime(skb, sch); + if (!skb->tstamp) + return qdisc_drop(skb, sch, to_free); + } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -172,6 +446,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) if (!skb) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + return skb; + prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); @@ -184,11 +461,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } -static inline int length_to_duration(struct taprio_sched *q, int len) -{ - return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -232,6 +504,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!child)) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb = child->ops->dequeue(child); + if (!skb) + continue; + goto skb_found; + } + skb = child->ops->peek(child); if (!skb) continue; @@ -243,7 +522,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) continue; len = qdisc_pkt_len(skb); - guard = ktime_add_ns(q->get_time(), + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no @@ -262,6 +541,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto done; +skb_found: qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -524,12 +804,22 @@ static int parse_taprio_schedule(struct nlattr **tb, if (err < 0) return err; + if (!new->cycle_time) { + struct sched_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &new->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + new->cycle_time = cycle; + } + return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + u32 taprio_flags) { int i, j; @@ -577,6 +867,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return -EINVAL; } + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) + continue; + /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (last > qopt->offset[j]) { @@ -598,14 +891,14 @@ static int taprio_get_start_time(struct Qdisc *sch, s64 n; base = sched_base_time(sched); - now = q->get_time(); + now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, @@ -632,7 +925,7 @@ static void setup_first_close_time(struct taprio_sched *q, first = list_first_entry(&sched->entries, struct sched_entry, list); - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_close_time = ktime_add_ns(base, cycle); @@ -707,6 +1000,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; } +static void setup_txtime(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) +{ + struct sched_entry *entry; + u32 interval = 0; + + list_for_each_entry(entry, &sched->entries, list) { + entry->next_txtime = ktime_add_ns(base, interval); + interval += entry->interval; + } +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -715,6 +1020,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; + u32 taprio_flags = 0; int i, err, clockid; unsigned long flags; ktime_t start; @@ -727,7 +1033,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (tb[TCA_TAPRIO_ATTR_FLAGS]) { + taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); + + if (q->flags != 0 && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } else if (!FLAGS_VALID(taprio_flags)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + q->flags = taprio_flags; + } + + err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); if (err < 0) return err; @@ -786,7 +1106,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); - if (!hrtimer_active(&q->advance_timer)) { + if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { + if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { + NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); + err = -EINVAL; + goto unlock; + } + + q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + } + + if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } @@ -806,16 +1137,16 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, switch (q->clockid) { case CLOCK_REALTIME: - q->get_time = ktime_get_real; + q->tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->get_time = ktime_get; + q->tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->get_time = ktime_get_boottime; + q->tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->get_time = ktime_get_clocktai; + q->tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); @@ -829,20 +1160,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - setup_first_close_time(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { + setup_txtime(q, new_admin, start); - /* Protects against advance_sched() */ - spin_lock_irqsave(&q->current_entry_lock, flags); + if (!oper) { + rcu_assign_pointer(q->oper_sched, new_admin); + err = 0; + new_admin = NULL; + goto unlock; + } - taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + } else { + setup_first_close_time(q, new_admin, start); - rcu_assign_pointer(q->admin_sched, new_admin); - if (admin) - call_rcu(&admin->rcu, taprio_free_sched_cb); - new_admin = NULL; + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); - spin_unlock_irqrestore(&q->current_entry_lock, flags); + taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + } + + new_admin = NULL; err = 0; unlock: @@ -1080,6 +1426,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; + if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) + goto options_error; + + if (q->txtime_delay && + nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 64e0a594a651..e5f2fc726a98 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -253,7 +253,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) goto out; fl6_sock_release(flowlabel); } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 2cae7440349c..74847d613835 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -94,11 +94,6 @@ static const struct net_offload sctp6_offload = { }, }; -static const struct skb_checksum_ops crc32c_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; - int __init sctp_offload_init(void) { int ret; @@ -111,7 +106,7 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &crc32c_csum_ops; + crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 23af232c0a25..2d47adcb4cbe 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -81,7 +81,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, return; } - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7621ec2f539c..302e355f2ebc 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -123,30 +123,11 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); -static int smc_release(struct socket *sock) +static int __smc_release(struct smc_sock *smc) { - struct sock *sk = sock->sk; - struct smc_sock *smc; + struct sock *sk = &smc->sk; int rc = 0; - if (!sk) - goto out; - - smc = smc_sk(sk); - - /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) - tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); - - if (sk->sk_state == SMC_LISTEN) - /* smc_close_non_accepted() is called and acquires - * sock lock for child sockets again - */ - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - else - lock_sock(sk); - if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); @@ -174,6 +155,35 @@ static int smc_release(struct socket *sock) smc_conn_free(&smc->conn); } + return rc; +} + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + flush_work(&smc->connect_work); + + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + rc = __smc_release(smc); + /* detach socket */ sock_orphan(sk); sock->sk = NULL; @@ -964,26 +974,7 @@ void smc_close_non_accepted(struct sock *sk) if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) { - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } - sk->sk_prot->unhash(sk); - if (smc->clcsock) { - struct socket *tcp; - - tcp = smc->clcsock; - smc->clcsock = NULL; - sock_release(tcp); - } - if (smc->use_fallback) { - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - } else { - if (sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); - } + __smc_release(smc); release_sock(sk); sock_put(sk); /* final sock_put */ } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 745afd82f281..49bcebff6378 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } @@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } diff --git a/net/socket.c b/net/socket.c index 38eec1583f6d..16449d6daeca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -103,13 +103,6 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> -/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */ -#if IS_ENABLED(CONFIG_INET) -#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) -#else -#define INDIRECT_CALL_INET4(f, f1, ...) f(__VA_ARGS__) -#endif - #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -241,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; - struct socket_wq *wq; ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) { - kmem_cache_free(sock_inode_cachep, ei); - return NULL; - } - init_waitqueue_head(&wq->wait); - wq->fasync_list = NULL; - wq->flags = 0; - ei->socket.wq = wq; + init_waitqueue_head(&ei->socket.wq.wait); + ei->socket.wq.fasync_list = NULL; + ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -265,12 +251,11 @@ static struct inode *sock_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void sock_destroy_inode(struct inode *inode) +static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); - kfree_rcu(ei->socket.wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } @@ -295,7 +280,7 @@ static void init_inodecache(void) static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, - .destroy_inode = sock_destroy_inode, + .free_inode = sock_free_inode, .statfs = simple_statfs, }; @@ -429,7 +414,7 @@ static int sock_map_fd(struct socket *sock, int flags) } newfile = sock_alloc_file(sock, flags, NULL); - if (likely(!IS_ERR(newfile))) { + if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } @@ -606,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (sock->wq->fasync_list) + if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -641,10 +626,13 @@ EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); +INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, + size_t)); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock, - msg, msg_data_left(msg)); + int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + inet_sendmsg, sock, msg, + msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -870,12 +858,15 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, - size_t , int )); + size_t, int)); +INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, + size_t, int)); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg, - msg_data_left(msg), flags); + return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, msg_data_left(msg), + flags); } /** @@ -1289,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; - struct socket_wq *wq; + struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); - wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) @@ -2051,6 +2041,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2055,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2079,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2106,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2114,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2124,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index f64f36a83063..b3815c1e8f2e 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -157,18 +157,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, return 0; } - skb = alloc_skb(0, GFP_ATOMIC); + skb = alloc_skb_for_msg(head); if (!skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *_strp_msg(skb) = *_strp_msg(head); + strp->skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; strp->skb_head = skb; head = skb; } else { diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6c997d4a6218..1336f3cdad38 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; @@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2bed6589f41e..a809c0ec8d15 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -62,7 +62,7 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); - return rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + return rcu_dereference(tn->bearer_list[bearer_id]); } static void bearer_disable(struct net *net, struct tipc_bearer *b); @@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_add_dest(b->disc); rcu_read_unlock(); @@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_remove_dest(b->disc); rcu_read_unlock(); @@ -444,7 +444,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, struct net_device *dev; int delta; - dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); + dev = (struct net_device *)rcu_dereference(b->media_ptr); if (!dev) return 0; @@ -481,7 +481,7 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]); + b = rcu_dereference(tipc_net(net)->bearer_list[bearer_id]); if (b) mtu = b->mtu; rcu_read_unlock(); @@ -574,8 +574,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(dev->tipc_ptr) ?: - rcu_dereference_rtnl(orig_dev->tipc_ptr); + b = rcu_dereference(dev->tipc_ptr) ?: + rcu_dereference(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); diff --git a/net/tipc/link.c b/net/tipc/link.c index 2050fd386642..66d3a07bc571 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -107,7 +107,6 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @prev_from: sequence number of most previous retransmission request - * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. @@ -167,7 +166,6 @@ struct tipc_link { u16 snd_nxt; u16 prev_from; u16 window; - u16 stale_cnt; unsigned long stale_limit; /* Reception */ @@ -209,7 +207,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ +#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) /* @@ -249,9 +247,9 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); /* * Simple non-static link routines (i.e. referenced outside this file) @@ -734,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l) if (msg_user(msg) == MSG_FRAGMENTER) { if (msg_type(msg) != FIRST_FRAGMENT) return; - length = msg_size(msg_get_wrapped(msg)); + length = msg_size(msg_inner_hdr(msg)); } l->stats.msg_lengths_total += length; l->stats.msg_length_counts++; @@ -910,7 +908,6 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -979,8 +976,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = - jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; @@ -1030,7 +1026,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; @@ -1044,32 +1040,68 @@ static void tipc_link_advance_backlog(struct tipc_link *l, l->snd_nxt = seqno; } -static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) +/** + * link_retransmit_failure() - Detect repeated retransmit failures + * @l: tipc link sender + * @r: tipc link receiver (= l in case of unicast) + * @from: seqno of the 1st packet in retransmit request + * @rc: returned code + * + * Return: true if the repeated retransmit failures happens, otherwise + * false + */ +static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, + u16 from, int *rc) { - struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + + if (!skb) + return false; + hdr = buf_msg(skb); + + /* Detect repeated retransmit failures on same packet */ + if (r->prev_from != from) { + r->prev_from = from; + r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + } else if (time_after(jiffies, r->stale_limit)) { + pr_warn("Retransmission failure on link <%s>\n", l->name); + link_print(l, "State of link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(hdr), msg_type(hdr), msg_size(hdr), + msg_errcode(hdr)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); + + trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); + + if (link_is_bc_sndlink(l)) + *rc = TIPC_LINK_DOWN_EVT; + + *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + return true; + } - pr_warn("Retransmission failure on link <%s>\n", l->name); - link_print(l, "State of link "); - pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", - msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); - pr_info("sqno %u, prev: %x, src: %x\n", - msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); + return false; } -/* tipc_link_retrans() - retransmit one or more packets +/* tipc_link_bc_retrans() - retransmit zero or more packets * @l: the link to transmit on * @r: the receiving link ordering the retransmit. Same as l if unicast * @from: retransmit from (inclusive) this sequence number * @to: retransmit to (inclusive) this sequence number * xmitq: queue for accumulating the retransmitted packets */ -static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; + int rc = 0; if (!skb) return 0; @@ -1077,20 +1109,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, return 0; trace_tipc_link_retrans(r, from, to, &l->transmq); - /* Detect repeated retransmit failures on same packet */ - if (r->prev_from != from) { - r->prev_from = from; - r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); - r->stale_cnt = 0; - } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { - link_retransmit_failure(l, skb); - trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); - trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); - trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); - if (link_is_bc_sndlink(l)) - return TIPC_LINK_DOWN_EVT; - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - } + + if (link_retransmit_failure(l, r, from, &rc)) + return rc; skb_queue_walk(&l->transmq, skb) { hdr = buf_msg(skb); @@ -1101,9 +1122,9 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; } - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); @@ -1324,17 +1345,23 @@ exit: * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * + * In case of a repeated retransmit failures, the call will return shortly + * with a returned code (e.g. TIPC_LINK_DOWN_EVT) */ -static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, - struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) +static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) { struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; - u16 seqno; - u16 n = 0; + u16 seqno, n = 0; + int rc = 0; + + if (gap && link_retransmit_failure(l, l, acked + 1, &rc)) + return rc; skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); @@ -1369,6 +1396,8 @@ next_gap_ack: goto next_gap_ack; } } + + return 0; } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1481,7 +1510,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1918,7 +1946,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); /* If NACK, retransmit will now start at right position */ if (gap) @@ -2035,7 +2063,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) @@ -2131,7 +2159,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); l->stats.recv_nacks++; return rc; } @@ -2550,7 +2578,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); i += scnprintf(buf + i, sz - i, " %u", l->prev_from); - i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt); + i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", l->acked); list = &l->transmq; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8de02ad6e352..da509f0eb9ca 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m) return ((unchar *)m) + msg_hdr_sz(m); } -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } @@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) - m = msg_get_wrapped(m); + m = msg_inner_hdr(m); return msg_word(m, 4); } diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 99bd166bccec..d6165ad384c0 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -261,7 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = { .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, .maxattr = TIPC_NLA_MAX, - .policy = tipc_nl_policy, + .policy = tipc_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = tipc_genl_v2_ops, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index cf155061c472..d86030ef1232 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -691,7 +691,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *media; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -699,10 +698,6 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, if (!media) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; @@ -723,7 +718,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *bearer; struct tipc_link_config *lc; - int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -731,10 +725,6 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, if (!bearer) return -EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); - if (!string_is_valid(lc->name, len)) - return -EINVAL; - if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; diff --git a/net/tipc/node.c b/net/tipc/node.c index 550581d47d51..324a1f91b394 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 1405ccc9101c..287df68721df 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -76,6 +76,7 @@ struct udp_media_addr { /* struct udp_replicast - container for UDP remote addresses */ struct udp_replicast { struct udp_media_addr addr; + struct dst_cache dst_cache; struct rcu_head rcu; struct list_head list; }; @@ -158,22 +159,27 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) /* tipc_send_msg - enqueue a send request */ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_bearer *ub, struct udp_media_addr *src, - struct udp_media_addr *dst) + struct udp_media_addr *dst, struct dst_cache *cache) { + struct dst_entry *ndst = dst_cache_get(cache); int ttl, err = 0; - struct rtable *rt; if (dst->proto == htons(ETH_P_IP)) { - struct flowi4 fl = { - .daddr = dst->ipv4.s_addr, - .saddr = src->ipv4.s_addr, - .flowi4_mark = skb->mark, - .flowi4_proto = IPPROTO_UDP - }; - rt = ip_route_output_key(net, &fl); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto tx_error; + struct rtable *rt = (struct rtable *)ndst; + + if (!rt) { + struct flowi4 fl = { + .daddr = dst->ipv4.s_addr, + .saddr = src->ipv4.s_addr, + .flowi4_mark = skb->mark, + .flowi4_proto = IPPROTO_UDP + }; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto tx_error; + } + dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } ttl = ip4_dst_hoplimit(&rt->dst); @@ -182,17 +188,19 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { - struct dst_entry *ndst; - struct flowi6 fl6 = { - .flowi6_oif = ub->ifindex, - .daddr = dst->ipv6, - .saddr = src->ipv6, - .flowi6_proto = IPPROTO_UDP - }; - err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst, - &fl6); - if (err) - goto tx_error; + if (!ndst) { + struct flowi6 fl6 = { + .flowi6_oif = ub->ifindex, + .daddr = dst->ipv6, + .saddr = src->ipv6, + .flowi6_proto = IPPROTO_UDP + }; + err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, + &ndst, &fl6); + if (err) + goto tx_error; + dst_cache_set_ip6(cache, ndst, &fl6.saddr); + } ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, @@ -223,14 +231,15 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rcu_dereference(b->media_ptr); if (!ub) { err = -ENODEV; goto out; } if (addr->broadcast != TIPC_REPLICAST_SUPPORT) - return tipc_udp_xmit(net, skb, ub, src, dst); + return tipc_udp_xmit(net, skb, ub, src, dst, + &ub->rcast.dst_cache); /* Replicast, send an skb to each configured IP address */ list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { @@ -242,7 +251,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, goto out; } - err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); + err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr, + &rcast->dst_cache); if (err) goto out; } @@ -286,6 +296,11 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b, if (!rcast) return -ENOMEM; + if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) { + kfree(rcast); + return -ENOMEM; + } + memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); if (ntohs(addr->proto) == ETH_P_IP) @@ -475,7 +490,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) } } - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { rtnl_unlock(); return -EINVAL; @@ -517,7 +532,7 @@ int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) struct udp_bearer *ub; struct nlattr *nest; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) return -ENODEV; @@ -742,6 +757,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); + err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC); + if (err) + goto free; + /** * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. @@ -752,12 +771,14 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, else err = tipc_udp_rcast_add(b, &remote); if (err) - goto err; + goto free; return 0; + +free: + dst_cache_destroy(&ub->rcast.dst_cache); + udp_tunnel_sock_release(ub->ubsock); err: - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); kfree(ub); return err; } @@ -769,12 +790,13 @@ static void cleanup_bearer(struct work_struct *work) struct udp_replicast *rcast, *tmp; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + dst_cache_destroy(&rcast->dst_cache); list_del_rcu(&rcast->list); kfree_rcu(rcast, rcu); } - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); + dst_cache_destroy(&ub->rcast.dst_cache); + udp_tunnel_sock_release(ub->ubsock); synchronize_net(); kfree(ub); } @@ -784,13 +806,12 @@ static void tipc_udp_disable(struct tipc_bearer *b) { struct udp_bearer *ub; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { pr_err("UDP bearer instance not found\n"); return; } - if (ub->ubsock) - sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index eb8f24f420f0..92fd1352c037 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -209,6 +209,29 @@ void tls_device_free_resources_tx(struct sock *sk) tls_free_partial_record(sk, tls_ctx); } +static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, + u32 seq) +{ + struct net_device *netdev; + struct sk_buff *skb; + u8 *rcd_sn; + + skb = tcp_write_queue_tail(sk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + rcd_sn = tls_ctx->tx.rec_seq; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (netdev) + netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, + TLS_OFFLOAD_CTX_DIR_TX); + up_read(&device_offload_lock); + + clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); +} + static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) @@ -252,7 +275,7 @@ static int tls_push_record(struct sock *sk, skb_frag_address(frag), record->len - prot->prepend_size, record_type, - ctx->crypto_send.info.version); + prot->version); /* HW doesn't care about the data in the tag, because it fills it. */ dummy_tag_frag.page = skb_frag_page(frag); @@ -264,7 +287,11 @@ static int tls_push_record(struct sock *sk, list_add_tail(&record->list, &offload_ctx->records_list); spin_unlock_irq(&offload_ctx->lock); offload_ctx->open_record = NULL; - tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version); + + if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) + tls_device_resync_tx(sk, ctx, tp->write_seq); + + tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; @@ -551,7 +578,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) } static void tls_device_resync_rx(struct tls_context *tls_ctx, - struct sock *sk, u32 seq, u64 rcd_sn) + struct sock *sk, u32 seq, u8 *rcd_sn) { struct net_device *netdev; @@ -559,14 +586,17 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, return; netdev = READ_ONCE(tls_ctx->netdev); if (netdev) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn); + netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, + TLS_OFFLOAD_CTX_DIR_RX); clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); } -void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + struct tls_prot_info *prot; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -574,15 +604,83 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) if (tls_ctx->rx_conf != TLS_HW) return; + prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); - resync_req = atomic64_read(&rx_ctx->resync_req); - req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); - is_req_pending = resync_req; + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); - if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + switch (rx_ctx->resync_type) { + case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); + is_req_pending = resync_req; + + if (likely(!is_req_pending) || req_seq != seq || + !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + return; + break; + case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: + if (likely(!rx_ctx->resync_nh_do_now)) + return; + + /* head of next rec is already in, note that the sock_inq will + * include the currently parsed message when called from parser + */ + if (tcp_inq(sk) > rcd_len) + return; + + rx_ctx->resync_nh_do_now = 0; + seq += rcd_len; + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + break; + } + + tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); +} + +static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, + struct tls_offload_context_rx *ctx, + struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm; + + /* device will request resyncs by itself based on stream scan */ + if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) + return; + /* already scheduled */ + if (ctx->resync_nh_do_now) + return; + /* seen decrypted fragments since last fully-failed record */ + if (ctx->resync_nh_reset) { + ctx->resync_nh_reset = 0; + ctx->resync_nh.decrypted_failed = 1; + ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; + return; + } + + if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) + return; + + /* doing resync, bump the next target in case it fails */ + if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) + ctx->resync_nh.decrypted_tgt *= 2; + else + ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; + + rxm = strp_msg(skb); + + /* head of next rec is already in, parser will sync for us */ + if (tcp_inq(sk) > rxm->full_len) { + ctx->resync_nh_do_now = 1; + } else { + struct tls_prot_info *prot = &tls_ctx->prot_info; + u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + + memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); + tls_bigint_increment(rcd_sn, prot->rec_seq_size); + + tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, + rcd_sn); } } @@ -610,8 +708,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); - skb_copy_bits(skb, offset, buf, - TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + err = skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + if (err) + goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, skb, sg); @@ -625,8 +725,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb->decrypted) { + err = skb_store_bits(skb, offset, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -649,8 +752,11 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); - if (skb_iter->decrypted) - skb_store_bits(skb_iter, frag_pos, buf, copy); + if (skb_iter->decrypted) { + err = skb_store_bits(skb_iter, frag_pos, buf, copy); + if (err) + goto free_buf; + } offset += copy; buf += copy; @@ -671,10 +777,6 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) int is_encrypted = !is_decrypted; struct sk_buff *skb_iter; - /* Skip if it is already decrypted */ - if (ctx->sw.decrypted) - return 0; - /* Check if all the data is decrypted already */ skb_walk_frags(skb, skb_iter) { is_decrypted &= skb_iter->decrypted; @@ -683,12 +785,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) ctx->sw.decrypted |= is_decrypted; - /* Return immedeatly if the record is either entirely plaintext or + /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ - return (is_encrypted || is_decrypted) ? 0 : - tls_device_reencrypt(sk, skb); + if (is_decrypted) { + ctx->resync_nh_reset = 1; + return 0; + } + if (is_encrypted) { + tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); + return 0; + } + + ctx->resync_nh_reset = 1; + return tls_device_reencrypt(sk, skb); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, @@ -762,6 +873,12 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_offload_ctx; } + /* Sanity-check the rec_seq_size for stack allocations */ + if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { + rc = -EINVAL; + goto free_offload_ctx; + } + prot->prepend_size = TLS_HEADER_SIZE + nonce_size; prot->tag_size = tag_size; prot->overhead_size = prot->prepend_size + prot->tag_size; @@ -916,6 +1033,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = -ENOMEM; goto release_netdev; } + context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, ctx, 0); @@ -1023,7 +1141,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if ((dev->features & NETIF_F_HW_TLS_RX) && - !dev->tlsdev_ops->tls_dev_resync_rx) + !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index c3a5fe624b4e..1d2d804ac633 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -240,7 +240,6 @@ static int fill_sg_in(struct scatterlist *sg_in, record = tls_get_record(ctx, tcp_seq, rcd_sn); if (!record) { spin_unlock_irqrestore(&ctx->lock, flags); - WARN(1, "Record not found for seq %u\n", tcp_seq); return -EINVAL; } @@ -409,7 +408,10 @@ put_sg: put_page(sg_page(&sg_in[--resync_sgs])); kfree(sg_in); free_orig: - kfree_skb(skb); + if (nskb) + consume_skb(skb); + else + kfree_skb(skb); return nskb; } @@ -424,6 +426,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); +struct sk_buff *tls_encrypt_skb(struct sk_buff *skb) +{ + return tls_sw_fallback(skb->sk, skb); +} +EXPORT_SYMBOL_GPL(tls_encrypt_skb); + int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e2385183526e..53b4ad94e74a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk, /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; - tls_advance_record_sn(sk, &tls_ctx->tx, prot->version); + tls_advance_record_sn(sk, prot, &tls_ctx->tx); return rc; } @@ -1485,15 +1485,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; - int version = prot->version; struct strp_msg *rxm = strp_msg(skb); int pad, err = 0; if (!ctx->decrypted) { #ifdef CONFIG_TLS_DEVICE - err = tls_device_decrypted(sk, skb); - if (err < 0) - return err; + if (tls_ctx->rx_conf == TLS_HW) { + err = tls_device_decrypted(sk, skb); + if (err < 0) + return err; + } #endif /* Still not decrypted after tls_device */ if (!ctx->decrypted) { @@ -1501,8 +1502,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, async); if (err < 0) { if (err == -EINPROGRESS) - tls_advance_record_sn(sk, &tls_ctx->rx, - version); + tls_advance_record_sn(sk, prot, + &tls_ctx->rx); return err; } @@ -1517,7 +1518,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, rxm->full_len -= pad; rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx, version); + tls_advance_record_sn(sk, prot, &tls_ctx->rx); ctx->decrypted = true; ctx->saved_data_ready(sk); } else { @@ -2014,8 +2015,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } #ifdef CONFIG_TLS_DEVICE - handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, - *(u64*)tls_ctx->rx.rec_seq); + tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE, + TCP_SKB_CB(skb)->seq + rxm->offset); #endif return data_len + TLS_HEADER_SIZE; @@ -2282,8 +2283,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } - /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { + /* Sanity-check the sizes for stack allocations. */ + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || + rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { rc = -EINVAL; goto free_priv; } diff --git a/net/unix/diag.c b/net/unix/diag.c index c51a707260fa..9ff64f9df1f3 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -5,9 +5,11 @@ #include <linux/unix_diag.h> #include <linux/skbuff.h> #include <linux/module.h> +#include <linux/uidgid.h> #include <net/netlink.h> #include <net/af_unix.h> #include <net/tcp_states.h> +#include <net/sock.h> static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { @@ -111,6 +113,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +{ + uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { @@ -157,6 +165,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) goto out_nlmsg_trim; + if ((req->udiag_show & UDIAG_SHOW_UID) && + sk_diag_dump_uid(sk, skb)) + goto out_nlmsg_trim; + nlmsg_end(skb, nlh); return 0; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 169112f8aa1e..ab47bf3ab66e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -274,7 +274,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_bound(vsk); + if (__vsock_in_bound_table(vsk)) + __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); @@ -282,7 +283,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); - __vsock_remove_connected(vsk); + if (__vsock_in_connected_table(vsk)) + __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); @@ -318,35 +320,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); -static bool vsock_in_bound_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_bound_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - return ret; -} - -static bool vsock_in_connected_table(struct vsock_sock *vsk) -{ - bool ret; - - spin_lock_bh(&vsock_table_lock); - ret = __vsock_in_connected_table(vsk); - spin_unlock_bh(&vsock_table_lock); - - return ret; -} - void vsock_remove_sock(struct vsock_sock *vsk) { - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -477,8 +454,7 @@ static void vsock_pending_work(struct work_struct *work) * incoming packets can't find this socket, and to reduce the reference * count. */ - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); + vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 62dcdf082349..f2084e3f7aa4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -14,14 +14,14 @@ #include <net/sock.h> #include <net/af_vsock.h> -/* The host side's design of the feature requires 6 exact 4KB pages for - * recv/send rings respectively -- this is suboptimal considering memory - * consumption, however unluckily we have to live with it, before the - * host comes up with a better design in the future. +/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some + * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer + * hosts don't have this limitation; but, keep the defaults the same for compat. */ #define PAGE_SIZE_4K 4096 #define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) #define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) +#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64) /* The MTU is 16KB per the host side's design */ #define HVS_MTU_SIZE (1024 * 16) @@ -46,8 +46,9 @@ struct hvs_recv_buf { }; /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use - * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated - * buffer, because tests show there is no significant performance difference. + * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the + * guest and the host processing as one VMBUS packet is the smallest processing + * unit. * * Note: the buffer can be eliminated in the future when we add new VMBus * ringbuffer APIs that allow us to directly copy data from userspace buffer @@ -321,8 +322,11 @@ static void hvs_open_connection(struct vmbus_channel *chan) struct sockaddr_vm addr; struct sock *sk, *new = NULL; struct vsock_sock *vnew = NULL; - struct hvsock *hvs, *hvs_new = NULL; + struct hvsock *hvs = NULL; + struct hvsock *hvs_new = NULL; + int rcvbuf; int ret; + int sndbuf; if_type = &chan->offermsg.offer.if_type; if_instance = &chan->offermsg.offer.if_instance; @@ -364,9 +368,34 @@ static void hvs_open_connection(struct vmbus_channel *chan) } set_channel_read_mode(chan, HV_CALL_DIRECT); - ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE, - RINGBUFFER_HVS_RCV_SIZE, NULL, 0, - hvs_channel_cb, conn_from_host ? new : sk); + + /* Use the socket buffer sizes as hints for the VMBUS ring size. For + * server side sockets, 'sk' is the parent socket and thus, this will + * allow the child sockets to inherit the size from the parent. Keep + * the mins to the default value and align to page size as per VMBUS + * requirements. + * For the max, the socket core library will limit the socket buffer + * size that can be set by the user, but, since currently, the hv_sock + * VMBUS ring buffer is physically contiguous allocation, restrict it + * further. + * Older versions of hv_sock host side code cannot handle bigger VMBUS + * ring buffer size. Use the version number to limit the change to newer + * versions. + */ + if (vmbus_proto_version < VERSION_WIN10_V5) { + sndbuf = RINGBUFFER_HVS_SND_SIZE; + rcvbuf = RINGBUFFER_HVS_RCV_SIZE; + } else { + sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); + sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); + sndbuf = ALIGN(sndbuf, PAGE_SIZE); + rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); + rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); + rcvbuf = ALIGN(rcvbuf, PAGE_SIZE); + } + + ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, + conn_from_host ? new : sk); if (ret != 0) { if (conn_from_host) { hvs_new->chan = NULL; @@ -424,6 +453,7 @@ static u32 hvs_get_local_cid(void) static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) { struct hvsock *hvs; + struct sock *sk = sk_vsock(vsk); hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); if (!hvs) @@ -431,7 +461,8 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->trans = hvs; hvs->vsk = vsk; - + sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE; + sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE; return 0; } @@ -627,7 +658,9 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, struct hvsock *hvs = vsk->trans; struct vmbus_channel *chan = hvs->chan; struct hvs_send_buf *send_buf; - ssize_t to_write, max_writable, ret; + ssize_t to_write, max_writable; + ssize_t ret = 0; + ssize_t bytes_written = 0; BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); @@ -635,20 +668,34 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, if (!send_buf) return -ENOMEM; - max_writable = hvs_channel_writable_bytes(chan); - to_write = min_t(ssize_t, len, max_writable); - to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); - - ret = memcpy_from_msg(send_buf->data, msg, to_write); - if (ret < 0) - goto out; + /* Reader(s) could be draining data from the channel as we write. + * Maximize bandwidth, by iterating until the channel is found to be + * full. + */ + while (len) { + max_writable = hvs_channel_writable_bytes(chan); + if (!max_writable) + break; + to_write = min_t(ssize_t, len, max_writable); + to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); + /* memcpy_from_msg is safe for loop as it advances the offsets + * within the message iterator. + */ + ret = memcpy_from_msg(send_buf->data, msg, to_write); + if (ret < 0) + goto out; - ret = hvs_send_data(hvs->chan, send_buf, to_write); - if (ret < 0) - goto out; + ret = hvs_send_data(hvs->chan, send_buf, to_write); + if (ret < 0) + goto out; - ret = to_write; + bytes_written += to_write; + len -= to_write; + } out: + /* If any data has been sent, return that */ + if (bytes_written) + ret = bytes_written; kfree(send_buf); return ret; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 96dafa978268..0815d1357861 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -605,7 +605,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev) return ret; /* Only one virtio-vsock device per guest is supported */ - if (the_virtio_vsock) { + if (rcu_dereference_protected(the_virtio_vsock, + lockdep_is_held(&the_virtio_vsock_mutex))) { ret = -EBUSY; goto out; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 53ad3dbb76fe..45d9afcff6d5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; } + for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { + /* + * Validate we have a policy (can be explicitly set to + * VENDOR_CMD_RAW_DATA which is non-NULL) and also that + * we have at least one of doit/dumpit. + */ + if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) + return -EINVAL; + if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && + !rdev->wiphy.vendor_commands[i].dumpit)) + return -EINVAL; + } + #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || diff --git a/net/wireless/core.h b/net/wireless/core.h index 84d36ca7a7ab..ee8388fe4a92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +struct cfg80211_internal_bss * +cfg80211_bss_update(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *tmp, + bool signal_valid, unsigned long ts); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 520d437aa8d1..fc83dd179c1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -571,6 +571,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PEER_MEASUREMENTS] = NLA_POLICY_NESTED(nl80211_pmsr_attr_policy), [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, + .len = SAE_PASSWORD_MAX_LEN }, + [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4447,6 +4450,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD) && auth_type == NL80211_AUTHTYPE_SAE) return false; @@ -4637,6 +4642,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.twt_responder = + nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) @@ -8751,7 +8759,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) static bool nl80211_valid_wpa_versions(u32 wpa_versions) { return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | - NL80211_WPA_VERSION_2)); + NL80211_WPA_VERSION_2 | + NL80211_WPA_VERSION_3)); } static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) @@ -8987,6 +8996,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } + if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD)) + return -EINVAL; + settings->sae_pwd = + nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + settings->sae_pwd_len = + nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + } + return 0; } @@ -12669,6 +12688,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb, return 0; } +static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + if (vcmd->policy == VENDOR_CMD_RAW_DATA) { + if (attr->nla_type & NLA_F_NESTED) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unexpected nested data"); + return -EINVAL; + } + + return 0; + } + + if (!(attr->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data"); + return -EINVAL; + } + + return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy, + extack); +} + static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -12727,11 +12769,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]); len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy(vcmd, + info->attrs[NL80211_ATTR_VENDOR_DATA], + info->extack); + if (err) + return err; } rdev->cur_cmd_info = info; - err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev, - data, len); + err = vcmd->doit(&rdev->wiphy, wdev, data, len); rdev->cur_cmd_info = NULL; return err; } @@ -12818,6 +12865,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]); + + err = nl80211_vendor_check_policy( + &(*rdev)->wiphy.vendor_commands[vcmd_idx], + attrbuf[NL80211_ATTR_VENDOR_DATA], + cb->extack); + if (err) + return err; } /* 0 is the first index - add 1 to parse only once */ @@ -15086,7 +15140,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, return; } - if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -15376,6 +15432,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); +void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan); + nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL, + rdev, wdev, cookie, chan, 0, gfp); +} +EXPORT_SYMBOL(cfg80211_tx_mgmt_expired); + void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { diff --git a/net/wireless/scan.c b/net/wireless/scan.c index aa571d727903..d66e6d4b7555 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss { }; /* Returned bss is reference counted and must be cleaned up appropriately. */ -static struct cfg80211_internal_bss * +struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, - bool signal_valid) + bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; if (WARN_ON(!tmp->pub.channel)) return NULL; - tmp->ts = jiffies; + tmp->ts = ts; spin_lock_bh(&rdev->bss_lock); @@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; - res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); + res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, + jiffies); if (!res) return NULL; @@ -1972,6 +1974,27 @@ out: } EXPORT_SYMBOL(cfg80211_unlink_bss); +void cfg80211_bss_iter(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + void (*iter)(struct wiphy *wiphy, + struct cfg80211_bss *bss, + void *data), + void *iter_data) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&rdev->bss_lock); + + list_for_each_entry(bss, &rdev->bss_list, list) { + if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) + iter(wiphy, &bss->pub, iter_data); + } + + spin_unlock_bh(&rdev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_bss_iter); + #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7d34cb884840..7a6c38ddc65a 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev, u8 *next; if (params->bss) { - /* Make sure the bss entry provided by the driver is valid. */ struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); - if (WARN_ON(list_empty(&ibss->list))) { - cfg80211_put_bss(wdev->wiphy, params->bss); - return; + if (list_empty(&ibss->list)) { + struct cfg80211_bss *found = NULL, *tmp = params->bss; + + found = cfg80211_get_bss(wdev->wiphy, NULL, + params->bss->bssid, + wdev->ssid, wdev->ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (found) { + /* The same BSS is already updated so use it + * instead, as it has latest info. + */ + params->bss = found; + } else { + /* Update with BSS provided by driver, it will + * be freshly added and ref cnted, we can free + * the old one. + * + * signal_valid can be false, as we are not + * expecting the BSS to be found. + * + * keep the old timestamp to avoid confusion + */ + cfg80211_bss_update(rdev, ibss, false, + ibss->ts); + } + + cfg80211_put_bss(wdev->wiphy, tmp); } } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2abfff925aac..4fbb91a511ae 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired, WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_tx_mgmt_expired, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan), + TP_ARGS(wdev, cookie, chan), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT, + WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) +); + TRACE_EVENT(cfg80211_new_sta, TP_PROTO(struct net_device *netdev, const u8 *mac_addr, struct station_info *sinfo), diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f53a6ef7c155..d4d6f10aa936 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return xskq_has_addrs(umem->fq, cnt); +} +EXPORT_SYMBOL(xsk_umem_has_addrs); + u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { return xskq_peek_addr(umem->fq, addr); @@ -123,13 +129,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u64 addr; int err; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; + spin_lock_bh(&xs->rx_lock); + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { + err = -EINVAL; + goto out_unlock; + } if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; + err = -ENOSPC; + goto out_drop; } addr += xs->umem->headroom; @@ -138,13 +148,21 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) { - xskq_discard_addr(xs->umem->fq); - xsk_flush(xs); - return 0; - } + if (err) + goto out_drop; + + xskq_discard_addr(xs->umem->fq); + xskq_produce_flush_desc(xs->rx); + + spin_unlock_bh(&xs->rx_lock); + xs->sk.sk_data_ready(&xs->sk); + return 0; + +out_drop: xs->rx_dropped++; +out_unlock: + spin_unlock_bh(&xs->rx_lock); return err; } @@ -166,22 +184,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_consume_tx_done); -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) { - struct xdp_desc desc; struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, &desc)) + if (!xskq_peek_desc(xs->tx, desc)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc.addr)) + if (xskq_produce_addr_lazy(umem->cq, desc->addr)) goto out; - *dma = xdp_umem_get_dma(umem, desc.addr); - *len = desc.len; - xskq_discard_desc(xs->tx); rcu_read_unlock(); return true; @@ -662,6 +676,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_OPTIONS: + { + struct xdp_options opts = {}; + + if (len < sizeof(opts)) + return -EINVAL; + + mutex_lock(&xs->mutex); + if (xs->zc) + opts.flags |= XDP_OPTIONS_ZEROCOPY; + mutex_unlock(&xs->mutex); + + len = sizeof(opts); + if (copy_to_user(optval, &opts, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } @@ -819,6 +853,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; mutex_init(&xs->mutex); + spin_lock_init(&xs->rx_lock); spin_lock_init(&xs->tx_completion_lock); mutex_lock(&net->xdp.lock); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cba4a640d5e8..909c5168ed0f 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) return q->nentries - (producer - q->cons_tail); } +static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries >= cnt) + return true; + + /* Refresh the local pointer. */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + + return entries >= cnt; +} + /* UMEM queue */ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index ff654306d836..189ef15acbbc 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -271,9 +271,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && - (!xdst->child->xfrm && x->type->get_mtu)) { - mtu = x->type->get_mtu(x, xdst->child_mtu_cached); - + (!xdst->child->xfrm)) { + mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 314973aaa414..6088bc2dc11e 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -359,28 +359,29 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); if (likely(afinfo)) err = afinfo->extract_input(x, skb); + rcu_read_unlock(); - if (err) { - rcu_read_unlock(); + if (err) return err; - } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (!inner_mode) { - rcu_read_unlock(); + if (!inner_mode) return -EAFNOSUPPORT; - } } - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (unlikely(!afinfo)) { - rcu_read_unlock(); - return -EAFNOSUPPORT; + switch (inner_mode->family) { + case AF_INET: + skb->protocol = htons(ETH_P_IP); + break; + case AF_INET6: + skb->protocol = htons(ETH_P_IPV6); + break; + default: + WARN_ON_ONCE(1); + break; } - skb->protocol = afinfo->eth_proto; - rcu_read_unlock(); return xfrm_inner_mode_encap_remove(x, inner_mode, skb); } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 7dbe0c608df5..74868f9d81fb 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -751,11 +751,6 @@ static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) unregister_netdevice_many(&list); } -static int __net_init xfrmi_init_net(struct net *net) -{ - return 0; -} - static void __net_exit xfrmi_exit_net(struct net *net) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); @@ -766,7 +761,6 @@ static void __net_exit xfrmi_exit_net(struct net *net) } static struct pernet_operations xfrmi_net_ops = { - .init = xfrmi_init_net, .exit = xfrmi_exit_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4fb58dfecc7a..8ca637a72697 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3627,7 +3627,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_nr = ti; if (npols > 1) { - xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net); + xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 50621d982970..c6f3c4a1bd99 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -27,6 +27,8 @@ #include <linux/interrupt.h> #include <linux/kernel.h> +#include <crypto/aead.h> + #include "xfrm_hash.h" #define xfrm_state_deref_prot(table, net) \ @@ -177,63 +179,132 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) static bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); -static DEFINE_SPINLOCK(xfrm_type_lock); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type **typemap; int err = 0; - if (unlikely(afinfo == NULL)) + if (!afinfo) return -EAFNOSUPPORT; - typemap = afinfo->type_map; - spin_lock_bh(&xfrm_type_lock); - if (likely(typemap[type->proto] == NULL)) - typemap[type->proto] = type; - else - err = -EEXIST; - spin_unlock_bh(&xfrm_type_lock); +#define X(afi, T, name) do { \ + WARN_ON((afi)->type_ ## name); \ + (afi)->type_ ## name = (T); \ + } while (0) + + switch (type->proto) { + case IPPROTO_COMP: + X(afinfo, type, comp); + break; + case IPPROTO_AH: + X(afinfo, type, ah); + break; + case IPPROTO_ESP: + X(afinfo, type, esp); + break; + case IPPROTO_IPIP: + X(afinfo, type, ipip); + break; + case IPPROTO_DSTOPTS: + X(afinfo, type, dstopts); + break; + case IPPROTO_ROUTING: + X(afinfo, type, routing); + break; + case IPPROTO_IPV6: + X(afinfo, type, ipip6); + break; + default: + WARN_ON(1); + err = -EPROTONOSUPPORT; + break; + } +#undef X rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); -int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) +void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type **typemap; - int err = 0; if (unlikely(afinfo == NULL)) - return -EAFNOSUPPORT; - typemap = afinfo->type_map; - spin_lock_bh(&xfrm_type_lock); + return; - if (unlikely(typemap[type->proto] != type)) - err = -ENOENT; - else - typemap[type->proto] = NULL; - spin_unlock_bh(&xfrm_type_lock); +#define X(afi, T, name) do { \ + WARN_ON((afi)->type_ ## name != (T)); \ + (afi)->type_ ## name = NULL; \ + } while (0) + + switch (type->proto) { + case IPPROTO_COMP: + X(afinfo, type, comp); + break; + case IPPROTO_AH: + X(afinfo, type, ah); + break; + case IPPROTO_ESP: + X(afinfo, type, esp); + break; + case IPPROTO_IPIP: + X(afinfo, type, ipip); + break; + case IPPROTO_DSTOPTS: + X(afinfo, type, dstopts); + break; + case IPPROTO_ROUTING: + X(afinfo, type, routing); + break; + case IPPROTO_IPV6: + X(afinfo, type, ipip6); + break; + default: + WARN_ON(1); + break; + } +#undef X rcu_read_unlock(); - return err; } EXPORT_SYMBOL(xfrm_unregister_type); static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { + const struct xfrm_type *type = NULL; struct xfrm_state_afinfo *afinfo; - const struct xfrm_type **typemap; - const struct xfrm_type *type; int modload_attempted = 0; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; - typemap = afinfo->type_map; - type = READ_ONCE(typemap[proto]); + switch (proto) { + case IPPROTO_COMP: + type = afinfo->type_comp; + break; + case IPPROTO_AH: + type = afinfo->type_ah; + break; + case IPPROTO_ESP: + type = afinfo->type_esp; + break; + case IPPROTO_IPIP: + type = afinfo->type_ipip; + break; + case IPPROTO_DSTOPTS: + type = afinfo->type_dstopts; + break; + case IPPROTO_ROUTING: + type = afinfo->type_routing; + break; + case IPPROTO_IPV6: + type = afinfo->type_ipip6; + break; + default: + break; + } + if (unlikely(type && !try_module_get(type->owner))) type = NULL; @@ -253,65 +324,71 @@ static void xfrm_put_type(const struct xfrm_type *type) module_put(type->owner); } -static DEFINE_SPINLOCK(xfrm_type_offload_lock); int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type_offload **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; - typemap = afinfo->type_offload_map; - spin_lock_bh(&xfrm_type_offload_lock); - if (likely(typemap[type->proto] == NULL)) - typemap[type->proto] = type; - else - err = -EEXIST; - spin_unlock_bh(&xfrm_type_offload_lock); + switch (type->proto) { + case IPPROTO_ESP: + WARN_ON(afinfo->type_offload_esp); + afinfo->type_offload_esp = type; + break; + default: + WARN_ON(1); + err = -EPROTONOSUPPORT; + break; + } + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type_offload); -int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, - unsigned short family) +void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, + unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - const struct xfrm_type_offload **typemap; - int err = 0; if (unlikely(afinfo == NULL)) - return -EAFNOSUPPORT; - typemap = afinfo->type_offload_map; - spin_lock_bh(&xfrm_type_offload_lock); + return; - if (unlikely(typemap[type->proto] != type)) - err = -ENOENT; - else - typemap[type->proto] = NULL; - spin_unlock_bh(&xfrm_type_offload_lock); + switch (type->proto) { + case IPPROTO_ESP: + WARN_ON(afinfo->type_offload_esp != type); + afinfo->type_offload_esp = NULL; + break; + default: + WARN_ON(1); + break; + } rcu_read_unlock(); - return err; } EXPORT_SYMBOL(xfrm_unregister_type_offload); static const struct xfrm_type_offload * xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) { + const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - const struct xfrm_type_offload **typemap; - const struct xfrm_type_offload *type; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; - typemap = afinfo->type_offload_map; - type = typemap[proto]; + switch (proto) { + case IPPROTO_ESP: + type = afinfo->type_offload_esp; + break; + default: + break; + } + if ((type && !try_module_get(type->owner))) type = NULL; @@ -770,24 +847,79 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) EXPORT_SYMBOL(xfrm_sad_getinfo); static void +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET; + sel->prefixlen_d = 32; + sel->prefixlen_s = 32; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; +} + +static void +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi6 *fl6 = &fl->u.ip6; + + /* Initialize temporary selector matching only to current session. */ + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); - - if (!afinfo) - return; + switch (family) { + case AF_INET: + __xfrm4_init_tempsel(&x->sel, fl); + break; + case AF_INET6: + __xfrm6_init_tempsel(&x->sel, fl); + break; + } - afinfo->init_tempsel(&x->sel, fl); + x->id = tmpl->id; - if (family != tmpl->encap_family) { - afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family); - if (!afinfo) - return; + switch (tmpl->encap_family) { + case AF_INET: + if (x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + break; + case AF_INET6: + if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + break; } - afinfo->init_temprop(x, tmpl, daddr, saddr); + + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = tmpl->encap_family; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, @@ -1633,51 +1765,129 @@ xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY -int +#if IS_ENABLED(CONFIG_IPV6) +/* distribution counting sort function for xfrm_state and xfrm_tmpl */ +static void +__xfrm6_sort(void **dst, void **src, int n, + int (*cmp)(const void *p), int maxclass) +{ + int count[XFRM_MAX_DEPTH] = { }; + int class[XFRM_MAX_DEPTH]; + int i; + + for (i = 0; i < n; i++) { + int c = cmp(src[i]); + + class[i] = c; + count[c]++; + } + + for (i = 2; i < maxclass; i++) + count[i] += count[i - 1]; + + for (i = 0; i < n; i++) { + dst[count[class[i] - 1]++] = src[i]; + src[i] = NULL; + } +} + +/* Rule for xfrm_state: + * + * rule 1: select IPsec transport except AH + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec transport AH + * rule 4: select IPsec tunnel + * rule 5: others + */ +static int __xfrm6_state_sort_cmp(const void *p) +{ + const struct xfrm_state *v = p; + + switch (v->props.mode) { + case XFRM_MODE_TRANSPORT: + if (v->id.proto != IPPROTO_AH) + return 1; + else + return 3; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 4; + } + return 5; +} + +/* Rule for xfrm_tmpl: + * + * rule 1: select IPsec transport + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec tunnel + * rule 4: others + */ +static int __xfrm6_tmpl_sort_cmp(const void *p) +{ + const struct xfrm_tmpl *v = p; + + switch (v->mode) { + case XFRM_MODE_TRANSPORT: + return 1; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 3; + } + return 4; +} +#else +static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; } +static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; } + +static inline void +__xfrm6_sort(void **dst, void **src, int n, + int (*cmp)(const void *p), int maxclass) +{ + int i; + + for (i = 0; i < n; i++) + dst[i] = src[i]; +} +#endif /* CONFIG_IPV6 */ + +void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, - unsigned short family, struct net *net) + unsigned short family) { int i; - int err = 0; - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - return -EAFNOSUPPORT; - spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/ - if (afinfo->tmpl_sort) - err = afinfo->tmpl_sort(dst, src, n); + if (family == AF_INET6) + __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_tmpl_sort_cmp, 5); else for (i = 0; i < n; i++) dst[i] = src[i]; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); - rcu_read_unlock(); - return err; } -EXPORT_SYMBOL(xfrm_tmpl_sort); -int +void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { int i; - int err = 0; - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - struct net *net = xs_net(*src); - - if (!afinfo) - return -EAFNOSUPPORT; - spin_lock_bh(&net->xfrm.xfrm_state_lock); - if (afinfo->state_sort) - err = afinfo->state_sort(dst, src, n); + if (family == AF_INET6) + __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_state_sort_cmp, 6); else for (i = 0; i < n; i++) dst[i] = src[i]; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); - rcu_read_unlock(); - return err; } -EXPORT_SYMBOL(xfrm_state_sort); #endif /* Silly enough, but I'm lazy to build resolution list */ @@ -2195,38 +2405,49 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -int xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); + struct crypto_aead *aead; + u32 blksize, net_adj = 0; + + if (x->km.state != XFRM_STATE_VALID || + !type || type->proto != IPPROTO_ESP) + return mtu - x->props.header_len; + + aead = x->data; + blksize = ALIGN(crypto_aead_blocksize(aead), 4); - if (x->km.state == XFRM_STATE_VALID && - type && type->get_mtu) - return type->get_mtu(x, mtu); + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + case XFRM_MODE_BEET: + if (x->props.family == AF_INET) + net_adj = sizeof(struct iphdr); + else if (x->props.family == AF_INET6) + net_adj = sizeof(struct ipv6hdr); + break; + case XFRM_MODE_TUNNEL: + break; + default: + WARN_ON_ONCE(1); + break; + } - return mtu - x->props.header_len; + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { - const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; int family = x->props.family; int err; - err = -EAFNOSUPPORT; - afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - goto error; - - err = 0; - if (afinfo->init_flags) - err = afinfo->init_flags(x); - - rcu_read_unlock(); - - if (err) - goto error; + if (family == AF_INET && + xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) + x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; |