From 0e78a87306a6f55b1c7bbafad1de62c3975953ca Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 3 May 2017 08:44:27 +0200 Subject: esp4: Fix udpencap for local TCP packets. Locally generated TCP packets are usually cloned, so we do skb_cow_data() on these packets. After that we need to reload the pointer to the esp header. On udpencap this header has an offset from skb_transport_header, so take this offset into account. Fixes: 67d349ed603 ("net/esp4: Fix invalid esph pointer crash") Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output") Reported-by: Don Bowman Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..93322f895eab 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); -- cgit v1.2.3 From 9b3eb54106cf6acd03f07cf0ab01c13676a226c2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 3 May 2017 16:43:19 +0200 Subject: xfrm: fix stack access out of bounds with CONFIG_XFRM_SUB_POLICY When CONFIG_XFRM_SUB_POLICY=y, xfrm_dst stores a copy of the flowi for that dst. Unfortunately, the code that allocates and fills this copy doesn't care about what type of flowi (flowi, flowi4, flowi6) gets passed. In multiple code paths (from raw_sendmsg, from TCP when replying to a FIN, in vxlan, geneve, and gre), the flowi that gets passed to xfrm is actually an on-stack flowi4, so we end up reading stuff from the stack past the end of the flowi4 struct. Since xfrm_dst->origin isn't used anywhere following commit ca116922afa8 ("xfrm: Eliminate "fl" and "pol" args to xfrm_bundle_ok()."), just get rid of it. xfrm_dst->partner isn't used either, so get rid of that too.
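For illustration, a minimal userspace sketch of the stack out-of-bounds read described above; the flowi4_stub/flowi6_stub/flowi_stub types below are simplified stand-ins for the kernel's flowi types, not their real layouts:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-ins: the union (flowi) is larger than its IPv4 member. */
struct flowi4_stub { unsigned int saddr, daddr; unsigned short sport, dport; };
struct flowi6_stub { unsigned int saddr[4], daddr[4]; unsigned short sport, dport; };
union flowi_stub { struct flowi4_stub ip4; struct flowi6_stub ip6; };

int main(void)
{
	struct flowi4_stub fl4 = { 0 };		/* on-stack, only sizeof(fl4) bytes */
	union flowi_stub *copy = malloc(sizeof(*copy));

	if (!copy)
		return 1;
	/* Copying sizeof(union) bytes from the smaller on-stack object reads
	 * past its end -- the same pattern the commit removes by dropping
	 * xfrm_dst->origin instead of copying the caller's flowi. */
	memcpy(copy, &fl4, sizeof(*copy));
	printf("copied %zu bytes from a %zu byte object\n",
	       sizeof(*copy), sizeof(fl4));
	free(copy);
	return 0;
}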
Fixes: 9d6ec938019c ("ipv4: Use flowi4 in public route lookup interfaces.") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 10 ---------- net/xfrm/xfrm_policy.c | 47 ----------------------------------------------- 2 files changed, 57 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6793a30c66b1..7e7e2b0d2915 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -979,10 +979,6 @@ struct xfrm_dst { struct flow_cache_object flo; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; -#ifdef CONFIG_XFRM_SUB_POLICY - struct flowi *origin; - struct xfrm_selector *partner; -#endif u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; @@ -998,12 +994,6 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); -#ifdef CONFIG_XFRM_SUB_POLICY - kfree(xdst->origin); - xdst->origin = NULL; - kfree(xdst->partner); - xdst->partner = NULL; -#endif } #endif diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b00a1d5a7f52..ed4e52d95172 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1797,43 +1797,6 @@ free_dst: goto out; } -#ifdef CONFIG_XFRM_SUB_POLICY -static int xfrm_dst_alloc_copy(void **target, const void *src, int size) -{ - if (!*target) { - *target = kmalloc(size, GFP_ATOMIC); - if (!*target) - return -ENOMEM; - } - - memcpy(*target, src, size); - return 0; -} -#endif - -static int xfrm_dst_update_parent(struct dst_entry *dst, - const struct xfrm_selector *sel) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->partner), - sel, sizeof(*sel)); -#else - return 0; -#endif -} - -static int xfrm_dst_update_origin(struct dst_entry *dst, - const struct flowi *fl) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); -#else - return 0; -#endif -} - static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) @@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; - if (num_pols > 1) - err = xfrm_dst_update_parent(dst, &pols[1]->selector); - else - err = xfrm_dst_update_origin(dst, fl); - if (unlikely(err)) { - dst_free(dst); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - return ERR_PTR(err); - } - xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); -- cgit v1.2.3 From d90c902449a7561f1b1d58ba5a0d11728ce8b0b2 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 5 May 2017 07:40:42 +0200 Subject: af_key: Fix slab-out-of-bounds in pfkey_compile_policy. The sadb_x_sec_len is stored in the unit 'byte divided by eight'. So we have to multiply this value by eight before we can do size checks. Otherwise we may get a slab-out-of-bounds when we memcpy the user sec_ctx. 
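As a rough sketch of the unit handling the message describes (the variable names and values below are hypothetical, not taken from af_key.c): both length fields are carried in 8-byte units, so a bounds check has to convert them to bytes before comparing against the remaining message length.

#include <stdio.h>

int main(void)
{
	unsigned int len = 20;		/* bytes left in the message (hypothetical) */
	unsigned int policy_len = 2;	/* sadb_x_policy_len, in 8-byte units */
	unsigned int sec_len = 3;	/* sadb_x_sec_len, also in 8-byte units */

	/* Broken check: mixes bytes (len) with 8-byte units (sec_len),
	 * so a too-short buffer slips through. */
	int bad = (len < policy_len * 8 + sec_len);
	/* Fixed check: convert sec_len to bytes as well, so the short
	 * buffer is rejected before any copy. */
	int good = (len < policy_len * 8 + sec_len * 8);

	printf("buggy check rejects: %d, fixed check rejects: %d\n", bad, good);
	return 0;
}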
Fixes: df71837d502 ("[LSM-IPSec]: Security association restriction.") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index c1950bb14735..512dc43d0ce6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + - sec_ctx->sadb_x_sec_len) { + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } -- cgit v1.2.3 From 2c1497bbc8fdee897341ab48ee9c9209b421b8c0 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Mon, 8 May 2017 10:30:18 +0300 Subject: xfrm: Fix NETDEV_DOWN with IPSec offload Upon NETDEV_DOWN event, all xfrm_state objects which are bound to the device are flushed. The condition for this is wrong, though, testing dev->hw_features instead of dev->features. If a device has non-user-modifiable NETIF_F_HW_ESP, then its xfrm_state objects are not flushed, causing a crash later on after the device is deleted. Check dev->features instead of dev->hw_features. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8ec8a3fcf8d4..574e6f32f94f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -170,7 +170,7 @@ static int xfrm_dev_feat_change(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->hw_features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_garbage_collect(dev_net(dev)); -- cgit v1.2.3 From 3c5ab3f395d66a9e4e937fcfdf6ebc63894f028b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 29 Apr 2017 20:33:09 +0300 Subject: ipvs: SNAT packet replies only for NATed connections We do not check if a packet from the real server is for a NAT connection before performing SNAT. This causes problems for setups that use DR/TUN and allow local clients to access the real server directly, for example: - a local client in the director creates an IPVS-DR/TUN connection CIP->VIP and the request packets are routed to RIP. Talks are finished but the IPVS connection has not expired yet. - a second local client creates a non-IPVS connection CIP->RIP with the same reply tuple RIP->CIP, and when replies are received on LOCAL_IN we wrongly assign them to the first client connection because RIP->CIP matches the reply direction. As a result, IPVS SNATs replies for non-IPVS connections. The problem is more visible to local UDP clients, but in rare cases it can also happen for TCP or remote clients when the real server sends the reply traffic via the director. So, it is better to be more precise about the reply traffic: as replies are not expected for DR/TUN connections, it is better not to touch them.
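A compressed sketch of the guard this patch adds (the enum values and helper below are illustrative, not the ip_vs definitions): replies are only candidates for SNAT when the connection's forwarding method is masquerading; DR and TUN connections are ignored.

#include <stdio.h>

/* Illustrative forwarding-method values; the real flags live in ip_vs.h. */
enum fwd_method { FWD_MASQ, FWD_LOCALNODE, FWD_TUNNEL, FWD_DROUTE };

/* Only masqueraded (NAT) connections may have their replies SNATed. */
static int reply_needs_snat(enum fwd_method m)
{
	return m == FWD_MASQ;
}

int main(void)
{
	printf("NAT conn: %d, DR conn: %d, TUN conn: %d\n",
	       reply_needs_snat(FWD_MASQ),
	       reply_needs_snat(FWD_DROUTE),
	       reply_needs_snat(FWD_TUNNEL));
	return 0;
}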
Reported-by: Nick Moriarty Tested-by: Nick Moriarty Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d2d7bdf1d510..ad99c1ceea6f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -849,10 +849,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, { unsigned int verdict = NF_DROP; - if (IP_VS_FWD_METHOD(cp) != 0) { - pr_err("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { @@ -886,6 +884,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); + +ignore_cp: verdict = NF_ACCEPT; out: @@ -1385,8 +1385,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in */ cp = pp->conn_out_get(ipvs, af, skb, &iph); - if (likely(cp)) + if (likely(cp)) { + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; return handle_response(af, skb, pd, cp, &iph, hooknum); + } /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { @@ -1444,9 +1447,15 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in } } } + +out: IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; + +ignore_cp: + __ip_vs_conn_put(cp); + goto out; } /* -- cgit v1.2.3 From a2b7cbdd2559aff06cebc28a7150f81c307a90d3 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 19 Apr 2017 11:39:20 -0700 Subject: netfilter: ctnetlink: Make some parameters integer to avoid enum mismatch Not all parameters passed to ctnetlink_parse_tuple() and ctnetlink_exp_dump_tuple() match the enum type in the signatures of these functions. Since this is intended, change the argument type to an unsigned integer value.
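A small standalone example of the kind of mismatch being fixed; the enums and function names below are invented for illustration and are not the ctnetlink definitions. A parameter declared with one enum type but called with values from another enum triggers enum-conversion warnings, while a plain unsigned integer accepts both callers cleanly.

#include <stdio.h>

enum tuple_attr  { TUPLE_ORIG = 1, TUPLE_REPLY = 2 };
enum expect_attr { EXPECT_TUPLE = 1, EXPECT_MASK = 2 };

/* Before: typed with one enum, but callers also pass the other enum. */
static void parse_tuple_enum(enum tuple_attr type)
{
	printf("enum-typed parameter got %d\n", (int)type);
}

/* After: a plain unsigned integer, matching how the value is actually used. */
static void parse_tuple_u32(unsigned int type)
{
	printf("integer parameter got %u\n", type);
}

int main(void)
{
	parse_tuple_enum((enum tuple_attr)EXPECT_MASK); /* needs a cast to silence the mismatch */
	parse_tuple_u32(EXPECT_MASK);                   /* no mismatch: any attribute enum fits */
	return 0;
}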
Signed-off-by: Matthias Kaehlcke Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dcf561b5c97a..fa752626029e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1007,9 +1007,8 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { static int ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num, - struct nf_conntrack_zone *zone) + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -2447,7 +2446,7 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = { static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - enum ctattr_expect type) + u32 type) { struct nlattr *nest_parms; -- cgit v1.2.3 From d110a3942aca78d14929bc648aeb83ee0b245a61 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 6 May 2017 20:28:02 +0800 Subject: netfilter: don't setup nat info for confirmed ct We cannot setup nat info if the ct has been confirmed already, else, different cpu may race to handle the same ct. In extreme situation, we may hit the "BUG_ON(nf_nat_initialized(ct, maniptype))" in the nf_nat_setup_info. Also running the following commands will easily hit NF_CT_ASSERT in nf_conntrack_alter_reply: # nft flush ruleset # ping -c 2 -W 1 1.1.1.111 & # nft add table t # nft add chain t c {type nat hook postrouting priority 0 \;} # nft add rule t c snat to 4.5.6.7 WARNING: CPU: 1 PID: 10065 at net/netfilter/nf_conntrack_core.c:1472 nf_conntrack_alter_reply+0x9a/0x1a0 [nf_conntrack] [...] Call Trace: nf_nat_setup_info+0xad/0x840 [nf_nat] ? deactivate_slab+0x65d/0x6c0 nft_nat_eval+0xcd/0x100 [nft_nat] nft_do_chain+0xff/0x5d0 [nf_tables] ? mark_held_locks+0x6f/0xa0 ? __local_bh_enable_ip+0x70/0xa0 ? trace_hardirqs_on_caller+0x11f/0x190 ? ipt_do_table+0x310/0x610 ? trace_hardirqs_on+0xd/0x10 ? __local_bh_enable_ip+0x70/0xa0 ? ipt_do_table+0x32b/0x610 ? __lock_acquire+0x2ac/0x1580 ? ipt_do_table+0x32b/0x610 nft_nat_do_chain+0x65/0x80 [nft_chain_nat_ipv4] nf_nat_ipv4_fn+0x1ae/0x240 [nf_nat_ipv4] nf_nat_ipv4_out+0x4a/0xf0 [nf_nat_ipv4] nft_nat_ipv4_out+0x15/0x20 [nft_chain_nat_ipv4] nf_hook_slow+0x2c/0xf0 ip_output+0x154/0x270 So for the confirmed ct, just ignore it and return NF_ACCEPT. Fixes: 9a08ecfe74d7 ("netfilter: don't attach a nat extension by default") Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b48d6b5aae8a..ef0be325a0c6 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -409,6 +409,10 @@ nf_nat_setup_info(struct nf_conn *ct, { struct nf_conntrack_tuple curr_tuple, new_tuple; + /* Can't setup nat info for confirmed ct. 
*/ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); -- cgit v1.2.3 From d91fc59cd77c719f33eda65c194ad8f95a055190 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 7 May 2017 22:01:55 +0800 Subject: netfilter: introduce nf_conntrack_helper_put helper function And convert module_put invocation to nf_conntrack_helper_put, this is prepared for the followup patch, which will add a refcnt for cthelper, so we can reject the deleting request when cthelper is in use. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 6 ++++++ net/netfilter/nft_ct.c | 4 ++-- net/netfilter/xt_CT.c | 6 +++--- net/openvswitch/conntrack.c | 4 ++-- 5 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index e04fa7691e5d..c1c12411103a 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -79,6 +79,8 @@ struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); +void nf_conntrack_helper_put(struct nf_conntrack_helper *helper); + void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3a60efa7799b..e17006b6e434 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -181,6 +181,12 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); +void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) +{ + module_put(helper->me); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); + struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, struct nf_conntrack_helper *helper, gfp_t gfp) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a34ceb38fc55..1678e9e75e8e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -826,9 +826,9 @@ static void nft_ct_helper_obj_destroy(struct nft_object *obj) struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) - module_put(priv->helper4->me); + nf_conntrack_helper_put(priv->helper4); if (priv->helper6) - module_put(priv->helper6->me); + nf_conntrack_helper_put(priv->helper6); } static void nft_ct_helper_obj_eval(struct nft_object *obj, diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index bb7ad82dcd56..623ef37de886 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); if (help == NULL) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -263,7 +263,7 @@ out: err4: help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); err3: nf_ct_tmpl_free(ct); err2: @@ -346,7 +346,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, if (ct) { help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); nf_ct_netns_put(par->net, par->family); diff --git 
a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index bf602e33c40a..08679ebb3068 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1123,7 +1123,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); if (!help) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -1584,7 +1584,7 @@ void ovs_ct_free_action(const struct nlattr *a) static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) { if (ct_info->helper) - module_put(ct_info->helper->me); + nf_conntrack_helper_put(ct_info->helper); if (ct_info->ct) nf_ct_tmpl_free(ct_info->ct); } -- cgit v1.2.3 From 9338d7b4418e9996a7642867d8f6b482a6040ed6 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 7 May 2017 22:01:56 +0800 Subject: netfilter: nfnl_cthelper: reject del request if helper obj is in use We can still delete the ct helper even if it is in use, this will cause a use-after-free error. In more detail, I mean: # nfct helper add ssdp inet udp # iptables -t raw -A OUTPUT -p udp -j CT --helper ssdp # nfct helper delete ssdp //--> oops, succeed! BUG: unable to handle kernel paging request at 000026ca IP: 0x26ca [...] Call Trace: ? ipv4_helper+0x62/0x80 [nf_conntrack_ipv4] nf_hook_slow+0x21/0xb0 ip_output+0xe9/0x100 ? ip_fragment.constprop.54+0xc0/0xc0 ip_local_out+0x33/0x40 ip_send_skb+0x16/0x80 udp_send_skb+0x84/0x240 udp_sendmsg+0x35d/0xa50 So add reference count to fix this issue, if ct helper is used by others, reject the delete request. Apply this patch: # nfct helper delete ssdp nfct v1.4.3: netlink error: Device or resource busy Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 6 ++++++ net/netfilter/nfnetlink_cthelper.c | 17 +++++++++++------ 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index c1c12411103a..c519bb5b5bb8 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -9,6 +9,7 @@ #ifndef _NF_CONNTRACK_HELPER_H #define _NF_CONNTRACK_HELPER_H +#include #include #include #include @@ -26,6 +27,7 @@ struct nf_conntrack_helper { struct hlist_node hnode; /* Internal use. 
*/ char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */ + refcount_t refcnt; struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e17006b6e434..7f6100ca63be 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -174,6 +174,10 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) #endif if (h != NULL && !try_module_get(h->me)) h = NULL; + if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { + module_put(h->me); + h = NULL; + } rcu_read_unlock(); @@ -183,6 +187,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { + refcount_dec(&helper->refcnt); module_put(helper->me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); @@ -423,6 +428,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) } } } + refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 950bf6eadc65..be678a323598 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -686,6 +686,7 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple_set = true; } + ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; @@ -699,16 +700,20 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - found = true; - nf_conntrack_helper_unregister(cur); - kfree(cur->expect_policy); + if (refcount_dec_if_one(&cur->refcnt)) { + found = true; + nf_conntrack_helper_unregister(cur); + kfree(cur->expect_policy); - list_del(&nlcth->list); - kfree(nlcth); + list_del(&nlcth->list); + kfree(nlcth); + } else { + ret = -EBUSY; + } } /* Make sure we return success if we flush and there is no helpers */ - return (found || j == 0) ? 0 : -ENOENT; + return (found || j == 0) ? 0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { -- cgit v1.2.3 From 324318f0248c31be8a08984146e7e4dd7cdd091d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 9 May 2017 16:17:37 -0400 Subject: netfilter: xtables: zero padding in data_to_user When looking up an iptables rule, the iptables binary compares the aligned match and target data (XT_ALIGN). In some cases this can exceed the actual data size to include padding bytes. Before commit f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") the malloc()ed bytes were overwritten by the kernel with kzalloced contents, zeroing the padding and making the comparison succeed. After this patch, the kernel copies and clears only data, leaving the padding bytes undefined. Extend the clear operation from data size to aligned data size to include the padding bytes, if any. 
Padding bytes can be observed in both match and target, and the bug triggered, by issuing a rule with match icmp and target ACCEPT: iptables -t mangle -A INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT iptables -t mangle -D INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT Fixes: f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") Reported-by: Paul Moore Reported-by: Richard Guy Briggs Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/bridge/netfilter/ebtables.c | 9 ++++++--- net/netfilter/x_tables.c | 9 ++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index be378cf47fcc..b3044c2c62cb 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -294,7 +294,7 @@ int xt_match_to_user(const struct xt_entry_match *m, int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size); + int usersize, int size, int aligned_size); void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9ec0c9f908fa..9c6e619f452b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name, strlcpy(name, _name, sizeof(name)); if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || - xt_data_to_user(um + entrysize, data, usersize, datasize)) + xt_data_to_user(um + entrysize, data, usersize, datasize, + XT_ALIGN(datasize))) return -EFAULT; return 0; @@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, if (match->compat_to_user(cm->data, m->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, m->data, match->usersize, msize)) + if (xt_data_to_user(cm->data, m->data, match->usersize, msize, + COMPAT_XT_ALIGN(msize))) return -EFAULT; } @@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t, if (target->compat_to_user(cm->data, t->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, t->data, target->usersize, tsize)) + if (xt_data_to_user(cm->data, t->data, target->usersize, tsize, + COMPAT_XT_ALIGN(tsize))) return -EFAULT; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8876b7da6884..d17769599c10 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -283,12 +283,13 @@ static int xt_obj_to_user(u16 __user *psize, u16 size, &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size) + int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; - if (usersize != size && clear_user(dst + usersize, size - usersize)) + if (usersize != aligned_size && + clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; @@ -298,7 +299,9 @@ EXPORT_SYMBOL_GPL(xt_data_to_user); #define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size) + C_SIZE ? : K->u.kernel.TYPE->TYPE##size, \ + C_SIZE ? 
COMPAT_XT_ALIGN(C_SIZE) : \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) -- cgit v1.2.3 From 87e94dbc210a720a34be5c1174faee5c84be963e Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 11 May 2017 18:56:38 +0200 Subject: netfilter: synproxy: fix conntrackd interaction This patch fixes the creation of connection tracking entry from netlink when synproxy is used. It was missing the addition of the synproxy extension. This was causing kernel crashes when a conntrack entry created by conntrackd was used after the switch of traffic from active node to the passive node. Signed-off-by: Eric Leblond Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fa752626029e..9799a50bc604 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NF_NAT_NEEDED #include #include @@ -1827,6 +1829,8 @@ ctnetlink_create_conntrack(struct net *net, nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); nf_ct_labels_ext_add(ct); + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; -- cgit v1.2.3 From fa803605eef39372e53d7813002d73a3fcf10c88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 14 May 2017 21:35:22 +0800 Subject: netfilter: nf_tables: can't assume lock is acquired when dumping set elems When dumping the elements related to a specified set, we may invoke the nf_tables_dump_set with the NFNL_SUBSYS_NFTABLES lock not acquired. So we should use the proper rcu operation to avoid race condition, just like other nft dump operations. 
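The diff below also introduces a small per-dump context that is allocated when the dump starts and released from the netlink .done callback. A minimal userspace sketch of that allocate/use/release ownership pattern, with stand-in names rather than the real netlink_callback API:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the per-dump state (the real one holds a set pointer and ctx). */
struct set_dump_ctx { int set_id; };

/* Called repeatedly while the dump is in progress. */
static int dump_one(void *data)
{
	struct set_dump_ctx *ctx = data;
	printf("dumping elements of set %d\n", ctx->set_id);
	return 0;
}

/* Called once by the core when the dump finishes: release the context. */
static int dump_done(void *data)
{
	free(data);
	return 0;
}

int main(void)
{
	struct set_dump_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return 1;	/* -ENOMEM in the kernel code */
	ctx->set_id = 7;
	dump_one(ctx);		/* netlink would call this once per skb */
	dump_done(ctx);		/* and this exactly once at the end */
	return 0;
}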
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 78 +++++++++++++++++++++++++++++++------------ net/netfilter/nft_set_hash.c | 2 +- 2 files changed, 57 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 559225029740..5f4a4d48b871 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3367,35 +3367,50 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, return nf_tables_fill_setelem(args->skb, set, elem); } +struct nft_set_dump_ctx { + const struct nft_set *set; + struct nft_ctx ctx; +}; + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); - u8 genmask = nft_genmask_cur(net); + struct nft_af_info *afi; + struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; - struct nft_ctx ctx; - struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; + bool set_found = false; struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; - int event, err; + int event; - err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, - NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy, - NULL); - if (err < 0) - return err; + rcu_read_lock(); + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (afi != dump_ctx->ctx.afi) + continue; - err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla, genmask); - if (err < 0) - return err; + list_for_each_entry_rcu(table, &afi->tables, list) { + if (table != dump_ctx->ctx.table) + continue; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); + list_for_each_entry_rcu(set, &table->sets, list) { + if (set == dump_ctx->set) { + set_found = true; + break; + } + } + break; + } + break; + } + + if (!set_found) { + rcu_read_unlock(); + return -ENOENT; + } event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; @@ -3407,11 +3422,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx.afi->family; + nfmsg->nfgen_family = afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; @@ -3422,12 +3437,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.cb = cb; args.skb = skb; - args.iter.genmask = nft_genmask_cur(ctx.net); + args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - set->ops->walk(&ctx, set, &args.iter); + set->ops->walk(&dump_ctx->ctx, set, &args.iter); + rcu_read_unlock(); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -3441,9 +3457,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; nla_put_failure: + rcu_read_unlock(); return -ENOSPC; } +static int nf_tables_dump_set_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + static int nf_tables_getsetelem(struct net *net, struct 
sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -3465,7 +3488,18 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; return netlink_dump_start(nlsk, skb, nlh, &c); } return -EOPNOTSUPP; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 8ec086b6b56b..3d3a6df4ce70 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -222,7 +222,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; int err; - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC); iter->err = err; if (err) return; -- cgit v1.2.3 From 71df14b0ce094be46d105b5a3ededd83b8e779a0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 May 2017 11:17:29 +0100 Subject: netfilter: nf_tables: missing sanitization in data from userspace Do not assume userspace always sends us NFT_DATA_VALUE for bitwise and cmp expressions. Although NFT_DATA_VERDICT does not make any sense, it is still possible to handcraft a netlink message using this incorrect data type. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 19 ++++++++++++++----- net/netfilter/nft_cmp.c | 12 ++++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 877d9acd91ef..96bd4f325b0f 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -83,17 +83,26 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.len != priv->len) - return -EINVAL; + if (d1.len != priv->len) { + err = -EINVAL; + goto err1; + } err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, tb[NFTA_BITWISE_XOR]); if (err < 0) - return err; - if (d2.len != priv->len) - return -EINVAL; + goto err1; + if (d2.len != priv->len) { + err = -EINVAL; + goto err2; + } return 0; +err2: + nft_data_uninit(&priv->xor, d2.type); +err1: + nft_data_uninit(&priv->mask, d1.type); + return err; } static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 2b96effeadc1..8c9d0fb19118 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -201,10 +201,18 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) if (err < 0) return ERR_PTR(err); + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) return &nft_cmp_fast_ops; - else - return &nft_cmp_ops; + + return &nft_cmp_ops; +err1: + nft_data_uninit(&data, desc.type); + return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { -- cgit v1.2.3 From 591054469b3eef34bc097c30fae8ededddf8d796 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 May 2017 11:17:34 +0100 Subject: netfilter: nf_tables: revisit chain/object refcounting from elements Andreas reports that the following incremental update using our commit protocol doesn't work. 
# nft -f incremental-update.nft delete element ip filter client_to_any { 10.180.86.22 : goto CIn_1 } delete chain ip filter CIn_1 ... Error: Could not process rule: Device or resource busy The existing code is not well-integrated into the commit phase protocol, since element deletions do not result in refcount decrement from the preparation phase. This results in bogus EBUSY errors like the one above. Two new functions come with this patch: * nft_set_elem_activate() function is used from the abort path, to restore the set element refcounting on objects that occurred from the preparation phase. * nft_set_elem_deactivate() that is called from nft_del_setelem() to decrement set element refcounting on objects from the preparation phase in the commit protocol. The nft_data_uninit() has been renamed to nft_data_release() since this function does not uninitialize any data store in the data register, instead just releases the references to objects. Moreover, a new function nft_data_hold() has been introduced to be used from nft_set_elem_activate(). Reported-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 82 ++++++++++++++++++++++++++++++++++----- net/netfilter/nft_bitwise.c | 4 +- net/netfilter/nft_cmp.c | 2 +- net/netfilter/nft_immediate.c | 5 ++- net/netfilter/nft_range.c | 4 +- 6 files changed, 81 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 028faec8fc27..8a8bab8d7b15 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -176,7 +176,7 @@ struct nft_data_desc { int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, unsigned int size, struct nft_data_desc *desc, const struct nlattr *nla); -void nft_data_uninit(const struct nft_data *data, enum nft_data_types type); +void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5f4a4d48b871..da314be0c048 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3627,9 +3627,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); - nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); + nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - nft_data_uninit(nft_set_ext_data(ext), set->dtype); + nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) @@ -3638,6 +3638,18 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Only called from commit path, nft_set_elem_deactivate() already deals with + * the refcounting from the preparation phase. 
+ */ +static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + kfree(elem); +} + static int nft_setelem_parse_flags(const struct nft_set *set, const struct nlattr *attr, u32 *flags) { @@ -3849,9 +3861,9 @@ err4: kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_uninit(&data, d2.type); + nft_data_release(&data, d2.type); err2: - nft_data_uninit(&elem.key.val, d1.type); + nft_data_release(&elem.key.val, d1.type); err1: return err; } @@ -3896,6 +3908,53 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } +/** + * nft_data_hold - hold a nft_data item + * + * @data: struct nft_data to release + * @type: type of data + * + * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded, + * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and + * NFT_GOTO verdicts. This function must be called on active data objects + * from the second phase of the commit protocol. + */ +static void nft_data_hold(const struct nft_data *data, enum nft_data_types type) +{ + if (type == NFT_DATA_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + data->verdict.chain->use++; + break; + } + } +} + +static void nft_set_elem_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_hold(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use++; +} + +static void nft_set_elem_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_release(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use--; +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3961,6 +4020,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, kfree(elem.priv); elem.priv = priv; + nft_set_elem_deactivate(ctx->net, set, &elem); + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -3970,7 +4031,7 @@ err4: err3: kfree(elem.priv); err2: - nft_data_uninit(&elem.key.val, desc.type); + nft_data_release(&elem.key.val, desc.type); err1: return err; } @@ -4777,8 +4838,8 @@ static void nf_tables_commit_release(struct nft_trans *trans) nft_set_destroy(nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv, true); + nf_tables_set_elem_destroy(nft_trans_elem_set(trans), + nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: nft_obj_destroy(nft_trans_obj(trans)); @@ -5013,6 +5074,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; + nft_set_elem_activate(net, te->set, &te->elem); te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; @@ -5498,7 +5560,7 @@ int nft_data_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_data_init); /** - * nft_data_uninit - release a nft_data item + * nft_data_release - release a nft_data 
item * * @data: struct nft_data to release * @type: type of data @@ -5506,7 +5568,7 @@ EXPORT_SYMBOL_GPL(nft_data_init); * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, * all others need to be released by calling this function. */ -void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +void nft_data_release(const struct nft_data *data, enum nft_data_types type) { if (type < NFT_DATA_VERDICT) return; @@ -5517,7 +5579,7 @@ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_data_uninit); +EXPORT_SYMBOL_GPL(nft_data_release); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 96bd4f325b0f..fff8073e2a56 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -99,9 +99,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return 0; err2: - nft_data_uninit(&priv->xor, d2.type); + nft_data_release(&priv->xor, d2.type); err1: - nft_data_uninit(&priv->mask, d1.type); + nft_data_release(&priv->mask, d1.type); return err; } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 8c9d0fb19118..c2945eb3397c 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -211,7 +211,7 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return &nft_cmp_ops; err1: - nft_data_uninit(&data, desc.type); + nft_data_release(&data, desc.type); return ERR_PTR(-EINVAL); } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 728baf88295a..4717d7796927 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -65,7 +65,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx, return 0; err1: - nft_data_uninit(&priv->data, desc.type); + nft_data_release(&priv->data, desc.type); return err; } @@ -73,7 +73,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); + + return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 9edc74eedc10..cedb96c3619f 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -102,9 +102,9 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr priv->len = desc_from.len; return 0; err2: - nft_data_uninit(&priv->data_to, desc_to.type); + nft_data_release(&priv->data_to, desc_to.type); err1: - nft_data_uninit(&priv->data_from, desc_from.type); + nft_data_release(&priv->data_from, desc_from.type); return err; } -- cgit v1.2.3 From c953d63548207a085abcb12a15fefc8a11ffdf0a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 16 May 2017 09:30:18 +0800 Subject: ebtables: arpreply: Add the standard target sanity check The info->target comes from userspace and it would be used directly. So we need to add the sanity check to make sure it is a valid standard target, although the ebtables tool has already checked it. Kernel needs to validate anything coming from userspace. If the target is set as an evil value, it would break the ebtables and cause a panic. Because the non-standard target is treated as one offset. 
Now add one helper function, ebt_invalid_target; the macro INVALID_TARGET will be replaced by it later. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 5 +++++ net/bridge/netfilter/ebt_arpreply.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a30efb437e6d..e0cbf17af780 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -125,4 +125,9 @@ extern unsigned int ebt_do_table(struct sk_buff *skb, /* True if the target is not a standard target */ #define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) +static inline bool ebt_invalid_target(int target) +{ + return (target < -NUM_STANDARD_TARGETS || target >= 0); +} + #endif diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 5929309beaa1..db85230e49c3 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return -EINVAL; + if (ebt_invalid_target(info->target)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From bafbb9c73241760023d8981191ddd30bb1c6dbac Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 15 May 2017 17:05:47 -0400 Subject: tcp: eliminate negative reordering in tcp_clean_rtx_queue tcp_ack() can call tcp_fragment(), which may decrease the value of tp->fackets_out when MSS changes. When prior_fackets is larger than tp->fackets_out, tcp_clean_rtx_queue() can invoke tcp_update_reordering() with negative values. This results in absurd tp->reordering values higher than sysctl_tcp_max_reordering. Note that tcp_update_reordering indeed sets tp->reordering to min(sysctl_tcp_max_reordering, metric), but because the comparison is signed, a negative metric always wins. Fixes: c7caf8d3ed7a ("[TCP]: Fix reord detection due to snd_una covered holes") Reported-by: Rebecca Isaacs Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06e2dbc2b4a2..174d4376baa5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3190,7 +3190,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : -- cgit v1.2.3 From bcfc7d33110b0f33069d74138eeb7ca9acbb3c85 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 16 May 2017 10:14:44 +1200 Subject: ipmr: vrf: Find VIFs using the actual device The skb->dev that is passed into ip_mr_input is the loX device for VRFs. When we look up a vif for this dev, none is found, as we do not create vifs for loopbacks. Instead, look up a vif for the actual device that the packet was received on, e.g. the VLAN. Signed-off-by: Thomas Winter cc: David Ahern cc: Nikolay Aleksandrov cc: roopa Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/ipmr.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a02d52ed50e..551de4d023a8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1980,6 +1980,20 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; + struct net_device *dev; + + /* skb->dev passed in is the loX master dev for vrfs. + * As there are no vifs associated with loopback devices, + * get the proper interface that does have a vif associated with it. + */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -2017,7 +2031,7 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { - int vif = ipmr_find_vif(mrt, skb->dev); + int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, @@ -2037,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - vif = ipmr_find_vif(mrt, skb->dev); + vif = ipmr_find_vif(mrt, dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); -- cgit v1.2.3 From 263eec9b2a82e8697d064709414914b5b10ac538 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 15 May 2017 17:33:37 +0200 Subject: smc: switch to usage of IB_PD_UNSAFE_GLOBAL_RKEY Currently, SMC enables remote access to physical memory when a user has successfully configured and established an SMC-connection until ten minutes after the last SMC connection is closed. Because this is considered a security risk, drivers are supposed to use IB_PD_UNSAFE_GLOBAL_RKEY in such a case. This patch changes the current SMC code to use IB_PD_UNSAFE_GLOBAL_RKEY. This improves user awareness, but does not remove the security risk itself. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_clc.c | 4 ++-- net/smc/smc_core.c | 16 +++------------- net/smc/smc_core.h | 2 +- net/smc/smc_ib.c | 21 ++------------------- net/smc/smc_ib.h | 2 -- 5 files changed, 8 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e41f594a1e1d..03ec058d18df 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -204,7 +204,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); @@ -256,7 +256,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65020e93ff21..3ac09a629ea1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -613,19 +613,8 @@ int smc_rmb_create(struct smc_sock *smc) rmb_desc = NULL; continue; /* if mapping failed, try smaller one */ } - rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - &rmb_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) { - smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - tmp_bufsize, rmb_desc, - DMA_FROM_DEVICE); - kfree(rmb_desc->cpu_addr); - kfree(rmb_desc); - rmb_desc = NULL; - continue; - } + rmb_desc->rkey[SMC_SINGLE_LINK] = + lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey; rmb_desc->used = 1; write_lock_bh(&lgr->rmbs_lock); list_add(&rmb_desc->list, @@ -668,6 +657,7 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && + (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { conn->rtoken_idx = i; return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 27eb38056a27..b013cb43a327 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -93,7 +93,7 @@ struct smc_buf_desc { u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; /* mapped address of buffer */ void *cpu_addr; /* virtual address of buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + u32 rkey[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: * rkey provided to peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cb69ab977cd7..b31715505a35 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -37,24 +37,6 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system * identifier */ -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr) -{ - int rc; - - if (*mr) - return 0; /* already done */ - - /* obtain unique key - - * next invocation of get_dma_mr returns a different key! 
- */ - *mr = pd->device->get_dma_mr(pd, access_flags); - rc = PTR_ERR_OR_ZERO(*mr); - if (IS_ERR(*mr)) - *mr = NULL; - return rc; -} - static int smc_ib_modify_qp_init(struct smc_link *lnk) { struct ib_qp_attr qp_attr; @@ -210,7 +192,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) { int rc; - lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, + IB_PD_UNSAFE_GLOBAL_RKEY); rc = PTR_ERR_OR_ZERO(lnk->roce_pd); if (IS_ERR(lnk->roce_pd)) lnk->roce_pd = NULL; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 7e1f0e24d177..b567152a526d 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -61,8 +61,6 @@ void smc_ib_dealloc_protection_domain(struct smc_link *lnk); int smc_ib_create_protection_domain(struct smc_link *lnk); void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); -- cgit v1.2.3 From 19a0f7e37c0761a0a1cbf550705a6063c9675223 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 May 2017 09:51:38 +0300 Subject: net/smc: Add warning about remote memory exposure The driver explicitly bypasses APIs to register all memory once a connection is made, and thus allows remote access to memory. Signed-off-by: Christoph Hellwig Signed-off-by: Leon Romanovsky Acked-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/smc/Kconfig b/net/smc/Kconfig index c717ef0896aa..33954852f3f8 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -8,6 +8,10 @@ config SMC The Linux implementation of the SMC-R solution is designed as a separate socket family SMC. + Warning: SMC will expose all memory for remote reads and writes + once a connection is established. Don't enable this option except + for tightly controlled lab environment. + Select this option if you want to run SMC socket applications config SMC_DIAG -- cgit v1.2.3 From f6c5775ff0bfa62b072face6bf1d40f659f194b2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 15 May 2017 23:19:17 -0700 Subject: net: Improve handling of failures on link and route dumps In general, rtnetlink dumps do not anticipate failure to dump a single object (e.g., link or route) on a single pass. As both route and link objects have grown via more attributes, that is no longer a given. netlink dumps can handle a failure if the dump function returns an error; specifically, netlink_dump adds the return code to the response if it is <= 0 so userspace is notified of the failure. The missing piece is the rtnetlink dump functions returning the error. Fix route and link dump functions to return the error if no object has been added to the skb (detected by skb->len being 0). IPv6 route dumps (rt6_dump_route) already return the error; this patch updates IPv4 and link dumps. Other dump functions may need to be adjusted as well. Reported-by: Jan Moskyto Matejka Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 36 ++++++++++++++++++++++++------------ net/ipv4/fib_frontend.c | 15 +++++++++++---- net/ipv4/fib_trie.c | 26 ++++++++++++++------------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d7f82c3450b1..49a279a7cc15 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1627,13 +1627,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, 0, flags, ext_filter_mask); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: @@ -1641,10 +1641,12 @@ cont: } } out: + err = skb->len; +out_err: cb->args[1] = idx; cb->args[0] = h; - return skb->len; + return err; } int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, @@ -3453,8 +3455,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } @@ -3465,16 +3471,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 39bd1edee676..83e3ed258467 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -763,7 +763,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0; + int dumped = 0, err; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) @@ -783,20 +783,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (fib_table_dump(tb, skb, cb) < 0) - goto out; + err = fib_table_dump(tb, skb, cb); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } dumped = 1; next: e++; } } out: + err = skb->len; +out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - return skb->len; + return err; } /* Prepare and feed intra-kernel routing request. 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1201409ba1dc..51182ff2b441 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1983,6 +1983,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + int err; + if (i < s_i) { i++; continue; } @@ -1993,17 +1995,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, - fa->fa_info, NLM_F_MULTI) < 0) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fa->fa_info, NLM_F_MULTI); + if (err < 0) { cb->args[4] = i; - return -1; + return err; } i++; } @@ -2025,10 +2024,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, t_key key = cb->args[3]; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + int err; + + err = fn_trie_dump_leaf(l, tb, skb, cb); + if (err < 0) { cb->args[3] = key; cb->args[2] = count; - return -1; + return err; } ++count; -- cgit v1.2.3 From 5667c86acf021e6dcf02584408b4484a273ac68f Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Sun, 14 May 2017 21:41:55 -0700 Subject: mac80211: strictly check mesh address extension mode The mesh forwarding path checks the address extension mode to fetch the appropriate proxied address and MPP address. The existing condition that checks for the 6-address format is not strict enough, so frames with improper values are processed and invalid entries are added to the MPP table. Fix that by adding a stricter check before processing the packet. Per the IEEE Std 802.11s-2011 spec, Table 7-6g1 lists address extension mode 0x3 as reserved, and Table 9-13 does not specify 0x3 as a valid address field.
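To make the distinction concrete, a minimal sketch (the flag values are the MESH_FLAGS_* constants from linux/ieee80211.h; mesh_ae_is_a5_a6() is a hypothetical helper, not mac80211 code):

    /* MESH_FLAGS_AE_A4 = 0x1, MESH_FLAGS_AE_A5_A6 = 0x2, MESH_FLAGS_AE = 0x3 */
    static bool mesh_ae_is_a5_a6(u8 flags)
    {
            /* A loose test such as (flags & MESH_FLAGS_AE_A5_A6) is also true
             * for the reserved mode 0x3; comparing the whole masked field
             * accepts only the A5/A6 mode 0x2.
             */
            return (flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6;
    }
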
Fixes: 9b395bc3be1c ("mac80211: verify that skb data is present") Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- net/wireless/util.c | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 35f4c7d7a500..1f75280ba26c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2492,7 +2492,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { mpp_addr = hdr->addr3; proxied_addr = mesh_hdr->eaddr1; - } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) { + } else if ((mesh_hdr->flags & MESH_FLAGS_AE) == + MESH_FLAGS_AE_A5_A6) { /* has_a4 already checked in ieee80211_rx_mesh_check */ mpp_addr = hdr->addr4; proxied_addr = mesh_hdr->eaddr2; diff --git a/net/wireless/util.c b/net/wireless/util.c index 7198373e2920..4992f1025c9d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -454,6 +454,8 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) skb_copy_bits(skb, hdrlen, &mesh_flags, 1); + mesh_flags &= MESH_FLAGS_AE; + switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): @@ -469,9 +471,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, iftype != NL80211_IFTYPE_STATION)) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) return -1; - if (mesh_flags & MESH_FLAGS_AE_A5_A6) { + if (mesh_flags == MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_dest, 2 * ETH_ALEN); @@ -487,9 +489,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, ether_addr_equal(tmp.h_source, addr))) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A5_A6) + if (mesh_flags == MESH_FLAGS_AE_A5_A6) return -1; - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_source, ETH_ALEN); -- cgit v1.2.3 From 23d268eb240954e6e78f7cfab04f2b1e79f84489 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 16 May 2017 07:53:43 -0700 Subject: arp: honour gratuitous ARP _replies_ When arp_accept is 1, gratuitous ARPs are supposed to override matching entries irrespective of whether they arrive during locktime. This was implemented in commit 56022a8fdd87 ("ipv4: arp: update neighbour address when a gratuitous arp is received and arp_accept is set") There is a glitch in the patch though. RFC 2002, section 4.6, "ARP, Proxy ARP, and Gratuitous ARP", defines gratuitous ARPs so that they can be either of Request or Reply type. Those Reply gratuitous ARPs can be triggered with standard tooling, for example, arping -A option does just that. This patch fixes the glitch, making both Request and Reply flavours of gratuitous ARPs to behave identically. As per RFC, if gratuitous ARPs are of Reply type, their Target Hardware Address field should also be set to the link-layer address to which this cache entry should be updated. The field is present in ARP over Ethernet but not in IEEE 1394. In this patch, I don't consider any broadcasted ARP replies as gratuitous if the field is not present, to conform the standard. 
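Sketched in C (mirroring the arp_process() hunk below), the resulting test is roughly:

    /* Gratuitous ARP: sender IP equals target IP and the route type is unicast. */
    is_garp = tip == sip && addr_type == RTN_UNICAST;

    /* For replies, additionally require the target hardware address to match
     * the sender hardware address. tha stays NULL on link types whose ARP
     * payload carries no such field, so those broadcasts are not treated as
     * gratuitous.
     */
    if (is_garp && arp->ar_op == htons(ARPOP_REPLY))
            is_garp = tha && !memcmp(tha, sha, dev->addr_len);
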
It's not clear whether there is such a thing for IEEE 1394 as a gratuitous ARP reply; until it's cleared up, we will ignore such broadcasts. Note that they will still update existing ARP cache entries, assuming they arrive out of locktime time interval. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0937b34c27ca..d54345a06f72 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -653,6 +653,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; + unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; @@ -724,6 +725,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) break; #endif default: + tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); @@ -842,8 +844,18 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - addr_type == RTN_UNICAST; + is_garp = tip == sip && addr_type == RTN_UNICAST; + + /* Unsolicited ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && -- cgit v1.2.3 From 77d7123342dcf6442341b67816321d71da8b2b16 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 16 May 2017 08:44:24 -0700 Subject: neighbour: update neigh timestamps iff update is effective It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch neither lladdr nor state, for example if an update arrives in locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with ~1 seconds interval between them. Just before the first request arrives to one of network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. Same happens for the third request. 
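In code, the fix below boils down to guarding the timestamp bump in neigh_update(), roughly (a sketch of the hunk, not a complete function):

    /* Bump the timestamps only when the update actually changes the entry,
     * so that no-op updates cannot keep sliding the locktime window.
     */
    if (new != old || lladdr != neigh->ha) {
            if (new & NUD_CONNECTED)
                    neigh->confirmed = jiffies;
            neigh->updated = jiffies;
    }
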
The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and require a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network, as well as reduces service availability. This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change). In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous APR ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/core/neighbour.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 58b0bcc125b5..d274f81fcc2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; - neigh->updated = jiffies; - /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } + /* Update timestamps only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) { + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + } + if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) -- cgit v1.2.3 From 8b0d3ea555876533b6aa61479335be2c9bdb47e7 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 16 May 2017 14:10:33 -0400 Subject: net: dsa: store CPU port pointer in the tree A dsa_switch_tree instance holds a dsa_switch pointer and a port index to identify the switch port to which the CPU is attached. Now that the DSA layer has a dsa_port structure to hold this data, use it to point the switch CPU port. This patch simply substitutes s/dst->cpu_switch/dst->cpu_dp->ds/ and s/dst->cpu_port/dst->cpu_dp->index/. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 4 ++-- drivers/net/dsa/bcm_sf2.c | 4 ++-- drivers/net/dsa/mv88e6060.c | 2 +- drivers/net/dsa/qca8k.c | 2 +- include/net/dsa.h | 13 ++++++------- net/dsa/dsa2.c | 14 ++++++-------- net/dsa/legacy.c | 10 ++++------ net/dsa/slave.c | 10 +++++----- net/dsa/tag_brcm.c | 2 +- net/dsa/tag_qca.c | 2 +- net/dsa/tag_trailer.c | 2 +- 11 files changed, 30 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index fa0eece21eef..658a12c888a8 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1344,7 +1344,7 @@ EXPORT_SYMBOL(b53_fdb_dump); int b53_br_join(struct dsa_switch *ds, int port, struct net_device *br) { struct b53_device *dev = ds->priv; - s8 cpu_port = ds->dst->cpu_port; + s8 cpu_port = ds->dst->cpu_dp->index; u16 pvlan, reg; unsigned int i; @@ -1390,7 +1390,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br) { struct b53_device *dev = ds->priv; struct b53_vlan *vl = &dev->vlans[0]; - s8 cpu_port = ds->dst->cpu_port; + s8 cpu_port = ds->dst->cpu_dp->index; unsigned int i; u16 pvlan, reg, pvid; diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 2be963252ca5..215d41c1e71f 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -228,7 +228,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, struct phy_device *phy) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - s8 cpu_port = ds->dst[ds->index].cpu_port; + s8 cpu_port = ds->dst->cpu_dp->index; unsigned int i; u32 reg; @@ -832,7 +832,7 @@ static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port, { struct net_device *p = ds->dst[ds->index].master_netdev; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - s8 cpu_port = ds->dst[ds->index].cpu_port; + s8 cpu_port = ds->dst->cpu_dp->index; struct ethtool_wolinfo pwol; p->ethtool_ops->get_wol(p, &pwol); diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 5934b7a4c448..dce7fa57eb55 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -176,7 +176,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) ((p & 0xf) << PORT_VLAN_MAP_DBNUM_SHIFT) | (dsa_is_cpu_port(ds, p) ? ds->enabled_port_mask : - BIT(ds->dst->cpu_port))); + BIT(ds->dst->cpu_dp->index))); /* Port Association Vector: when learning source addresses * of packets, add the address to the address database using diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index a4fd4ccf7b67..942b9ac7f92a 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -507,7 +507,7 @@ qca8k_setup(struct dsa_switch *ds) pr_warn("regmap initialization failed"); /* Initialize CPU port pad mode (xMII type, delays...) */ - phy_mode = of_get_phy_mode(ds->ports[ds->dst->cpu_port].dn); + phy_mode = of_get_phy_mode(ds->dst->cpu_dp->dn); if (phy_mode < 0) { pr_err("Can't find phy-mode for master device\n"); return phy_mode; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8e24677b1c62..118a8bd2fd9a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -137,10 +137,9 @@ struct dsa_switch_tree { const struct ethtool_ops *master_orig_ethtool_ops; /* - * The switch and port to which the CPU is attached. + * The switch port to which the CPU is attached. */ - struct dsa_switch *cpu_switch; - s8 cpu_port; + struct dsa_port *cpu_dp; /* * Data for the individual switch chips. 
@@ -251,7 +250,7 @@ struct dsa_switch { static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { - return !!(ds == ds->dst->cpu_switch && p == ds->dst->cpu_port); + return ds->dst->cpu_dp == &ds->ports[p]; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) @@ -279,10 +278,10 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) * Else return the (DSA) port number that connects to the * switch that is one hop closer to the cpu. */ - if (dst->cpu_switch == ds) - return dst->cpu_port; + if (dst->cpu_dp->ds == ds) + return dst->cpu_dp->index; else - return ds->rtable[dst->cpu_switch->index]; + return ds->rtable[dst->cpu_dp->ds->index]; } struct switchdev_trans; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 033b3bfb63dc..2ac62349ba12 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -443,8 +443,8 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_switch) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_switch); + if (dst->cpu_dp) { + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp->ds); if (err) return err; } @@ -484,8 +484,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_switch) - dsa_cpu_port_ethtool_restore(dst->cpu_switch); + if (dst->cpu_dp) + dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; @@ -518,10 +518,8 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!dst->master_netdev) dst->master_netdev = ethernet_dev; - if (!dst->cpu_switch) { - dst->cpu_switch = ds; - dst->cpu_port = index; - } + if (!dst->cpu_dp) + dst->cpu_dp = port; tag_protocol = ds->ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ad345c8b0b06..bb28b011ba5a 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -115,13 +115,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) continue; if (!strcmp(name, "cpu")) { - if (dst->cpu_switch) { + if (dst->cpu_dp) { netdev_err(dst->master_netdev, "multiple cpu ports?!\n"); return -EINVAL; } - dst->cpu_switch = ds; - dst->cpu_port = i; + dst->cpu_dp = &ds->ports[i]; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; @@ -144,7 +143,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * tagging protocol to the preferred tagging format of this * switch. 
*/ - if (dst->cpu_switch == ds) { + if (dst->cpu_dp->ds == ds) { enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); @@ -624,7 +623,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->pd = pd; dst->master_netdev = dev; - dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { struct dsa_switch *ds; @@ -735,7 +733,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_switch); + dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); dev_put(dst->master_netdev); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7693182df81e..77324c483d14 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -821,8 +821,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; - s8 cpu_port = dst->cpu_port; + struct dsa_switch *ds = dst->cpu_dp->ds; + s8 cpu_port = dst->cpu_dp->index; int count = 0; if (dst->master_ethtool_ops.get_sset_count) { @@ -838,7 +838,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; + struct dsa_switch *ds = dst->cpu_dp->ds; int count = 0; if (dst->master_ethtool_ops.get_sset_count) @@ -854,8 +854,8 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; - s8 cpu_port = dst->cpu_port; + struct dsa_switch *ds = dst->cpu_dp->ds; + s8 cpu_port = dst->cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; unsigned int i; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2a9b52c5af86..658ddee63dc9 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -101,7 +101,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, int source_port; u8 *brcm_tag; - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) goto out_drop; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 3ba3f59f7a34..be3b67750ac8 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -99,7 +99,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* This protocol doesn't support cascading multiple switches so it's * safe to assume the switch is first in the tree */ - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (!ds) goto out_drop; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index aafc2fc74c30..aa05e276ea22 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -67,7 +67,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, u8 *trailer; int source_port; - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (skb_linearize(skb)) goto out_drop; -- cgit v1.2.3 From 2423496af35d94a87156b063ea5cedffc10a70a1 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 16 May 2017 14:36:23 -0400 Subject: ipv6: Prevent overrun when parsing v6 header options The KASAN warning repoted below was discovered with a syzkaller program. 
The reproducer is basically: int s = socket(AF_INET6, SOCK_RAW, NEXTHDR_HOP); send(s, &one_byte_of_data, 1, MSG_MORE); send(s, &more_than_mtu_bytes_data, 2000, 0); The socket() call sets the nexthdr field of the v6 header to NEXTHDR_HOP, the first send call primes the payload with a non zero byte of data, and the second send call triggers the fragmentation path. The fragmentation code tries to parse the header options in order to figure out where to insert the fragment option. Since nexthdr points to an invalid option, the calculation of the size of the network header can made to be much larger than the linear section of the skb and data is read outside of it. This fix makes ip6_find_1stfrag return an error if it detects running out-of-bounds. [ 42.361487] ================================================================== [ 42.364412] BUG: KASAN: slab-out-of-bounds in ip6_fragment+0x11c8/0x3730 [ 42.365471] Read of size 840 at addr ffff88000969e798 by task ip6_fragment-oo/3789 [ 42.366469] [ 42.366696] CPU: 1 PID: 3789 Comm: ip6_fragment-oo Not tainted 4.11.0+ #41 [ 42.367628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014 [ 42.368824] Call Trace: [ 42.369183] dump_stack+0xb3/0x10b [ 42.369664] print_address_description+0x73/0x290 [ 42.370325] kasan_report+0x252/0x370 [ 42.370839] ? ip6_fragment+0x11c8/0x3730 [ 42.371396] check_memory_region+0x13c/0x1a0 [ 42.371978] memcpy+0x23/0x50 [ 42.372395] ip6_fragment+0x11c8/0x3730 [ 42.372920] ? nf_ct_expect_unregister_notifier+0x110/0x110 [ 42.373681] ? ip6_copy_metadata+0x7f0/0x7f0 [ 42.374263] ? ip6_forward+0x2e30/0x2e30 [ 42.374803] ip6_finish_output+0x584/0x990 [ 42.375350] ip6_output+0x1b7/0x690 [ 42.375836] ? ip6_finish_output+0x990/0x990 [ 42.376411] ? ip6_fragment+0x3730/0x3730 [ 42.376968] ip6_local_out+0x95/0x160 [ 42.377471] ip6_send_skb+0xa1/0x330 [ 42.377969] ip6_push_pending_frames+0xb3/0xe0 [ 42.378589] rawv6_sendmsg+0x2051/0x2db0 [ 42.379129] ? rawv6_bind+0x8b0/0x8b0 [ 42.379633] ? _copy_from_user+0x84/0xe0 [ 42.380193] ? debug_check_no_locks_freed+0x290/0x290 [ 42.380878] ? ___sys_sendmsg+0x162/0x930 [ 42.381427] ? rcu_read_lock_sched_held+0xa3/0x120 [ 42.382074] ? sock_has_perm+0x1f6/0x290 [ 42.382614] ? ___sys_sendmsg+0x167/0x930 [ 42.383173] ? lock_downgrade+0x660/0x660 [ 42.383727] inet_sendmsg+0x123/0x500 [ 42.384226] ? inet_sendmsg+0x123/0x500 [ 42.384748] ? inet_recvmsg+0x540/0x540 [ 42.385263] sock_sendmsg+0xca/0x110 [ 42.385758] SYSC_sendto+0x217/0x380 [ 42.386249] ? SYSC_connect+0x310/0x310 [ 42.386783] ? __might_fault+0x110/0x1d0 [ 42.387324] ? lock_downgrade+0x660/0x660 [ 42.387880] ? __fget_light+0xa1/0x1f0 [ 42.388403] ? __fdget+0x18/0x20 [ 42.388851] ? sock_common_setsockopt+0x95/0xd0 [ 42.389472] ? SyS_setsockopt+0x17f/0x260 [ 42.390021] ? 
entry_SYSCALL_64_fastpath+0x5/0xbe [ 42.390650] SyS_sendto+0x40/0x50 [ 42.391103] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.391731] RIP: 0033:0x7fbbb711e383 [ 42.392217] RSP: 002b:00007ffff4d34f28 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 42.393235] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbbb711e383 [ 42.394195] RDX: 0000000000001000 RSI: 00007ffff4d34f60 RDI: 0000000000000003 [ 42.395145] RBP: 0000000000000046 R08: 00007ffff4d34f40 R09: 0000000000000018 [ 42.396056] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000400aad [ 42.396598] R13: 0000000000000066 R14: 00007ffff4d34ee0 R15: 00007fbbb717af00 [ 42.397257] [ 42.397411] Allocated by task 3789: [ 42.397702] save_stack_trace+0x16/0x20 [ 42.398005] save_stack+0x46/0xd0 [ 42.398267] kasan_kmalloc+0xad/0xe0 [ 42.398548] kasan_slab_alloc+0x12/0x20 [ 42.398848] __kmalloc_node_track_caller+0xcb/0x380 [ 42.399224] __kmalloc_reserve.isra.32+0x41/0xe0 [ 42.399654] __alloc_skb+0xf8/0x580 [ 42.400003] sock_wmalloc+0xab/0xf0 [ 42.400346] __ip6_append_data.isra.41+0x2472/0x33d0 [ 42.400813] ip6_append_data+0x1a8/0x2f0 [ 42.401122] rawv6_sendmsg+0x11ee/0x2db0 [ 42.401505] inet_sendmsg+0x123/0x500 [ 42.401860] sock_sendmsg+0xca/0x110 [ 42.402209] ___sys_sendmsg+0x7cb/0x930 [ 42.402582] __sys_sendmsg+0xd9/0x190 [ 42.402941] SyS_sendmsg+0x2d/0x50 [ 42.403273] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.403718] [ 42.403871] Freed by task 1794: [ 42.404146] save_stack_trace+0x16/0x20 [ 42.404515] save_stack+0x46/0xd0 [ 42.404827] kasan_slab_free+0x72/0xc0 [ 42.405167] kfree+0xe8/0x2b0 [ 42.405462] skb_free_head+0x74/0xb0 [ 42.405806] skb_release_data+0x30e/0x3a0 [ 42.406198] skb_release_all+0x4a/0x60 [ 42.406563] consume_skb+0x113/0x2e0 [ 42.406910] skb_free_datagram+0x1a/0xe0 [ 42.407288] netlink_recvmsg+0x60d/0xe40 [ 42.407667] sock_recvmsg+0xd7/0x110 [ 42.408022] ___sys_recvmsg+0x25c/0x580 [ 42.408395] __sys_recvmsg+0xd6/0x190 [ 42.408753] SyS_recvmsg+0x2d/0x50 [ 42.409086] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.409513] [ 42.409665] The buggy address belongs to the object at ffff88000969e780 [ 42.409665] which belongs to the cache kmalloc-512 of size 512 [ 42.410846] The buggy address is located 24 bytes inside of [ 42.410846] 512-byte region [ffff88000969e780, ffff88000969e980) [ 42.411941] The buggy address belongs to the page: [ 42.412405] page:ffffea000025a780 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 [ 42.413298] flags: 0x100000000008100(slab|head) [ 42.413729] raw: 0100000000008100 0000000000000000 0000000000000000 00000001800c000c [ 42.414387] raw: ffffea00002a9500 0000000900000007 ffff88000c401280 0000000000000000 [ 42.415074] page dumped because: kasan: bad access detected [ 42.415604] [ 42.415757] Memory state around the buggy address: [ 42.416222] ffff88000969e880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.416904] ffff88000969e900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.417591] >ffff88000969e980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 42.418273] ^ [ 42.418588] ffff88000969ea00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419273] ffff88000969ea80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419882] ================================================================== Reported-by: Andrey Konovalov Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- net/ipv6/ip6_offload.c | 2 ++ net/ipv6/ip6_output.c | 4 ++++ net/ipv6/output_core.c | 14 ++++++++------ net/ipv6/udp_offload.c | 2 ++ 4 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 93e58a5e1837..eab36abc9f22 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -117,6 +117,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (udpfrag) { unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); fptr->frag_off = htons(offset); if (skb->next) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 58f6288e9ba5..01deecda2f84 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -598,6 +598,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, u8 *prevhdr, nexthdr = 0; hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (hlen < 0) { + err = hlen; + goto fail; + } nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index cd4252346a32..e9065b8d3af8 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; - while (offset + 1 <= packet_len) { + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { @@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; } - return offset; + return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ac858c480f2f..b348cff47395 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -91,6 +91,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, * bytes to insert fragment header. */ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + -- cgit v1.2.3 From 9142e9007f2d7ab58a587a1e1d921b0064a339aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 13:27:53 -0700 Subject: net: fix compile error in skb_orphan_partial() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_INET is not set, net/core/sock.c can not compile : net/core/sock.c: In function ‘skb_orphan_partial’: net/core/sock.c:1810:2: error: implicit declaration of function ‘skb_is_tcp_pure_ack’ [-Werror=implicit-function-declaration] if (skb_is_tcp_pure_ack(skb)) ^ Fix this by always including Fixes: f6ba8d33cfbb ("netem: fix skb_orphan_partial()") Signed-off-by: Eric Dumazet Reported-by: Paul Gortmaker Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: David S. 
Miller --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index e43e71d7856b..727f924b7f91 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,10 +139,7 @@ #include -#ifdef CONFIG_INET #include -#endif - #include static DEFINE_MUTEX(proto_list_mutex); -- cgit v1.2.3 From eb7b721129f1dc67041662da229a28dfc5c3c1dd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 16 May 2017 22:40:07 +0200 Subject: net: dsa: Sort DSA tagging protocol drivers With more tag protocols being added, regain some order by sorting the entries in various places. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++---- net/dsa/Kconfig | 8 ++++---- net/dsa/Makefile | 6 +++--- net/dsa/dsa.c | 18 +++++++++--------- net/dsa/dsa_priv.h | 18 +++++++++--------- 5 files changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 118a8bd2fd9a..ed767beca9c6 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -27,13 +27,13 @@ struct fixed_phy_status; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, + DSA_TAG_PROTO_BRCM, DSA_TAG_PROTO_DSA, - DSA_TAG_PROTO_TRAILER, DSA_TAG_PROTO_EDSA, - DSA_TAG_PROTO_BRCM, - DSA_TAG_PROTO_QCA, - DSA_TAG_PROTO_MTK, DSA_TAG_PROTO_LAN9303, + DSA_TAG_PROTO_MTK, + DSA_TAG_PROTO_QCA, + DSA_TAG_PROTO_TRAILER, DSA_TAG_LAST, /* MUST BE LAST */ }; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 81a0868edb1d..297389b2ab35 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -25,16 +25,16 @@ config NET_DSA_TAG_DSA config NET_DSA_TAG_EDSA bool -config NET_DSA_TAG_TRAILER +config NET_DSA_TAG_LAN9303 bool -config NET_DSA_TAG_QCA +config NET_DSA_TAG_MTK bool -config NET_DSA_TAG_MTK +config NET_DSA_TAG_TRAILER bool -config NET_DSA_TAG_LAN9303 +config NET_DSA_TAG_QCA bool endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 0b747d75e65a..f8c0251d1f43 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -6,7 +6,7 @@ dsa_core-y += dsa.o slave.o dsa2.o switch.o legacy.o dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o -dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o -dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o -dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o +dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 26130ae438da..c0a1307c87dd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -40,26 +40,26 @@ static const struct dsa_device_ops none_ops = { }; const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_DSA [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_EDSA [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_LAN9303 + [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_MTK + [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_QCA [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_MTK - 
[DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_LAN9303 - [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f4a88e485213..e9003b79cbbc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -75,25 +75,25 @@ void dsa_slave_unregister_notifier(void); int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); +/* tag_brcm.c */ +extern const struct dsa_device_ops brcm_netdev_ops; + /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ extern const struct dsa_device_ops edsa_netdev_ops; -/* tag_trailer.c */ -extern const struct dsa_device_ops trailer_netdev_ops; +/* tag_lan9303.c */ +extern const struct dsa_device_ops lan9303_netdev_ops; -/* tag_brcm.c */ -extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_mtk.c */ +extern const struct dsa_device_ops mtk_netdev_ops; /* tag_qca.c */ extern const struct dsa_device_ops qca_netdev_ops; -/* tag_mtk.c */ -extern const struct dsa_device_ops mtk_netdev_ops; - -/* tag_lan9303.c */ -extern const struct dsa_device_ops lan9303_netdev_ops; +/* tag_trailer.c */ +extern const struct dsa_device_ops trailer_netdev_ops; #endif -- cgit v1.2.3 From 87d83093bfc2f4938ff21524ebb50ecf53c15a64 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:54 +0200 Subject: net: sched: move tc_classify function to cls_api.c Move tc_classify function to cls_api.c where it belongs, rename it to fit the namespace. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 9 +++++++++ include/net/pkt_sched.h | 3 --- net/core/dev.c | 5 +++-- net/sched/cls_api.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_api.c | 48 ------------------------------------------------ net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- net/sched/sch_sfb.c | 2 +- net/sched/sch_sfq.c | 2 +- 17 files changed, 72 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 269fd78bb0ae..cb745067feb3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -19,10 +19,19 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS void tcf_destroy_chain(struct tcf_proto __rcu **fl); +int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode); + #else static inline void tcf_destroy_chain(struct tcf_proto __rcu **fl) { } + +static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) +{ + return TC_ACT_UNSPEC; +} #endif static inline unsigned long diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index bec46f63f10c..2579c209ea51 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -113,9 +113,6 @@ static inline void qdisc_run(struct Qdisc *q) __qdisc_run(q); } -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode); - static inline __be16 tc_skb_protocol(const struct sk_buff *skb) { /* We need to take extra care 
in case the skb came via diff --git a/net/core/dev.c b/net/core/dev.c index fca407b4a6ea..acd594c56f0a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include #include @@ -3178,7 +3179,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3948,7 +3949,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 22f88b35a546..af58bbef6610 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -196,6 +196,54 @@ void tcf_destroy_chain(struct tcf_proto __rcu **fl) } EXPORT_SYMBOL(tcf_destroy_chain); +/* Main classifier routine: scans classifier chain attached + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. + */ +int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) +{ + __be16 protocol = tc_skb_protocol(skb); +#ifdef CONFIG_NET_CLS_ACT + const int max_reclassify_loop = 4; + const struct tcf_proto *old_tp = tp; + int limit = 0; + +reclassify: +#endif + for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + + err = tp->classify(skb, tp, res); +#ifdef CONFIG_NET_CLS_ACT + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + goto reset; +#endif + if (err >= 0) + return err; + } + + return TC_ACT_UNSPEC; /* signal: continue lookup */ +#ifdef CONFIG_NET_CLS_ACT +reset: + if (unlikely(limit++ >= max_reclassify_loop)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + + tp = old_tp; + protocol = tc_skb_protocol(skb); + goto reclassify; +#endif +} +EXPORT_SYMBOL(tcf_classify); + /* Add/change/delete/get a filter node */ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e88342fde1bc..a3bcd972d940 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1878,54 +1878,6 @@ done: return skb->len; } -/* Main classifier routine: scans classifier chain attached - * to this qdisc, (optionally) tests for protocol and asks - * specific classifiers. 
- */ -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ - __be16 protocol = tc_skb_protocol(skb); -#ifdef CONFIG_NET_CLS_ACT - const int max_reclassify_loop = 4; - const struct tcf_proto *old_tp = tp; - int limit = 0; - -reclassify: -#endif - for (; tp; tp = rcu_dereference_bh(tp->next)) { - int err; - - if (tp->protocol != protocol && - tp->protocol != htons(ETH_P_ALL)) - continue; - - err = tp->classify(skb, tp, res); -#ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) - goto reset; -#endif - if (err >= 0) - return err; - } - - return TC_ACT_UNSPEC; /* signal: continue lookup */ -#ifdef CONFIG_NET_CLS_ACT -reset: - if (unlikely(limit++ >= max_reclassify_loop)) { - net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", - tp->q->ops->id, tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - - tp = old_tp; - protocol = tc_skb_protocol(skb); - goto reclassify; -#endif -} -EXPORT_SYMBOL(tc_classify); - #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 40cbceed4de8..89d32fad9f89 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -377,7 +377,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify(skb, fl, &res, true); + result = tcf_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7415859fd4c3..c543ea3e3043 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -233,7 +233,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. 
*/ - result = tc_classify(skb, fl, &res, true); + result = tcf_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 58a8c32eab23..446d79bb25d9 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -333,7 +333,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 1c0f877f673a..7bc638d2e67f 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -234,7 +234,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res, false); + int result = tcf_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9201abce928c..42ba81ad327c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -96,7 +96,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res, false); + result = tcf_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 5cb82f6c1b06..b0dcab199205 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1142,7 +1142,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 570ef3b0c09b..640f5f336195 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -231,7 +231,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 43a3a10b3c81..25bb9ffc2df1 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res, false); + err = tcf_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 92c2e6d448d7..7997363f7e0d 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res, false); + err = tcf_classify(skb, 
fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 041eba3006cc..73c7ac37b570 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -720,7 +720,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0f777273ba29..b287880829e2 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -259,7 +259,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 332d94be6e1c..53a641f2ccb5 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -180,7 +180,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { -- cgit v1.2.3 From 6529eaba33f0465fc6d228e1d05b1745f7d0e8c9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:55 +0200 Subject: net: sched: introduce tcf block infractructure Currently, the filter chains are direcly put into the private structures of qdiscs. In order to be able to have multiple chains per qdisc and to allow filter chains sharing among qdiscs, there is a need for common object that would hold the chains. This introduces such object and calls it "tcf_block". Helpers to get and put the blocks are provided to be called from individual qdisc code. Also, the original filter_list pointers are left in qdisc privs to allow the entry into tcf_block processing without any added overhead of possible multiple pointer dereference on fast path. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
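As a usage sketch (mirroring the drr and sfq conversions in the diff below; the foo_* names are placeholders and error handling is trimmed), a classful qdisc now obtains a block at init time and releases it on destroy instead of destroying the filter chain directly:

    struct foo_sched {                          /* hypothetical qdisc privs */
            struct tcf_proto __rcu *filter_list;
            struct tcf_block *block;
    };

    static int foo_init(struct Qdisc *sch, struct nlattr *opt)
    {
            struct foo_sched *q = qdisc_priv(sch);

            /* The block wraps the existing filter chain pointer. */
            return tcf_block_get(&q->block, &q->filter_list);
    }

    static void foo_destroy(struct Qdisc *sch)
    {
            struct foo_sched *q = qdisc_priv(sch);

            /* Replaces the old tcf_destroy_chain(&q->filter_list). */
            tcf_block_put(q->block);
    }
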
Miller --- include/net/pkt_cls.h | 13 ++++++++-- include/net/sch_generic.h | 7 +++++- net/sched/cls_api.c | 48 +++++++++++++++++++++++++++++-------- net/sched/sch_api.c | 2 +- net/sched/sch_atm.c | 27 ++++++++++++++------- net/sched/sch_cbq.c | 19 ++++++++++----- net/sched/sch_drr.c | 13 ++++++---- net/sched/sch_dsmark.c | 17 ++++++++----- net/sched/sch_fq_codel.c | 15 ++++++++---- net/sched/sch_hfsc.c | 19 ++++++++++----- net/sched/sch_htb.c | 26 +++++++++++++------- net/sched/sch_ingress.c | 61 ++++++++++++++++++++++++++++++++++------------- net/sched/sch_multiq.c | 14 +++++++---- net/sched/sch_prio.c | 17 +++++++++---- net/sched/sch_qfq.c | 14 +++++++---- net/sched/sch_sfb.c | 15 ++++++++---- net/sched/sch_sfq.c | 15 ++++++++---- 17 files changed, 243 insertions(+), 99 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index cb745067feb3..e56e7157c280 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -18,12 +18,21 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS -void tcf_destroy_chain(struct tcf_proto __rcu **fl); +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain); +void tcf_block_put(struct tcf_block *block); int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); #else -static inline void tcf_destroy_chain(struct tcf_proto __rcu **fl) +static inline +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain) +{ + return 0; +} + +static inline void tcf_block_put(struct tcf_block *block) { } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 22e52093bfda..98cf2f23602d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -153,7 +153,7 @@ struct Qdisc_class_ops { void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ - struct tcf_proto __rcu ** (*tcf_chain)(struct Qdisc *, unsigned long); + struct tcf_block * (*tcf_block)(struct Qdisc *, unsigned long); bool (*tcf_cl_offload)(u32 classid); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); @@ -236,6 +236,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; + struct tcf_block *block; struct rcu_head rcu; }; @@ -247,6 +248,10 @@ struct qdisc_skb_cb { unsigned char data[QDISC_CB_PRIV_LEN]; }; +struct tcf_block { + struct tcf_proto __rcu **p_filter_chain; +}; + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index af58bbef6610..d30116f77156 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -129,7 +129,8 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, - u32 prio, u32 parent, struct Qdisc *q) + u32 prio, u32 parent, struct Qdisc *q, + struct tcf_block *block) { struct tcf_proto *tp; int err; @@ -165,6 +166,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->prio = prio; tp->classid = parent; tp->q = q; + tp->block = block; err = tp->ops->init(tp); if (err) { @@ -185,7 +187,7 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -void tcf_destroy_chain(struct tcf_proto __rcu **fl) +static void tcf_destroy_chain(struct tcf_proto __rcu **fl) { struct tcf_proto *tp; @@ -194,7 +196,28 @@ void 
tcf_destroy_chain(struct tcf_proto __rcu **fl) tcf_proto_destroy(tp); } } -EXPORT_SYMBOL(tcf_destroy_chain); + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain) +{ + struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + + if (!block) + return -ENOMEM; + block->p_filter_chain = p_filter_chain; + *p_block = block; + return 0; +} +EXPORT_SYMBOL(tcf_block_get); + +void tcf_block_put(struct tcf_block *block) +{ + if (!block) + return; + tcf_destroy_chain(block->p_filter_chain); + kfree(block); +} +EXPORT_SYMBOL(tcf_block_put); /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks @@ -260,6 +283,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct Qdisc *q; struct tcf_proto __rcu **back; struct tcf_proto __rcu **chain; + struct tcf_block *block; struct tcf_proto *next; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; @@ -328,7 +352,7 @@ replay: if (!cops) return -EINVAL; - if (cops->tcf_chain == NULL) + if (!cops->tcf_block) return -EOPNOTSUPP; /* Do we search for filter, attached to class? */ @@ -339,11 +363,13 @@ replay: } /* And the last stroke */ - chain = cops->tcf_chain(q, cl); - if (chain == NULL) { + block = cops->tcf_block(q, cl); + if (!block) { err = -EINVAL; goto errout; } + chain = block->p_filter_chain; + if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); tcf_destroy_chain(chain); @@ -387,7 +413,7 @@ replay: nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, nprio, parent, q); + protocol, nprio, parent, q, block); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; @@ -556,6 +582,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct net_device *dev; struct Qdisc *q; + struct tcf_block *block; struct tcf_proto *tp, __rcu **chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; @@ -577,16 +604,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) cops = q->ops->cl_ops; if (!cops) goto errout; - if (cops->tcf_chain == NULL) + if (!cops->tcf_block) goto errout; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->get(q, tcm->tcm_parent); if (cl == 0) goto errout; } - chain = cops->tcf_chain(q, cl); - if (chain == NULL) + block = cops->tcf_block(q, cl); + if (!block) goto errout; + chain = block->p_filter_chain; s_t = cb->args[0]; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a3bcd972d940..5d95401bbc02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -163,7 +163,7 @@ int register_qdisc(struct Qdisc_ops *qops) if (!(cops->get && cops->put && cops->walk && cops->leaf)) goto out_einval; - if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) goto out_einval; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 89d32fad9f89..f435546c3864 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -43,6 +43,7 @@ struct atm_flow_data { struct Qdisc *q; /* FIFO, TBF, etc. 
*/ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); /* chaining */ @@ -143,7 +144,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) list_del_init(&flow->list); pr_debug("atm_tc_put: qdisc %p\n", flow->q); qdisc_destroy(flow->q); - tcf_destroy_chain(&flow->filter_list); + tcf_block_put(flow->block); if (flow->sock) { pr_debug("atm_tc_put: f_count %ld\n", file_count(flow->sock->file)); @@ -274,7 +275,13 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, error = -ENOBUFS; goto err_out; } - RCU_INIT_POINTER(flow->filter_list, NULL); + + error = tcf_block_get(&flow->block, &flow->filter_list); + if (error) { + kfree(flow); + goto err_out; + } + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; @@ -346,14 +353,13 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - return flow ? &flow->filter_list : &p->link.filter_list; + return flow ? flow->block : p->link.block; } /* --------------------------- Qdisc operations ---------------------------- */ @@ -524,6 +530,7 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { struct atm_qdisc_data *p = qdisc_priv(sch); + int err; pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); @@ -534,7 +541,11 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - RCU_INIT_POINTER(p->link.filter_list, NULL); + + err = tcf_block_get(&p->link.block, &p->link.filter_list); + if (err) + return err; + p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; @@ -561,7 +572,7 @@ static void atm_tc_destroy(struct Qdisc *sch) pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); list_for_each_entry(flow, &p->flows, list) - tcf_destroy_chain(&flow->filter_list); + tcf_block_put(flow->block); list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) @@ -646,7 +657,7 @@ static const struct Qdisc_class_ops atm_class_ops = { .change = atm_tc_change, .delete = atm_tc_delete, .walk = atm_tc_walk, - .tcf_chain = atm_tc_find_tcf, + .tcf_block = atm_tc_tcf_block, .bind_tcf = atm_tc_bind_filter, .unbind_tcf = atm_tc_put, .dump = atm_tc_dump_class, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c543ea3e3043..8dd6d0aca678 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -127,6 +127,7 @@ struct cbq_class { struct tc_cbq_xstats xstats; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; int refcnt; int filters; @@ -1405,7 +1406,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) WARN_ON(cl->filters); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); gen_kill_estimator(&cl->rate_est); @@ -1430,7 +1431,7 @@ static void cbq_destroy(struct Qdisc *sch) */ for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, 
&q->clhash.hash[h], common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], @@ -1585,12 +1586,19 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + return err; + } + if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + tcf_block_put(cl->block); kfree(cl); goto failure; } @@ -1688,8 +1696,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, - unsigned long arg) +static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; @@ -1697,7 +1704,7 @@ static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, if (cl == NULL) cl = &q->link; - return &cl->filter_list; + return cl->block; } static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, @@ -1756,7 +1763,7 @@ static const struct Qdisc_class_ops cbq_class_ops = { .change = cbq_change_class, .delete = cbq_delete, .walk = cbq_walk, - .tcf_chain = cbq_find_tcf, + .tcf_block = cbq_tcf_block, .bind_tcf = cbq_bind_filter, .unbind_tcf = cbq_unbind_filter, .dump = cbq_dump_class, diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 446d79bb25d9..5db2a2843c66 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -36,6 +36,7 @@ struct drr_class { struct drr_sched { struct list_head active; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc_class_hash clhash; }; @@ -190,15 +191,14 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg) drr_destroy_class(sch, cl); } -static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl) { struct drr_sched *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, @@ -431,6 +431,9 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; @@ -462,7 +465,7 @@ static void drr_destroy_qdisc(struct Qdisc *sch) struct hlist_node *next; unsigned int i; - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -477,7 +480,7 @@ static const struct Qdisc_class_ops drr_class_ops = { .delete = drr_delete_class, .get = drr_get_class, .put = drr_put_class, - .tcf_chain = drr_tcf_chain, + .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, .graft = drr_graft_class, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 7bc638d2e67f..ba45102cff94 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -44,6 +44,7 @@ struct mask_value { struct dsmark_qdisc_data { struct Qdisc *q; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct mask_value *mv; u16 indices; u8 set_tc_index; @@ -183,11 +184,11 @@ ignore: } } -static 
inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - return &p->filter_list; + + return p->block; } /* --------------------------- Qdisc operations ---------------------------- */ @@ -332,7 +333,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; + int err; u32 default_index = NO_DEFAULT_INDEX; u16 indices; int i; @@ -342,6 +343,10 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; + err = tcf_block_get(&p->block, &p->filter_list); + if (err) + return err; + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); if (err < 0) goto errout; @@ -400,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - tcf_destroy_chain(&p->filter_list); + tcf_block_put(p->block); qdisc_destroy(p->q); if (p->mv != p->embedded) kfree(p->mv); @@ -468,7 +473,7 @@ static const struct Qdisc_class_ops dsmark_class_ops = { .change = dsmark_change, .delete = dsmark_delete, .walk = dsmark_walk, - .tcf_chain = dsmark_find_tcf, + .tcf_block = dsmark_tcf_block, .bind_tcf = dsmark_bind_filter, .unbind_tcf = dsmark_put, .dump = dsmark_dump_class, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 42ba81ad327c..f201e73947fb 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -55,6 +55,7 @@ struct fq_codel_flow { struct fq_codel_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ @@ -450,7 +451,7 @@ static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); kvfree(q->backlogs); kvfree(q->flows); } @@ -459,6 +460,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) { struct fq_codel_sched_data *q = qdisc_priv(sch); int i; + int err; sch->limit = 10*1024; q->flows_cnt = 1024; @@ -478,6 +480,10 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + if (!q->flows) { q->flows = kvzalloc(q->flows_cnt * sizeof(struct fq_codel_flow), GFP_KERNEL); @@ -589,14 +595,13 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl) { struct fq_codel_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, @@ -679,7 +684,7 @@ static const struct Qdisc_class_ops fq_codel_class_ops = { .leaf = fq_codel_leaf, .get = fq_codel_get, .put = fq_codel_put, - .tcf_chain = fq_codel_find_tcf, + .tcf_block = fq_codel_tcf_block, .bind_tcf = fq_codel_bind, .unbind_tcf = fq_codel_put, .dump = fq_codel_dump_class, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b0dcab199205..a324f84b1ccd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,6 +116,7 @@ struct hfsc_class { struct 
gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ + struct tcf_block *block; unsigned int filter_cnt; /* filter count */ unsigned int level; /* class level in hierarchy */ @@ -1040,12 +1041,19 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + return err; + } + if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + tcf_block_put(cl->block); kfree(cl); return err; } @@ -1091,7 +1099,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); qdisc_destroy(cl->qdisc); gen_kill_estimator(&cl->rate_est); if (cl != &q->root) @@ -1261,8 +1269,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_proto __rcu ** -hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; @@ -1270,7 +1277,7 @@ hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) if (cl == NULL) cl = &q->root; - return &cl->filter_list; + return cl->block; } static int @@ -1515,7 +1522,7 @@ hfsc_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1662,7 +1669,7 @@ static const struct Qdisc_class_ops hfsc_class_ops = { .put = hfsc_put_class, .bind_tcf = hfsc_bind_tcf, .unbind_tcf = hfsc_unbind_tcf, - .tcf_chain = hfsc_tcf_chain, + .tcf_block = hfsc_tcf_block, .dump = hfsc_dump_class, .dump_stats = hfsc_dump_class_stats, .walk = hfsc_walk diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 640f5f336195..195bbca9eb0b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -105,6 +105,7 @@ struct htb_class { int quantum; /* but stored for parent-to-leaf return */ struct tcf_proto __rcu *filter_list; /* class attached filters */ + struct tcf_block *block; int filter_cnt; int refcnt; /* usage count of this class */ @@ -156,6 +157,7 @@ struct htb_sched { /* filters for qdisc itself */ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; #define HTB_WARN_TOOMANYEVENTS 0x1 unsigned int warned; /* only one warning */ @@ -1017,6 +1019,10 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); if (err < 0) return err; @@ -1230,7 +1236,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_destroy(cl->un.leaf.q); } gen_kill_estimator(&cl->rate_est); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); kfree(cl); } @@ -1248,11 +1254,11 @@ static void htb_destroy(struct Qdisc *sch) * because filter need its target class alive to be able to call * unbind_filter on it (without Oops). 
*/ - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1396,6 +1402,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + goto failure; + } if (htb_rate_est || tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, @@ -1403,6 +1414,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); if (err) { + tcf_block_put(cl->block); kfree(cl); goto failure; } @@ -1521,14 +1533,12 @@ failure: return err; } -static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch, - unsigned long arg) +static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list; - return fl; + return cl ? cl->block : q->block; } static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, @@ -1591,7 +1601,7 @@ static const struct Qdisc_class_ops htb_class_ops = { .change = htb_change_class, .delete = htb_delete, .walk = htb_walk, - .tcf_chain = htb_find_tcf, + .tcf_block = htb_tcf_block, .bind_tcf = htb_bind_filter, .unbind_tcf = htb_unbind_filter, .dump = htb_dump_class, diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 3bab5f66c392..d8a9bebcab90 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -18,6 +18,10 @@ #include #include +struct ingress_sched_data { + struct tcf_block *block; +}; + static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -47,16 +51,23 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) { - struct net_device *dev = qdisc_dev(sch); + struct ingress_sched_data *q = qdisc_priv(sch); - return &dev->ingress_cl_list; + return q->block; } static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { + struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err; + + err = tcf_block_get(&q->block, &dev->ingress_cl_list); + if (err) + return err; + net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; @@ -65,9 +76,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); + struct ingress_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&dev->ingress_cl_list); + tcf_block_put(q->block); net_dec_ingress_queue(); } @@ -91,7 +102,7 @@ static const struct Qdisc_class_ops ingress_class_ops = { .get = ingress_get, .put = ingress_put, .walk = ingress_walk, - .tcf_chain = ingress_find_tcf, + .tcf_block = ingress_tcf_block, .tcf_cl_offload = ingress_cl_offload, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, @@ -100,12 +111,18 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", + .priv_size = sizeof(struct 
ingress_sched_data), .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, .owner = THIS_MODULE, }; +struct clsact_sched_data { + struct tcf_block *ingress_block; + struct tcf_block *egress_block; +}; + static unsigned long clsact_get(struct Qdisc *sch, u32 classid) { switch (TC_H_MIN(classid)) { @@ -128,16 +145,15 @@ static unsigned long clsact_bind_filter(struct Qdisc *sch, return clsact_get(sch, classid); } -static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl) { - struct net_device *dev = qdisc_dev(sch); + struct clsact_sched_data *q = qdisc_priv(sch); switch (cl) { case TC_H_MIN(TC_H_MIN_INGRESS): - return &dev->ingress_cl_list; + return q->ingress_block; case TC_H_MIN(TC_H_MIN_EGRESS): - return &dev->egress_cl_list; + return q->egress_block; default: return NULL; } @@ -145,6 +161,18 @@ static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, static int clsact_init(struct Qdisc *sch, struct nlattr *opt) { + struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err; + + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + if (err) + return err; + + err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + if (err) + return err; + net_inc_ingress_queue(); net_inc_egress_queue(); @@ -155,10 +183,10 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) static void clsact_destroy(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); + struct clsact_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&dev->ingress_cl_list); - tcf_destroy_chain(&dev->egress_cl_list); + tcf_block_put(q->egress_block); + tcf_block_put(q->ingress_block); net_dec_ingress_queue(); net_dec_egress_queue(); @@ -169,7 +197,7 @@ static const struct Qdisc_class_ops clsact_class_ops = { .get = clsact_get, .put = ingress_put, .walk = ingress_walk, - .tcf_chain = clsact_find_tcf, + .tcf_block = clsact_tcf_block, .tcf_cl_offload = clsact_cl_offload, .bind_tcf = clsact_bind_filter, .unbind_tcf = ingress_put, @@ -178,6 +206,7 @@ static const struct Qdisc_class_ops clsact_class_ops = { static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", + .priv_size = sizeof(struct clsact_sched_data), .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 25bb9ffc2df1..604767482ad0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -32,6 +32,7 @@ struct multiq_sched_data { u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc **queues; }; @@ -170,7 +171,7 @@ multiq_destroy(struct Qdisc *sch) int band; struct multiq_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_destroy(q->queues[band]); @@ -243,6 +244,10 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); @@ -367,14 +372,13 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, 
unsigned long cl) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { @@ -383,7 +387,7 @@ static const struct Qdisc_class_ops multiq_class_ops = { .get = multiq_get, .put = multiq_put, .walk = multiq_walk, - .tcf_chain = multiq_find_tcf, + .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_put, .dump = multiq_dump_class, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 7997363f7e0d..a2404688dd01 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -25,6 +25,7 @@ struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; @@ -145,7 +146,7 @@ prio_destroy(struct Qdisc *sch) int prio; struct prio_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -204,9 +205,16 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) static int prio_init(struct Qdisc *sch, struct nlattr *opt) { + struct prio_sched_data *q = qdisc_priv(sch); + int err; + if (!opt) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + return prio_tune(sch, opt); } @@ -317,14 +325,13 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static const struct Qdisc_class_ops prio_class_ops = { @@ -333,7 +340,7 @@ static const struct Qdisc_class_ops prio_class_ops = { .get = prio_get, .put = prio_put, .walk = prio_walk, - .tcf_chain = prio_find_tcf, + .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_put, .dump = prio_dump_class, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 73c7ac37b570..076ad032befb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -182,6 +182,7 @@ struct qfq_group { struct qfq_sched { struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. 
*/ @@ -582,15 +583,14 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg) qfq_destroy_class(sch, cl); } -static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl) { struct qfq_sched *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, @@ -1438,6 +1438,10 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; @@ -1492,7 +1496,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) struct hlist_node *next; unsigned int i; - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1508,7 +1512,7 @@ static const struct Qdisc_class_ops qfq_class_ops = { .delete = qfq_delete_class, .get = qfq_get_class, .put = qfq_put_class, - .tcf_chain = qfq_tcf_chain, + .tcf_block = qfq_tcf_block, .bind_tcf = qfq_bind_tcf, .unbind_tcf = qfq_unbind_tcf, .graft = qfq_graft_class, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index b287880829e2..9756b1ccd345 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -56,6 +56,7 @@ struct sfb_bins { struct sfb_sched_data { struct Qdisc *qdisc; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; @@ -465,7 +466,7 @@ static void sfb_destroy(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); qdisc_destroy(q->qdisc); } @@ -549,6 +550,11 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) static int sfb_init(struct Qdisc *sch, struct nlattr *opt) { struct sfb_sched_data *q = qdisc_priv(sch); + int err; + + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; q->qdisc = &noop_qdisc; return sfb_change(sch, opt); @@ -657,14 +663,13 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl) { struct sfb_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, @@ -682,7 +687,7 @@ static const struct Qdisc_class_ops sfb_class_ops = { .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, - .tcf_chain = sfb_find_tcf, + .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, .unbind_tcf = sfb_put, .dump = sfb_dump_class, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 53a641f2ccb5..66dfd15b7946 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -126,6 +126,7 @@ struct sfq_sched_data { u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ @@ -697,7 +698,7 @@ static void sfq_destroy(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - 
tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); q->perturb_period = 0; del_timer_sync(&q->perturb_timer); sfq_free(q->ht); @@ -709,6 +710,11 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); int i; + int err; + + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, (unsigned long)sch); @@ -815,14 +821,13 @@ static void sfq_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, @@ -878,7 +883,7 @@ static const struct Qdisc_class_ops sfq_class_ops = { .leaf = sfq_leaf, .get = sfq_get, .put = sfq_put, - .tcf_chain = sfq_find_tcf, + .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, .unbind_tcf = sfq_put, .dump = sfq_dump_class, -- cgit v1.2.3 From fbe9c5b01f97b44b1e4c7d86c092beaf707d4b9d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:56 +0200 Subject: net: sched: rename tcf_destroy_chain helper Make the name consistent with the rest of the helpers around. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d30116f77156..c02b03e0b39e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -187,7 +187,7 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -static void tcf_destroy_chain(struct tcf_proto __rcu **fl) +static void tcf_chain_destroy(struct tcf_proto __rcu **fl) { struct tcf_proto *tp; @@ -214,7 +214,7 @@ void tcf_block_put(struct tcf_block *block) { if (!block) return; - tcf_destroy_chain(block->p_filter_chain); + tcf_chain_destroy(block->p_filter_chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -372,7 +372,7 @@ replay: if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); - tcf_destroy_chain(chain); + tcf_chain_destroy(chain); err = 0; goto errout; } -- cgit v1.2.3 From 9d36d9e545dce53c6fff046b277c261d6568c5b9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:57 +0200 Subject: net: sched: replace nprio by a bool to make the function more readable The use of the "nprio" variable in tc_ctl_tfilter is a bit cryptic and makes a reader wonder what is going on for a while. So help the reader understand this priority allocation dance a little bit better. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- net/sched/cls_api.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c02b03e0b39e..bf2e59cc1174 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -277,7 +277,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct tcmsg *t; u32 protocol; u32 prio; - u32 nprio; + bool prio_allocate; u32 parent; struct net_device *dev; struct Qdisc *q; @@ -306,7 +306,7 @@ replay: t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); - nprio = prio; + prio_allocate = false; parent = t->tcm_parent; cl = 0; @@ -322,6 +322,7 @@ replay: */ if (n->nlmsg_flags & NLM_F_CREATE) { prio = TC_H_MAKE(0x80000000U, 0U); + prio_allocate = true; break; } /* fall-through */ @@ -383,7 +384,7 @@ replay: back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (!nprio || + if (prio_allocate || (tp->protocol != protocol && protocol)) { err = -EINVAL; goto errout; @@ -409,11 +410,11 @@ replay: goto errout; } - if (!nprio) - nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); + if (prio_allocate) + prio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, nprio, parent, q, block); + protocol, prio, parent, q, block); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; -- cgit v1.2.3 From 7961973a0087824fdc9d0303b0033ab79b557278 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:58 +0200 Subject: net: sched: move TC_H_MAJ macro call into tcf_auto_prio Call the helper from the function rather than to always adjust the return value of the function. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index bf2e59cc1174..690457c988b2 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -125,7 +125,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) if (tp) first = tp->prio - 1; - return first; + return TC_H_MAJ(first); } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, @@ -411,7 +411,7 @@ replay: } if (prio_allocate) - prio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); + prio = tcf_auto_prio(rtnl_dereference(*back)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), protocol, prio, parent, q, block); -- cgit v1.2.3 From 2190d1d0944f84c55cdfdb89c7920f8f9311bdde Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:07:59 +0200 Subject: net: sched: introduce helpers to work with filter chains Introduce struct tcf_chain object and set of helpers around it. Wraps up insertion, deletion and search in the filter chain. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 7 ++- net/sched/cls_api.c | 148 +++++++++++++++++++++++++++++++++------------- 2 files changed, 113 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 98cf2f23602d..52bceede534b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -248,10 +248,15 @@ struct qdisc_skb_cb { unsigned char data[QDISC_CB_PRIV_LEN]; }; -struct tcf_block { +struct tcf_chain { + struct tcf_proto __rcu *filter_chain; struct tcf_proto __rcu **p_filter_chain; }; +struct tcf_block { + struct tcf_chain *chain; +}; + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 690457c988b2..fee3d7faeb79 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -106,13 +106,12 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto __rcu **chain, int event) + struct tcf_chain *chain, int event) { - struct tcf_proto __rcu **it_chain; struct tcf_proto *tp; - for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; - it_chain = &tp->next) + for (tp = rtnl_dereference(chain->filter_chain); + tp; tp = rtnl_dereference(tp->next)) tfilter_notify(net, oskb, n, tp, 0, event, false); } @@ -187,26 +186,49 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -static void tcf_chain_destroy(struct tcf_proto __rcu **fl) +static struct tcf_chain *tcf_chain_create(void) +{ + return kzalloc(sizeof(struct tcf_chain), GFP_KERNEL); +} + +static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_proto *tp; - while ((tp = rtnl_dereference(*fl)) != NULL) { - RCU_INIT_POINTER(*fl, tp->next); + while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { + RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); } + kfree(chain); +} + +static void +tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, + struct tcf_proto __rcu **p_filter_chain) +{ + chain->p_filter_chain = p_filter_chain; } int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + int err; if (!block) return -ENOMEM; - block->p_filter_chain = p_filter_chain; + block->chain = tcf_chain_create(); + if (!block->chain) { + err = -ENOMEM; + goto err_chain_create; + } + tcf_chain_filter_chain_ptr_set(block->chain, p_filter_chain); *p_block = block; return 0; + +err_chain_create: + kfree(block); + return err; } EXPORT_SYMBOL(tcf_block_get); @@ -214,7 +236,7 @@ void tcf_block_put(struct tcf_block *block) { if (!block) return; - tcf_chain_destroy(block->p_filter_chain); + tcf_chain_destroy(block->chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -267,6 +289,65 @@ reset: } EXPORT_SYMBOL(tcf_classify); +struct tcf_chain_info { + struct tcf_proto __rcu **pprev; + struct tcf_proto __rcu *next; +}; + +static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info) +{ + return rtnl_dereference(*chain_info->pprev); +} + +static void tcf_chain_tp_insert(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + struct tcf_proto *tp) +{ + if (chain->p_filter_chain && + *chain_info->pprev == chain->filter_chain) + *chain->p_filter_chain = tp; + RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); + rcu_assign_pointer(*chain_info->pprev, tp); +} + 
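/*
 * Illustrative sketch only, not part of the patch above: how the chain_info
 * cursor produced by tcf_chain_tp_find() is meant to be used by its caller.
 * The find step walks the priority-ordered filter list once and remembers the
 * splice point, so tcf_chain_tp_insert() and tcf_chain_tp_remove() can link or
 * unlink a tcf_proto without rescanning the chain. Condensed from the
 * tc_ctl_tfilter() changes later in this patch (the real code only inserts
 * after ->change() succeeds; variable names stand in for the caller's locals):
 *
 *	struct tcf_chain_info chain_info;
 *	struct tcf_proto *tp;
 *
 *	tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate);
 *	if (IS_ERR(tp))
 *		return PTR_ERR(tp);
 *	if (!tp) {
 *		tp = tcf_proto_create(kind, protocol, prio, parent, q, block);
 *		if (IS_ERR(tp))
 *			return PTR_ERR(tp);
 *		tcf_chain_tp_insert(chain, &chain_info, tp);
 *	}
 */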
+static void tcf_chain_tp_remove(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + struct tcf_proto *tp) +{ + struct tcf_proto *next = rtnl_dereference(chain_info->next); + + if (chain->p_filter_chain && tp == chain->filter_chain) + *chain->p_filter_chain = next; + RCU_INIT_POINTER(*chain_info->pprev, next); +} + +static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + u32 protocol, u32 prio, + bool prio_allocate) +{ + struct tcf_proto **pprev; + struct tcf_proto *tp; + + /* Check the chain for existence of proto-tcf with this priority */ + for (pprev = &chain->filter_chain; + (tp = rtnl_dereference(*pprev)); pprev = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (prio_allocate || + (tp->protocol != protocol && protocol)) + return ERR_PTR(-EINVAL); + } else { + tp = NULL; + } + break; + } + } + chain_info->pprev = pprev; + chain_info->next = tp ? tp->next : NULL; + return tp; +} + /* Add/change/delete/get a filter node */ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -281,10 +362,9 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, u32 parent; struct net_device *dev; struct Qdisc *q; - struct tcf_proto __rcu **back; - struct tcf_proto __rcu **chain; + struct tcf_chain_info chain_info; + struct tcf_chain *chain; struct tcf_block *block; - struct tcf_proto *next; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; unsigned long cl; @@ -369,7 +449,7 @@ replay: err = -EINVAL; goto errout; } - chain = block->p_filter_chain; + chain = block->chain; if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); @@ -378,22 +458,11 @@ replay: goto errout; } - /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; - (tp = rtnl_dereference(*back)) != NULL; - back = &tp->next) { - if (tp->prio >= prio) { - if (tp->prio == prio) { - if (prio_allocate || - (tp->protocol != protocol && protocol)) { - err = -EINVAL; - goto errout; - } - } else { - tp = NULL; - } - break; - } + tp = tcf_chain_tp_find(chain, &chain_info, protocol, + prio, prio_allocate); + if (IS_ERR(tp)) { + err = PTR_ERR(tp); + goto errout; } if (tp == NULL) { @@ -411,7 +480,7 @@ replay: } if (prio_allocate) - prio = tcf_auto_prio(rtnl_dereference(*back)); + prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), protocol, prio, parent, q, block); @@ -429,8 +498,7 @@ replay: if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - next = rtnl_dereference(tp->next); - RCU_INIT_POINTER(*back, next); + tcf_chain_tp_remove(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); @@ -459,11 +527,10 @@ replay: err = tp->ops->delete(tp, fh, &last); if (err) goto errout; - next = rtnl_dereference(tp->next); tfilter_notify(net, skb, n, tp, t->tcm_handle, RTM_DELTFILTER, false); if (last) { - RCU_INIT_POINTER(*back, next); + tcf_chain_tp_remove(chain, &chain_info, tp); tcf_proto_destroy(tp); } goto errout; @@ -480,10 +547,8 @@ replay: err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { - if (tp_created) { - RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); - rcu_assign_pointer(*back, tp); - } + if (tp_created) + tcf_chain_tp_insert(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); } else { if (tp_created) @@ -584,7 +649,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct Qdisc *q; struct tcf_block *block; - struct tcf_proto *tp, __rcu **chain; + struct tcf_proto *tp; + struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; @@ -615,11 +681,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) block = cops->tcf_block(q, cl); if (!block) goto errout; - chain = block->p_filter_chain; + chain = block->chain; s_t = cb->args[0]; - for (tp = rtnl_dereference(*chain), t = 0; + for (tp = rtnl_dereference(chain->filter_chain), t = 0; tp; tp = rtnl_dereference(tp->next), t++) { if (t < s_t) continue; -- cgit v1.2.3 From acb31fae3b352b0b9eba7cefe1f669ad639c41d9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:00 +0200 Subject: net: sched: push chain dump to a separate function Since there will be multiple chains to dump, push chain dumping code to a separate function. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 95 +++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index fee3d7faeb79..63aa2ea5f00c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -640,21 +640,65 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, RTM_NEWTFILTER); } +static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, + struct netlink_callback *cb, + long index_start, long *p_index) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct tcf_dump_args arg; + struct tcf_proto *tp; + + for (tp = rtnl_dereference(chain->filter_chain); + tp; tp = rtnl_dereference(tp->next), (*p_index)++) { + if (*p_index < index_start) + continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (*p_index > index_start) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(net, skb, tp, 0, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) + break; + + cb->args[1] = 1; + } + if (!tp->ops->walk) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1] - 1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count + 1; + if (arg.w.stop) + break; + } +} + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int t; - int s_t; struct net_device *dev; struct Qdisc *q; struct tcf_block *block; - struct tcf_proto *tp; struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; - struct tcf_dump_args arg; + long index_start; + long index; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; @@ -683,45 +727,10 @@ static int tc_dump_tfilter(struct sk_buff 
*skb, struct netlink_callback *cb) goto errout; chain = block->chain; - s_t = cb->args[0]; - - for (tp = rtnl_dereference(chain->filter_chain), t = 0; tp; tp = rtnl_dereference(tp->next), t++) { - if (t < s_t) - continue; - if (TC_H_MAJ(tcm->tcm_info) && - TC_H_MAJ(tcm->tcm_info) != tp->prio) - continue; - if (TC_H_MIN(tcm->tcm_info) && - TC_H_MIN(tcm->tcm_info) != tp->protocol) - continue; - if (t > s_t) - memset(&cb->args[1], 0, - sizeof(cb->args)-sizeof(cb->args[0])); - if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER) <= 0) - break; - - cb->args[1] = 1; - } - if (tp->ops->walk == NULL) - continue; - arg.w.fn = tcf_node_dump; - arg.skb = skb; - arg.cb = cb; - arg.w.stop = 0; - arg.w.skip = cb->args[1] - 1; - arg.w.count = 0; - tp->ops->walk(tp, &arg.w); - cb->args[1] = arg.w.count + 1; - if (arg.w.stop) - break; - } - - cb->args[0] = t; + index_start = cb->args[0]; + index = 0; + tcf_chain_dump(chain, skb, cb, index_start, &index); + cb->args[0] = index; errout: if (cl) -- cgit v1.2.3 From 5bc1701881e395cec51811d07ec6961f3d1b2612 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:01 +0200 Subject: net: sched: introduce multichain support for filters Instead of having only one filter chain per block, introduce a list of chains for every block. Create chain 0 by default. UAPI is extended so the user can specify which chain he wants to change. If the new attribute is not specified, chain 0 is used. That allows backward compatibility to be maintained. If the chain does not exist and the user wants to manipulate it, a new chain is created with the specified index. Also, when the last filter is removed from the chain, the chain is destroyed. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 2 + include/net/sch_generic.h | 9 +++- include/uapi/linux/rtnetlink.h | 1 + net/sched/cls_api.c | 104 ++++++++++++++++++++++++++++++++++------- 4 files changed, 98 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e56e7157c280..2c213a69c196 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -18,6 +18,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index); +void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain); void tcf_block_put(struct tcf_block *block); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52bceede534b..569b5654c30c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -236,7 +237,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; - struct tcf_block *block; + struct tcf_chain *chain; struct rcu_head rcu; }; @@ -251,10 +252,14 @@ struct qdisc_skb_cb { struct tcf_chain { struct tcf_proto __rcu *filter_chain; struct tcf_proto __rcu **p_filter_chain; + struct list_head list; + struct tcf_block *block; + u32 index; /* chain index */ + unsigned int refcnt; }; struct tcf_block { - struct tcf_chain *chain; + struct list_head chain_list; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cce061382e40..6487b21b2c1e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -549,6 +549,7 @@ enum { TCA_STAB, TCA_PAD, TCA_DUMP_INVISIBLE, + TCA_CHAIN, __TCA_MAX }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 63aa2ea5f00c..adacaf299c4a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -129,7 +129,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, u32 parent, struct Qdisc *q, - struct tcf_block *block) + struct tcf_chain *chain) { struct tcf_proto *tp; int err; @@ -165,7 +165,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->prio = prio; tp->classid = parent; tp->q = q; - tp->block = block; + tp->chain = chain; err = tp->ops->init(tp); if (err) { @@ -186,15 +186,26 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -static struct tcf_chain *tcf_chain_create(void) +static struct tcf_chain *tcf_chain_create(struct tcf_block *block, + u32 chain_index) { - return kzalloc(sizeof(struct tcf_chain), GFP_KERNEL); + struct tcf_chain *chain; + + chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (!chain) + return NULL; + list_add_tail(&chain->list, &block->chain_list); + chain->block = block; + chain->index = chain_index; + chain->refcnt = 1; + return chain; } static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_proto *tp; + list_del(&chain->list); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); @@ -202,6 +213,30 @@ static void tcf_chain_destroy(struct tcf_chain *chain) kfree(chain); } +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain; + + 
list_for_each_entry(chain, &block->chain_list, list) { + if (chain->index == chain_index) { + chain->refcnt++; + return chain; + } + } + return tcf_chain_create(block, chain_index); +} +EXPORT_SYMBOL(tcf_chain_get); + +void tcf_chain_put(struct tcf_chain *chain) +{ + /* Destroy unused chain, with exception of chain 0, which is the + * default one and has to be always present. + */ + if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + tcf_chain_destroy(chain); +} +EXPORT_SYMBOL(tcf_chain_put); + static void tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, struct tcf_proto __rcu **p_filter_chain) @@ -213,16 +248,19 @@ int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + struct tcf_chain *chain; int err; if (!block) return -ENOMEM; - block->chain = tcf_chain_create(); - if (!block->chain) { + INIT_LIST_HEAD(&block->chain_list); + /* Create chain 0 by default, it has to be always present. */ + chain = tcf_chain_create(block, 0); + if (!chain) { err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(block->chain, p_filter_chain); + tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); *p_block = block; return 0; @@ -234,9 +272,13 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put(struct tcf_block *block) { + struct tcf_chain *chain, *tmp; + if (!block) return; - tcf_chain_destroy(block->chain); + + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_destroy(chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -360,10 +402,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, u32 prio; bool prio_allocate; u32 parent; + u32 chain_index; struct net_device *dev; struct Qdisc *q; struct tcf_chain_info chain_info; - struct tcf_chain *chain; + struct tcf_chain *chain = NULL; struct tcf_block *block; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; @@ -449,7 +492,17 @@ replay: err = -EINVAL; goto errout; } - chain = block->chain; + + chain_index = tca[TCA_CHAIN] ? 
nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + err = -EINVAL; + goto errout; + } + chain = tcf_chain_get(block, chain_index); + if (!chain) { + err = -ENOMEM; + goto errout; + } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); @@ -483,7 +536,7 @@ replay: prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, parent, q, block); + protocol, prio, parent, q, chain); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; @@ -556,6 +609,8 @@ replay: } errout: + if (chain) + tcf_chain_put(chain); if (cl) cops->put(q, cl); if (err == -EAGAIN) @@ -584,6 +639,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; @@ -640,7 +697,7 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, RTM_NEWTFILTER); } -static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { @@ -667,7 +724,7 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) - break; + return false; cb->args[1] = 1; } @@ -682,14 +739,16 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, tp->ops->walk(tp, &arg.w); cb->args[1] = arg.w.count + 1; if (arg.w.stop) - break; + return false; } + return true; } /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q; struct tcf_block *block; @@ -699,9 +758,15 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; @@ -725,11 +790,18 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) block = cops->tcf_block(q, cl); if (!block) goto errout; - chain = block->chain; index_start = cb->args[0]; index = 0; - tcf_chain_dump(chain, skb, cb, index_start, &index); + + list_for_each_entry(chain, &block->chain_list, list) { + if (tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index) + continue; + if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + break; + } + cb->args[0] = index; errout: -- cgit v1.2.3 From 9fb9f251d229f6cabd9dbe4214eb7f1e6a4e8a9d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:02 +0200 Subject: net: sched: push tp down to action init Tp pointer will be needed by the next patch in order to get the chain. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/act_api.h | 12 ++++++------ net/sched/act_api.c | 15 ++++++++------- net/sched/cls_api.c | 9 +++++---- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index cfa2ae33da9a..b22c6f3d6710 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -180,12 +180,12 @@ int tcf_unregister_action(struct tc_action_ops *a, int tcf_action_destroy(struct list_head *actions, int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); -int tcf_action_init(struct net *net, struct nlattr *nla, - struct nlattr *est, char *n, int ovr, - int bind, struct list_head *); -struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, - struct nlattr *est, char *n, int ovr, - int bind); +int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, int bind, + struct list_head *actions); +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a90e8f355c00..e389eb45b484 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -570,9 +570,9 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } -struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, - int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -680,8 +680,9 @@ static void cleanup_a(struct list_head *actions, int ovr) a->tcfa_refcnt--; } -int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct list_head *actions) +int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, int bind, + struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -693,7 +694,7 @@ int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1020,7 +1021,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); if (ret) return ret; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index adacaf299c4a..9e0c4bb82528 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -832,8 +832,9 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct tc_action *act; if (exts->police && tb[exts->police]) { - act = tcf_action_init_1(net, tb[exts->police], rate_tlv, - "police", ovr, TCA_ACT_BIND); + act = tcf_action_init_1(net, tp, tb[exts->police], + rate_tlv, "police", ovr, + TCA_ACT_BIND); if (IS_ERR(act)) 
return PTR_ERR(act); @@ -844,8 +845,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, LIST_HEAD(actions); int err, i = 0; - err = tcf_action_init(net, tb[exts->action], rate_tlv, - NULL, ovr, TCA_ACT_BIND, + err = tcf_action_init(net, tp, tb[exts->action], + rate_tlv, NULL, ovr, TCA_ACT_BIND, &actions); if (err) return err; -- cgit v1.2.3 From db50514f9a9c7ef1f17e9921b1cc0902746872f3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:03 +0200 Subject: net: sched: add termination action to allow goto chain Introduce new type of termination action called "goto_chain". This allows user to specify a chain to be processed. This action type is then processed as a return value in tcf_classify loop in similar way as "reclassify" is, only it does not reset to the first filter in chain but rather reset to the first filter of the desired chain. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + include/net/sch_generic.h | 9 +++++++-- include/uapi/linux/pkt_cls.h | 1 + net/sched/act_api.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 6 +++++- 5 files changed, 54 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index b22c6f3d6710..26ffd8333f50 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -42,6 +42,7 @@ struct tc_action { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie *act_cookie; + struct tcf_chain *goto_chain; }; #define tcf_head common.tcfa_head #define tcf_index common.tcfa_index diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 569b5654c30c..368850194c94 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -193,8 +193,13 @@ struct Qdisc_ops { struct tcf_result { - unsigned long class; - u32 classid; + union { + struct { + unsigned long class; + u32 classid; + }; + const struct tcf_proto *goto_tp; + }; }; struct tcf_proto_ops { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index d613be3b3239..1b9aa9e6b4fd 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -51,6 +51,7 @@ enum { (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) #define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e389eb45b484..0ecf2a858767 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -28,6 +28,31 @@ #include #include +static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) +{ + u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; + + if (!tp) + return -EINVAL; + a->goto_chain = tcf_chain_get(tp->chain->block, chain_index); + if (!a->goto_chain) + return -ENOMEM; + return 0; +} + +static void tcf_action_goto_chain_fini(struct tc_action *a) +{ + tcf_chain_put(a->goto_chain); +} + +static void tcf_action_goto_chain_exec(const struct tc_action *a, + struct tcf_result *res) +{ + const struct tcf_chain *chain = a->goto_chain; + + res->goto_tp = rcu_dereference_bh(chain->filter_chain); +} + static void free_tcf(struct rcu_head *head) { struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); @@ -39,6 +64,8 @@ static void free_tcf(struct rcu_head *head) kfree(p->act_cookie->data); kfree(p->act_cookie); } + if (p->goto_chain) + tcf_action_goto_chain_fini(p); kfree(p); } @@ -465,6 
+492,8 @@ repeat: else /* faulty graph, stop pipeline */ return TC_ACT_OK; } + } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { + tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) @@ -657,6 +686,17 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { + err = tcf_action_goto_chain_init(a, tp); + if (err) { + LIST_HEAD(actions); + + list_add_tail(&a->list, &actions); + tcf_action_destroy(&actions, bind); + return ERR_PTR(err); + } + } + return a; err_mod: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9e0c4bb82528..4020b8d932a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -307,8 +307,12 @@ reclassify: err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { goto reset; + } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { + old_tp = res->goto_tp; + goto reset; + } #endif if (err >= 0) return err; -- cgit v1.2.3 From 9d4f97f97bb8adc47f569d995402c33de9a4fa19 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 May 2017 16:03:16 -0400 Subject: sch_dsmark: Fix uninitialized variable warning. We still need to initialize err to -EINVAL for the case where 'opt' is NULL in dsmark_init(). Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index ba45102cff94..7ccdd825d34e 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -333,7 +333,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err; + int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; int i; -- cgit v1.2.3 From 385e20706facd376f27863bd55b7cc7720d3f27b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:00 -0700 Subject: tcp: use tp->tcp_mstamp in output path Idea is to later convert tp->tcp_mstamp to a full u64 counter using usec resolution, so that we can later have fine grained TCP TS clock (RFC 7323), regardless of HZ value. We try to refresh tp->tcp_mstamp only when necessary. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 21 +++++++++++---------- net/ipv4/tcp_recovery.c | 1 - net/ipv4/tcp_timer.c | 3 ++- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ab2aac5ca19..d8fe25db79f2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -483,6 +483,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); + skb_mstamp_get(&tp->tcp_mstamp); remaining = icsk->icsk_rto - min(icsk->icsk_rto, tcp_time_stamp - tcp_skb_timestamp(skb)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a32172d69a03..4c8a6eaba6b3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -997,8 +997,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); + skb->skb_mstamp = tp->tcp_mstamp; if (clone_it) { - skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; tcp_rate_skb_sent(sk, skb); @@ -1906,7 +1906,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, const struct inet_connection_sock *icsk = inet_csk(sk); u32 age, send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; struct sk_buff *head; int win_divisor; @@ -1962,8 +1961,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, } head = tcp_write_queue_head(sk); - skb_mstamp_get(&now); - age = skb_mstamp_us_delta(&now, &head->skb_mstamp); + + age = skb_mstamp_us_delta(&tp->tcp_mstamp, &head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -2280,6 +2279,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); + skb_mstamp_get(&tp->tcp_mstamp); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2291,7 +2291,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; goto repair; /* Skip network transmission */ } @@ -2879,7 +2879,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; @@ -3095,7 +3095,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); - skb_mstamp_get(&skb->skb_mstamp); + skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3453,7 +3453,8 @@ int tcp_connect(struct sock *sk) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = tcp_time_stamp; + skb_mstamp_get(&tp->tcp_mstamp); + tp->retrans_stamp = tp->tcp_mstamp.stamp_jiffies; tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); @@ -3572,7 +3573,6 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. 
*/ - skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); } EXPORT_SYMBOL_GPL(tcp_send_ack); @@ -3606,15 +3606,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - skb_mstamp_get(&skb->skb_mstamp); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } +/* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 362b8c75bfab..cd72b3d3879e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -166,7 +166,6 @@ void tcp_rack_reo_timeout(struct sock *sk) u32 timeout, prior_inflight; prior_inflight = tcp_packets_in_flight(tp); - skb_mstamp_get(&tp->tcp_mstamp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 86934bcf685a..ec7c5473c788 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -339,7 +339,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(tcp_send_head(sk)); if (!start_ts) - skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; @@ -561,6 +561,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); event = icsk->icsk_pending; switch (event) { -- cgit v1.2.3 From d011b9a448907833a19b2f0a34381419f8ca9b23 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:02 -0700 Subject: dccp: do not use tcp_time_stamp Use our own macro instead of abusing tcp_time_stamp Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 8 ++++---- net/dccp/ccids/ccid2.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5e3a7302f774..e1295d5f2c56 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -233,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_time_stamp; + const u32 now = ccid2_jiffies32; struct ccid2_seq *next; /* slow-start after idle periods (RFC 2581, RFC 2861) */ @@ -466,7 +466,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, * The cleanest solution is to not use the ccid2s_sent field at all * and instead use DCCP timestamps: requires changes in other places. */ - ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); + ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -478,7 +478,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) return; } - hc->tx_last_cong = ccid2_time_stamp; + hc->tx_last_cong = ccid2_jiffies32; hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -731,7 +731,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp; + hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; hc->tx_cwnd_used = 0; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 18c97543e522..6e50ef2898fb 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -27,7 +27,7 @@ * CCID-2 timestamping faces the same issues as TCP timestamping. * Hence we reuse/share as much of the code as possible. */ -#define ccid2_time_stamp tcp_time_stamp +#define ccid2_jiffies32 ((u32)jiffies) /* NUMDUPACK parameter from RFC 4341, p. 6 */ #define NUMDUPACK 3 -- cgit v1.2.3 From d635fbe27ebee0f4b845abe5e9620c9400785a5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:03 -0700 Subject: tcp: use tcp_jiffies32 to feed tp->lsndtime Use tcp_jiffies32 instead of tcp_time_stamp to feed tp->lsndtime. tcp_time_stamp will soon be a litle bit more expensive than simply reading 'jiffies'. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cubic.c | 2 +- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4b45be570821..feba4c0406e5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1245,7 +1245,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; - delta = tcp_time_stamp - tp->lsndtime; + delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e4c76d2b827..d0bb61ee28bb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2841,7 +2841,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; - now = tcp_time_stamp; + now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0683ba447d77..2052ca740916 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -155,7 +155,7 @@ static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; s32 delta; delta = now - tcp_sk(sk)->lsndtime; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06e2dbc2b4a2..c0b3f909df39 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) /* Prevent spurious tcp_cwnd_restart() on first data * packet. 
*/ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_init_buffer_space(sk); @@ -6008,7 +6008,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4c8a6eaba6b3..be9f8f483e21 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -1918,7 +1918,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Avoid bursty behavior by allowing defer * only if the last write was recent. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ec7c5473c788..5f6f219a431e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -63,7 +63,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -73,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; -- cgit v1.2.3 From c2203cf75ed7dfab8dfc7ac915a726880ee7512f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:04 -0700 Subject: tcp: use tcp_jiffies32 to feed tp->snd_cwnd_stamp Use tcp_jiffies32 instead of tcp_time_stamp to feed tp->snd_cwnd_stamp. tcp_time_stamp will soon be a litle bit more expensive than simply reading 'jiffies'. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++------- net/ipv4/tcp_metrics.c | 2 +- net/ipv4/tcp_output.c | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c0b3f909df39..6a15c9b80b09 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -463,7 +463,7 @@ void tcp_init_buffer_space(struct sock *sk) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } /* 5. Recalculate window clamp after socket hit its memory bounds. 
*/ @@ -1954,7 +1954,7 @@ void tcp_enter_loss(struct sock *sk) } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->retrans_out = 0; tp->lost_out = 0; @@ -2383,7 +2383,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tcp_ecn_withdraw_cwr(tp); } } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; } @@ -2520,7 +2520,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -2590,7 +2590,7 @@ static void tcp_mtup_probe_success(struct sock *sk) tcp_mss_to_mtu(sk, tp->mss_cache) / icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; @@ -2976,7 +2976,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); - tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; + tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; } /* Restart timer after forward progress on connection. @@ -5019,7 +5019,7 @@ static void tcp_new_space(struct sock *sk) if (tcp_should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } sk->sk_write_space(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 653bbd67e3a3..102b2c90bb80 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -524,7 +524,7 @@ reset: tp->snd_cwnd = 1; else tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be9f8f483e21..4bd50f0b236b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } @@ -1576,7 +1576,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) } tp->snd_cwnd_used = 0; } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) @@ -1597,14 +1597,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. 
*/ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (sysctl_tcp_slow_start_after_idle && - (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && + (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); -- cgit v1.2.3 From 2660bfa84e9236016f3a4f21b7864431d9663338 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:05 -0700 Subject: tcp_bbr: use tcp_jiffies32 instead of tcp_time_stamp Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 92b045c72163..40dc4fc5f6ac 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -730,12 +730,12 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bool filter_expired; /* Track min RTT seen in the min_rtt_win_sec filter window: */ - filter_expired = after(tcp_time_stamp, + filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { bbr->min_rtt_us = rs->rtt_us; - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; } if (bbr_probe_rtt_mode_ms > 0 && filter_expired && @@ -754,7 +754,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) /* Maintain min packets in flight for max(200 ms, 1 round). */ if (!bbr->probe_rtt_done_stamp && tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { - bbr->probe_rtt_done_stamp = tcp_time_stamp + + bbr->probe_rtt_done_stamp = tcp_jiffies32 + msecs_to_jiffies(bbr_probe_rtt_mode_ms); bbr->probe_rtt_round_done = 0; bbr->next_rtt_delivered = tp->delivered; @@ -762,8 +762,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr->round_start) bbr->probe_rtt_round_done = 1; if (bbr->probe_rtt_round_done && - after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { - bbr->min_rtt_stamp = tcp_time_stamp; + after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_jiffies32; bbr->restore_cwnd = 1; /* snap to prior_cwnd */ bbr_reset_mode(sk); } @@ -810,7 +810,7 @@ static void bbr_init(struct sock *sk) bbr->probe_rtt_done_stamp = 0; bbr->probe_rtt_round_done = 0; bbr->min_rtt_us = tcp_min_rtt(tp); - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ -- cgit v1.2.3 From ac35f562203a45a04f79f412509df48857f928be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:06 -0700 Subject: tcp: bic, cubic: use tcp_jiffies32 instead of tcp_time_stamp Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_bic.c | 6 +++--- net/ipv4/tcp_cubic.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 36087bca9f48..609965f0e298 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -84,14 +84,14 @@ static void bictcp_init(struct sock *sk) static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ - ca->epoch_start = tcp_time_stamp; + ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 2052ca740916..57ae5b5ae643 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -231,21 +231,21 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; /* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt. */ - if (ca->epoch_start && tcp_time_stamp == ca->last_time) + if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) { - ca->epoch_start = tcp_time_stamp; /* record beginning */ + ca->epoch_start = tcp_jiffies32; /* record beginning */ ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ @@ -276,7 +276,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) * if the cwnd < 1 million packets !!! */ - t = (s32)(tcp_time_stamp - ca->epoch_start); + t = (s32)(tcp_jiffies32 - ca->epoch_start); t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; @@ -448,7 +448,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) return; /* Discard delay samples right after fast recovery */ - if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; delay = (sample->rtt_us << 3) / USEC_PER_MSEC; -- cgit v1.2.3 From 70eabf0e1b8fe11519f793416655266605f700b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:07 -0700 Subject: tcp: use tcp_jiffies32 for rcv_tstamp and lrcvtime Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index feba4c0406e5..5b2932b8363f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1307,8 +1307,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; - return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime, - tcp_time_stamp - tp->rcv_tstamp); + return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, + tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a15c9b80b09..eeb4967df25a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -672,7 +672,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_rcv_rtt_measure(tp); - now = tcp_time_stamp; + now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize @@ -3636,7 +3636,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) */ sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -5554,7 +5554,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct inet_connection_sock *icsk = inet_csk(sk); tcp_set_state(sk, TCP_ESTABLISHED); - icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.lrcvtime = tcp_jiffies32; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 717be4de5324..59c32e0086c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -447,7 +447,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_time_stamp; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->packets_out = 0; newtp->retrans_out = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4bd50f0b236b..cbda5de16449 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3324,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; else - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5f6f219a431e..9e0616cb8c17 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -451,7 +451,7 @@ void tcp_retransmit_timer(struct sock *sk) tp->snd_una, tp->snd_nxt); } #endif - if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); goto out; } -- cgit v1.2.3 From c74df29a8d119a09ccc5e50265e3383c76278f3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:08 -0700 Subject: tcp: use tcp_jiffies32 to feed probe_timestamp Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cbda5de16449..f0fd1b4fdb32 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1475,7 +1475,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1987,7 +1987,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) s32 delta; interval = net->ipv4.sysctl_tcp_probe_interval; - delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); @@ -1999,7 +1999,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 9e0616cb8c17..6629f47aa7f0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -115,7 +115,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); -- cgit v1.2.3 From 628174ccc45f648b83374d0a5bd554b0a88522ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:09 -0700 Subject: tcp: uses jiffies_32 to feed tp->chrono_start tcp_time_stamp will no longer be tied to jiffies. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d0bb61ee28bb..b85bfe7cb11d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2757,7 +2757,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) - stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f0fd1b4fdb32..1011ea40c2ba 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,7 +2202,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tp->chrono_type > TCP_CHRONO_UNSPEC) tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; -- cgit v1.2.3 From 594208afe40c448faca967235691ec04fe9f57e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:10 -0700 Subject: tcp: use tcp_jiffies32 in __tcp_oow_rate_limited() This place wants to use tcp_jiffies32, this is good enough. 
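Several of the conversions in this series rely on the same wrap-safe idiom for elapsed-time checks against a free-running 32-bit tick counter: subtract the saved stamp from tcp_jiffies32 using unsigned arithmetic, then interpret the difference as signed. A minimal stand-alone sketch of that idiom follows; it is illustrative only, uses plain C99 types rather than the kernel's u32/s32, and the helper name is hypothetical.

#include <stdint.h>

/*
 * Wrap-safe "at least 'delta' ticks have elapsed since 'stamp'" test for a
 * free-running 32-bit counter such as tcp_jiffies32. The unsigned subtraction
 * absorbs wraparound; the signed interpretation keeps the comparison valid
 * for intervals shorter than 2^31 ticks.
 */
static inline int ticks_elapsed_at_least(uint32_t now, uint32_t stamp,
					 uint32_t delta)
{
	return (int32_t)(now - stamp) >= (int32_t)delta;
}

This mirrors tests such as (s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX in tcp_out_of_resources() earlier in the series and the (s32)(tcp_jiffies32 - *last_oow_ack_time) check in this __tcp_oow_rate_limited() patch.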
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeb4967df25a..85575888365a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3390,7 +3390,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); @@ -3398,7 +3398,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, } } - *last_oow_ack_time = tcp_time_stamp; + *last_oow_ack_time = tcp_jiffies32; return false; /* not rate-limited: go ahead, send dupack now! */ } -- cgit v1.2.3 From ad5ad69e6b48a7e5cc0391cc57c9e8a93a0c969c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:11 -0700 Subject: tcp_westwood: use tcp_jiffies32 instead of tcp_time_stamp This CC does not need 1 ms tcp_time_stamp and can use the jiffy based 'timestamp'. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 9775453b8d17..bec9cafbe3f9 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -68,7 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } @@ -116,7 +116,7 @@ static void tcp_westwood_pkts_acked(struct sock *sk, static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); - s32 delta = tcp_time_stamp - w->rtt_win_sx; + s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first @@ -140,7 +140,7 @@ static void westwood_update_window(struct sock *sk) westwood_filter(w, delta); w->bk = 0; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; } } -- cgit v1.2.3 From 46bf466f08c9db0db1b77d3ecb5694926c73583a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:12 -0700 Subject: tcp_lp: cache tcp_time_stamp tcp_time_stamp will become slightly more expensive soon, cache its value. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_lp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index d6fb6c067af4..ef3122abb373 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -264,18 +264,19 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + u32 now = tcp_time_stamp; u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ - delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + delta = now - tp->rx_opt.rcv_tsecr; if ((s32)delta > 0) lp->inference = 3 * delta; /* test if within inference */ - if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + if (lp->last_drop && (now - lp->last_drop < lp->inference)) lp->flag |= LP_WITHIN_INF; else lp->flag &= ~LP_WITHIN_INF; @@ -312,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); /* record this drop time */ - lp->last_drop = tcp_time_stamp; + lp->last_drop = now; } static struct tcp_congestion_ops tcp_lp __read_mostly = { -- cgit v1.2.3 From ac9517fcf310327fa3e3b0d8366e4b11236b1b4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:13 -0700 Subject: tcp: replace misc tcp_time_stamp to tcp_jiffies32 After this patch, all uses of tcp_time_stamp will require a change when we introduce 1 ms and/or 1 us TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b85bfe7cb11d..850054800526 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -386,7 +386,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a4d8e76738f..3eb78cde6ff0 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -104,7 +104,7 @@ static void measure_achieved_throughput(struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; if (icsk->icsk_ca_state == TCP_CA_Open) ca->pkts_acked = sample->pkts_acked; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85575888365a..10e6775464f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2911,7 +2911,7 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) struct tcp_sock *tp = tcp_sk(sk); u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; - minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? 
: jiffies_to_usecs(1)); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 59c32e0086c0..6504f1082bdf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,7 +445,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_ack.lrcvtime = tcp_jiffies32; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1011ea40c2ba..65472e931a0b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2418,10 +2418,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) timeout = max_t(u32, timeout, msecs_to_jiffies(10)); /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_time_stamp + timeout; + tlp_time_stamp = tcp_jiffies32 + timeout; rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_time_stamp; + s32 delta = rto_time_stamp - tcp_jiffies32; if (delta > 0) timeout = delta; } -- cgit v1.2.3 From 9a568de4818dea9a05af141046bd3e589245ab83 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:14 -0700 Subject: tcp: switch TCP TS option (RFC 7323) to 1ms clock TCP Timestamps option is defined in RFC 7323 Traditionally on linux, it has been tied to the internal 'jiffies' variable, because it had been a cheap and good enough generator. For TCP flows on the Internet, 1 ms resolution would be much better than 4ms or 10ms (HZ=250 or HZ=100 respectively) For TCP flows in the DC, Google has used usec resolution for more than two years with great success [1] Receive size autotuning (DRS) is indeed more precise and converges faster to optimal window size. This patch converts tp->tcp_mstamp to a plain u64 value storing a 1 usec TCP clock. This choice will allow us to upstream the 1 usec TS option as discussed in IETF 97. [1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 62 +------------------------- include/linux/tcp.h | 22 ++++----- include/net/tcp.h | 59 ++++++++++++++++++++---- net/ipv4/syncookies.c | 8 ++-- net/ipv4/tcp.c | 4 +- net/ipv4/tcp_bbr.c | 22 ++++----- net/ipv4/tcp_input.c | 96 ++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 17 +++---- net/ipv4/tcp_lp.c | 12 ++--- net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 16 +++---- net/ipv4/tcp_rate.c | 16 +++---- net/ipv4/tcp_recovery.c | 23 +++++----- net/ipv4/tcp_timer.c | 8 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 4 +- net/netfilter/nf_synproxy_core.c | 2 +- 17 files changed, 178 insertions(+), 199 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bfc7892f6c33..7c0cb2ce8b01 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -506,66 +506,6 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif -/** - * struct skb_mstamp - multi resolution time stamps - * @stamp_us: timestamp in us resolution - * @stamp_jiffies: timestamp in jiffies - */ -struct skb_mstamp { - union { - u64 v64; - struct { - u32 stamp_us; - u32 stamp_jiffies; - }; - }; -}; - -/** - * skb_mstamp_get - get current timestamp - * @cl: place to store timestamps - */ -static inline void skb_mstamp_get(struct skb_mstamp *cl) -{ - u64 val = local_clock(); - - do_div(val, NSEC_PER_USEC); - cl->stamp_us = (u32)val; - cl->stamp_jiffies = (u32)jiffies; -} - -/** - * skb_mstamp_delta - compute the difference in usec between two skb_mstamp - * @t1: pointer to newest sample - * @t0: pointer to oldest sample - */ -static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, - const struct skb_mstamp *t0) -{ - s32 delta_us = t1->stamp_us - t0->stamp_us; - u32 delta_jiffies = t1->stamp_jiffies - t0->stamp_jiffies; - - /* If delta_us is negative, this might be because interval is too big, - * or local_clock() drift is too big : fallback using jiffies. - */ - if (delta_us <= 0 || - delta_jiffies >= (INT_MAX / (USEC_PER_SEC / HZ))) - - delta_us = jiffies_to_usecs(delta_jiffies); - - return delta_us; -} - -static inline bool skb_mstamp_after(const struct skb_mstamp *t1, - const struct skb_mstamp *t0) -{ - s32 diff = t1->stamp_jiffies - t0->stamp_jiffies; - - if (!diff) - diff = t1->stamp_us - t0->stamp_us; - return diff > 0; -} - /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -646,7 +586,7 @@ struct sk_buff { union { ktime_t tstamp; - struct skb_mstamp skb_mstamp; + u64 skb_mstamp; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 22854f028434..542ca1ae02c4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -123,7 +123,7 @@ struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; - struct skb_mstamp snt_synack; /* first SYNACK sent time */ + u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; u32 txhash; u32 rcv_isn; @@ -211,7 +211,7 @@ struct tcp_sock { /* Information of the most recently (s)acked skb */ struct tcp_rack { - struct skb_mstamp mstamp; /* (Re)sent time of the skb */ + u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u8 advanced; /* mstamp advanced since last lost marking */ @@ -240,7 +240,7 @@ struct tcp_sock { u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. 
*/ /* RTT measurement */ - struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */ + u64 tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ @@ -280,8 +280,8 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ - struct skb_mstamp first_tx_mstamp; /* start of window send phase */ - struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ + u64 first_tx_mstamp; /* start of window send phase */ + u64 delivered_mstamp; /* time we reached "delivered" */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ @@ -335,16 +335,16 @@ struct tcp_sock { /* Receiver side RTT estimation */ struct { - u32 rtt_us; - u32 seq; - struct skb_mstamp time; + u32 rtt_us; + u32 seq; + u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { - int space; - u32 seq; - struct skb_mstamp time; + int space; + u32 seq; + u64 time; } rcvq_space; /* TCP-specific MTU probe information. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2932b8363f..82462db97183 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -519,7 +519,7 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); -__u32 cookie_init_timestamp(struct request_sock *req); +u64 cookie_init_timestamp(struct request_sock *req); bool cookie_timestamp_decode(struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); @@ -706,14 +706,55 @@ void tcp_send_window_probe(struct sock *sk); */ #define tcp_jiffies32 ((u32)jiffies) -/* Generator for TCP TS option (RFC 7323) - * Currently tied to 'jiffies' but will soon be driven by 1 ms clock. +/* + * Deliver a 32bit value for TCP timestamp option (RFC 7323) + * It is no longer tied to jiffies, but to 1 ms clock. + * Note: double check if you want to use tcp_jiffies32 instead of this. + */ +#define TCP_TS_HZ 1000 + +static inline u64 tcp_clock_ns(void) +{ + return local_clock(); +} + +static inline u64 tcp_clock_us(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_USEC); +} + +/* This should only be used in contexts where tp->tcp_mstamp is up to date */ +static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +{ + return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); +} + +/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ +static inline u32 tcp_time_stamp_raw(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); +} + + +/* Refresh 1us clock of a TCP socket, + * ensuring monotically increasing values. 
*/ -#define tcp_time_stamp ((__u32)(jiffies)) +static inline void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_us(); + + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + +static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +{ + return max_t(s64, t1 - t0, 0); +} static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { - return skb->skb_mstamp.stamp_jiffies; + return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ); } @@ -778,9 +819,9 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - struct skb_mstamp first_tx_mstamp; + u64 first_tx_mstamp; /* when we reached the "delivered" count */ - struct skb_mstamp delivered_mstamp; + u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -896,7 +937,7 @@ struct ack_sample { * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { - struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ + u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ @@ -1862,7 +1903,7 @@ void tcp_init(void); /* tcp_recovery.c */ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time); + u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); /* diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0257d965f111..6426250a58ea 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -66,10 +66,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -__u32 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp; + u32 ts, ts_now = tcp_time_stamp_raw(); u32 options = 0; ireq = inet_rsk(req); @@ -88,7 +88,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return ts; + return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); } @@ -343,7 +343,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; - treq->snt_synack.v64 = 0; + treq->snt_synack = 0; treq->tfo_listener = false; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 850054800526..b5d18484746d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2706,7 +2706,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EPERM; else - tp->tsoffset = val - tcp_time_stamp; + tp->tsoffset = val - tcp_time_stamp_raw(); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -3072,7 +3072,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp + tp->tsoffset; + val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 40dc4fc5f6ac..dbcc9352a48f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -91,7 +91,7 @@ struct bbr { struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ u32 rtt_cnt; /* count of packet-timed rounds elapsed */ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ - struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u64 cycle_mstamp; /* time of this cycle phase start */ u32 mode:3, /* current bbr_mode in state machine */ prev_ca_state:3, /* CA state on previous ACK */ packet_conservation:1, /* use packet conservation? */ @@ -411,7 +411,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); bool is_full_length = - skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > bbr->min_rtt_us; u32 inflight, bw; @@ -497,7 +497,7 @@ static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); bbr->lt_last_delivered = tp->delivered; bbr->lt_last_lost = tp->lost; bbr->lt_rtt_cnt = 0; @@ -551,7 +551,7 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) struct bbr *bbr = inet_csk_ca(sk); u32 lost, delivered; u64 bw; - s32 t; + u32 t; if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ if (bbr->mode == BBR_PROBE_BW && bbr->round_start && @@ -603,15 +603,15 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) return; /* Find average delivery rate in this sampling interval. */ - t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); - if (t < 1) - return; /* interval is less than one jiffy, so wait */ - t = jiffies_to_usecs(t); - /* Interval long enough for jiffies_to_usecs() to return a bogus 0? 
*/ - if (t < 1) { + t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; + if ((s32)t < 1) + return; /* interval is less than one ms, so wait */ + /* Check if can multiply without overflow */ + if (t >= ~0U / USEC_PER_MSEC) { bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ return; } + t *= USEC_PER_MSEC; bw = (u64)delivered * BW_UNIT; do_div(bw, t); bbr_lt_bw_interval_done(sk, bw); @@ -825,7 +825,7 @@ static void bbr_init(struct sock *sk) bbr->idle_restart = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; - bbr->cycle_mstamp.v64 = 0; + bbr->cycle_mstamp = 0; bbr->cycle_idx = 0; bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 10e6775464f6..9a5a9e8eda89 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -441,7 +441,7 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -555,11 +555,11 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; - if (tp->rcv_rtt_est.time.v64 == 0) + if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -571,13 +571,15 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, - jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr), - 0); + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + tcp_rcv_rtt_update(tp, delta_us, 0); + } } /* @@ -590,7 +592,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; @@ -1134,8 +1136,8 @@ struct tcp_sacktag_state { * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. 
*/ - struct skb_mstamp first_sackt; - struct skb_mstamp last_sackt; + u64 first_sackt; + u64 last_sackt; struct rate_sample *rate; int flag; }; @@ -1200,7 +1202,7 @@ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1242,9 +1244,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - if (state->first_sackt.v64 == 0) - state->first_sackt = *xmit_time; - state->last_sackt = *xmit_time; + if (state->first_sackt == 0) + state->first_sackt = xmit_time; + state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { @@ -1304,7 +1306,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1356,8 +1358,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); - if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) - TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) + TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1587,7 +1589,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, @@ -2936,9 +2938,12 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * See draft-ietf-tcplw-high-performance-00, section 3.3. */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr); + flag & FLAG_ACKED) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + seq_rtt_us = ca_rtt_us = delta_us; + } if (seq_rtt_us < 0) return false; @@ -2960,12 +2965,8 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { long rtt_us = -1L; - if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); - } + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) + rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); } @@ -3003,7 +3004,7 @@ void tcp_rearm_rto(struct sock *sk) struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = tcp_skb_timestamp(skb) + rto; - s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); + s32 delta = (s32)(rto_time_stamp - tcp_jiffies32); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. 
*/ @@ -3060,9 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt; + u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; @@ -3075,7 +3075,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, bool rtt_update; int flag = 0; - first_ackt.v64 = 0; + first_ackt = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3106,8 +3106,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = skb->skb_mstamp; - WARN_ON_ONCE(last_ackt.v64 == 0); - if (!first_ackt.v64) + WARN_ON_ONCE(last_ackt == 0); + if (!first_ackt) first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; @@ -3122,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp); + skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3165,13 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); + if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { + seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); } - if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); + if (sack->first_sackt) { + sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, @@ -3201,7 +3201,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3553,7 +3553,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - sack_state.first_sackt.v64 = 0; + sack_state.first_sackt = 0; sack_state.rate = &rs; /* We very likely will need to access write queue head. 
*/ @@ -5356,7 +5356,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5672,7 +5672,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp)) { + tcp_time_stamp(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; @@ -5917,7 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5929,7 +5929,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { @@ -6202,7 +6202,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - skb_mstamp_get(&tcp_rsk(req)->snt_synack); + tcp_rsk(req)->snt_synack = tcp_clock_us(); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d8fe25db79f2..191b2f78b19d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -376,8 +376,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct sock *sk; struct sk_buff *skb; struct request_sock *fastopen; - __u32 seq, snd_una; - __u32 remaining; + u32 seq, snd_una; + s32 remaining; + u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -483,12 +484,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); remaining = icsk->icsk_rto - - min(icsk->icsk_rto, - tcp_time_stamp - tcp_skb_timestamp(skb)); + usecs_to_jiffies(delta_us); - if (remaining) { + if (remaining > 0) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { @@ -812,7 +813,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -840,7 +841,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ef3122abb373..ae10ed64fe13 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -37,7 +37,7 @@ #include /* resolution of owd */ -#define LP_RESOL 1000 +#define LP_RESOL TCP_TS_HZ /** * enum tcp_lp_state @@ -147,9 +147,9 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; - m = HZ * 
(tp->rx_opt.rcv_tsval - - lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - - lp->local_ref_time); + m = TCP_TS_HZ * + (tp->rx_opt.rcv_tsval - lp->remote_ref_time) / + (tp->rx_opt.rcv_tsecr - lp->local_ref_time); if (m < 0) m = -m; @@ -194,7 +194,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) if (lp->flag & LP_VALID_RHZ) { owd = tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - - tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ); if (owd < 0) owd = -owd; } @@ -264,7 +264,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp(tp); u32 delta; if (sample->rtt_us > 0) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6504f1082bdf..d0642df73044 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -526,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; - newtp->rack.mstamp.v64 = 0; + newtp->rack.mstamp = 0; newtp->rack.advanced = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 65472e931a0b..478f75baee31 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1962,7 +1962,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_write_queue_head(sk); - age = skb_mstamp_us_delta(&tp->tcp_mstamp, &head->skb_mstamp); + age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -2279,7 +2279,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -3095,7 +3095,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); - skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. 
*/ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3191,10 +3191,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); + skb->skb_mstamp = cookie_init_timestamp(req); else #endif - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tcp_clock_us(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3453,8 +3453,8 @@ int tcp_connect(struct sock *sk) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - skb_mstamp_get(&tp->tcp_mstamp); - tp->retrans_stamp = tp->tcp_mstamp.stamp_jiffies; + tcp_mstamp_refresh(tp); + tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); @@ -3615,7 +3615,7 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c6a9fa894646..ad99569d4c1e 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -78,7 +78,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - if (!scb->tx.delivered_mstamp.v64) + if (!scb->tx.delivered_mstamp) return; if (!rs->prior_delivered || @@ -89,9 +89,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = skb_mstamp_us_delta( - &skb->skb_mstamp, - &scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta( + skb->skb_mstamp, + scb->tx.first_tx_mstamp); /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = skb->skb_mstamp; @@ -101,7 +101,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, * we don't need to reset since it'll be freed soon. */ if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp.v64 = 0; + scb->tx.delivered_mstamp = 0; } /* Update the connection delivery information and generate a rate sample. */ @@ -125,7 +125,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp.v64) { + if (!rs->prior_mstamp) { rs->delivered = -1; rs->interval_us = -1; return; @@ -138,8 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, - &rs->prior_mstamp); /* ack phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index cd72b3d3879e..fe9a493d0208 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -17,12 +17,9 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -static bool tcp_rack_sent_after(const struct skb_mstamp *t1, - const struct skb_mstamp *t2, - u32 seq1, u32 seq2) +static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { - return skb_mstamp_after(t1, t2) || - (t1->v64 == t2->v64 && after(seq1, seq2)); + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -72,14 +69,14 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, - &skb->skb_mstamp); + u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, + skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; if (remaining < 0) { @@ -127,16 +124,16 @@ void tcp_rack_mark_lost(struct sock *sk) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { u32 rtt_us; - if (tp->rack.mstamp.v64 && - !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + if (tp->rack.mstamp && + !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior @@ -152,7 +149,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, return; } tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = *xmit_time; + tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6629f47aa7f0..27a667bce806 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -153,8 +153,8 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int timeout, bool syn_set) { - unsigned int linear_backoff_thresh, start_ts; unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -172,7 +172,7 @@ static bool retransmits_timed_out(struct sock *sk, timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; } - return (tcp_time_stamp - start_ts) >= timeout; + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); } /* A write timeout has occurred. Process the after effects. 
*/ @@ -341,7 +341,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -561,7 +561,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } - skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 5abc3692b901..971823359f5b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -211,7 +211,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack.v64 = 0; + treq->snt_synack = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f4310a36a04..233edfabe1db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -949,7 +949,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); @@ -971,7 +971,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index a504e87c6ddf..49bd8bb16b18 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -152,7 +152,7 @@ void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp & ~0x3f; + opts->tsval = tcp_time_stamp_raw() & ~0x3f; if (opts->options & XT_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; -- cgit v1.2.3 From 7dd7eb9513bd02184d45f000ab69d78cb1fa1531 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 May 2017 22:54:11 -0400 Subject: ipv6: Check ip6_find_1stfragopt() return value properly. Do not use unsigned variables to see if it returns a negative error or not. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Reported-by: Julia Lawall Signed-off-by: David S. 
Miller --- net/ipv6/ip6_offload.c | 9 ++++----- net/ipv6/ip6_output.c | 7 +++---- net/ipv6/udp_offload.c | 8 +++++--- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index eab36abc9f22..280268f1dd7b 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, const struct net_offload *ops; int proto; struct frag_hdr *fptr; - unsigned int unfrag_ip6hlen; unsigned int payload_len; u8 *prevhdr; int offset = 0; @@ -116,10 +115,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, skb->network_header = (u8 *)ipv6h - skb->head; if (udpfrag) { - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (unfrag_ip6hlen < 0) - return ERR_PTR(unfrag_ip6hlen); - fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); + int err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 01deecda2f84..d4a31becbd25 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -597,11 +597,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (hlen < 0) { - err = hlen; + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) goto fail; - } + hlen = err; nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b348cff47395..a2267f80febb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u8 frag_hdr_sz = sizeof(struct frag_hdr); __wsum csum; int tnl_hlen; + int err; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -90,9 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. */ - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (unfrag_ip6hlen < 0) - return ERR_PTR(unfrag_ip6hlen); + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + -- cgit v1.2.3 From 751a9c763849f5859cb69ea44b0430d00672f637 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 17 May 2017 11:24:47 -0400 Subject: netfilter: xtables: fix build failure from COMPAT_XT_ALIGN outside CONFIG_COMPAT The patch in the Fixes references COMPAT_XT_ALIGN in the definition of XT_DATA_TO_USER, outside an #ifdef CONFIG_COMPAT block. Split XT_DATA_TO_USER into separate compat and non compat variants and define the first inside an CONFIG_COMPAT block. This simplifies both variants by removing branches inside the macro. 
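As a rough illustration of this pattern (hypothetical, simplified macros, not the actual netfilter ones, which appear in the diff below): a helper that exists only under CONFIG_COMPAT must not be referenced by a macro that is expanded in !CONFIG_COMPAT builds, so the macro is split into a generic variant and a compat-only variant.

    /* Hypothetical, simplified sketch of the split-macro pattern; assumes
     * <linux/uaccess.h> and <linux/netfilter/x_tables.h>. EX_COMPAT_ALIGN()
     * exists only when CONFIG_COMPAT is set, so an unconditional macro
     * expanding to it would break !CONFIG_COMPAT builds.
     */
    #ifdef CONFIG_COMPAT
    #define EX_COMPAT_ALIGN(len)	(((len) + 3) & ~3UL)

    /* compat variant: defined, and usable, only under CONFIG_COMPAT */
    #define EX_COMPAT_DATA_TO_USER(dst, src, csize) \
    	copy_to_user(dst, src, EX_COMPAT_ALIGN(csize))
    #endif

    /* generic variant: never references compat-only symbols */
    #define EX_DATA_TO_USER(dst, src, size) \
    	copy_to_user(dst, src, XT_ALIGN(size))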
Fixes: 324318f0248c ("netfilter: xtables: zero padding in data_to_user") Reported-by: Stephen Rothwell Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d17769599c10..1770c1d9b37f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -296,18 +296,17 @@ int xt_data_to_user(void __user *dst, const void *src, } EXPORT_SYMBOL_GPL(xt_data_to_user); -#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ +#define XT_DATA_TO_USER(U, K, TYPE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size, \ - C_SIZE ? COMPAT_XT_ALIGN(C_SIZE) : \ - XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) + K->u.kernel.TYPE->TYPE##size, \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) { return XT_OBJ_TO_USER(u, m, match, 0) || - XT_DATA_TO_USER(u, m, match, 0); + XT_DATA_TO_USER(u, m, match); } EXPORT_SYMBOL_GPL(xt_match_to_user); @@ -315,7 +314,7 @@ int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u) { return XT_OBJ_TO_USER(u, t, target, 0) || - XT_DATA_TO_USER(u, t, target, 0); + XT_DATA_TO_USER(u, t, target); } EXPORT_SYMBOL_GPL(xt_target_to_user); @@ -614,6 +613,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); +#define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ + xt_data_to_user(U->data, K->data, \ + K->u.kernel.TYPE->usersize, \ + C_SIZE, \ + COMPAT_XT_ALIGN(C_SIZE)) + int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size) { @@ -629,7 +634,7 @@ int xt_compat_match_to_user(const struct xt_entry_match *m, if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) + if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } @@ -975,7 +980,7 @@ int xt_compat_target_to_user(const struct xt_entry_target *t, if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) + if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } -- cgit v1.2.3 From 122048752e3d35fb5d91a94e671f42cf31ece16e Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 1 May 2017 21:43:24 -0700 Subject: Bluetooth: Set LE Suggested Default Data Length to maximum When LE Data Packet Length Extension is supported, then actually increase the suggested default data length to the maximum to enable higher througput. 
< HCI Command: LE Read Maximum Data Length (0x08|0x002f) plen 0 > HCI Event: Command Complete (0x0e) plen 12 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 2120 Max RX octets: 251 Max RX time: 2120 < HCI Command: LE Read Suggested Default Data Length (0x08|0x0023) plen 0 > HCI Event: Command Complete (0x0e) plen 8 LE Read Suggested Default Data Length (0x08|0x0023) ncmd 1 Status: Success (0x00) TX octets: 27 TX time: 328 < HCI Command: LE Write Suggested Default Data Length (0x08|0x0024) plen 4 TX octets: 251 TX time: 2120 > HCI Event: Command Complete (0x0e) plen 4 LE Write Suggested Default Data Length (0x08|0x0024) ncmd 1 Status: Success (0x00) Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 05686776a5fb..4a0cac774107 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -771,6 +771,15 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) sizeof(support), &support); } + /* Set Suggested Default Data Length to maximum if supported */ + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { + struct hci_cp_le_write_def_data_len cp; + + cp.tx_len = hdev->le_max_tx_len; + cp.tx_time = hdev->le_max_tx_time; + hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); + } + return 0; } -- cgit v1.2.3 From 9756d33b852a17ee67539545c2acf8dc3bda4574 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 1 May 2017 23:54:17 -0700 Subject: Bluetooth: Enable LE Channel Selection Algorithm event If the Channel Selection Algorithm #2 feature is supported, then enable the new LE Channel Selection Algorithm event. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 99aa5e5e3100..dd43cfdd443a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -399,6 +399,7 @@ enum { #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_EXT_SCAN_POLICY 0x80 +#define HCI_LE_CHAN_SEL_ALG2 0x40 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4a0cac774107..e58b9034afff 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -635,6 +635,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ + /* If the controller supports Channel Selection Algorithm #2 + * feature, enable the corresponding event. + */ + if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2) + events[2] |= 0x08; /* LE Channel Selection + * Algorithm + */ + /* If the controller supports the LE Set Scan Enable command, * enable the corresponding advertising report event. */ -- cgit v1.2.3 From 27bbca44026d81968b002d73edf6976d49edd005 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 1 May 2017 23:54:18 -0700 Subject: Bluetooth: Enable LE PHY Update Complete event If either LE Set Default PHY command or LE Set PHY commands is supported, then enable the LE PHY Update Complete event. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e58b9034afff..88a616a2b959 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -685,6 +685,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[34] & 0x04) events[1] |= 0x01; /* LE Generate DHKey Complete */ + /* If the controller supports the LE Set Default PHY or + * LE Set PHY commands, enable the corresponding event. + */ + if (hdev->commands[35] & (0x20 | 0x40)) + events[1] |= 0x08; /* LE PHY Update Complete */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); -- cgit v1.2.3 From de2ba3039cfb61334b2523677cc032422873ff93 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 1 May 2017 23:54:19 -0700 Subject: Bluetooth: Set LE Default PHY preferences If the LE Set Default PHY command is supported, the indicate to the controller that the host has no preferences for transmitter PHY or receiver PHY selection. Issuing this command gives the controller a clear indication that other PHY can be selected if available. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 7 +++++++ net/bluetooth/hci_core.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index dd43cfdd443a..fe98f0a5bef0 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1499,6 +1499,13 @@ struct hci_rp_le_read_max_data_len { __le16 rx_time; } __packed; +#define HCI_OP_LE_SET_DEFAULT_PHY 0x2031 +struct hci_cp_le_set_default_phy { + __u8 all_phys; + __u8 tx_phys; + __u8 rx_phys; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 88a616a2b959..43fecd59dfef 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -794,6 +794,18 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp); } + /* Set Default PHY parameters if command is supported */ + if (hdev->commands[35] & 0x20) { + struct hci_cp_le_set_default_phy cp; + + /* No transmitter PHY or receiver PHY preferences */ + cp.all_phys = 0x03; + cp.tx_phys = 0; + cp.rx_phys = 0; + + hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp); + } + return 0; } -- cgit v1.2.3 From b56c7b2548a428d37b56951f419122ef4c75cc1b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 2 May 2017 12:43:31 -0700 Subject: Bluetooth: Skip vendor diagnostic configuration for HCI User Channel When the HCI User Channel access is requested, then do not try to undermine it with vendor diagnostic configuration. The exclusive user is required to configure its own vendor diagnostic in that case and can not rely on the host stack support. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 43fecd59dfef..7655b4005dfb 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -148,13 +148,13 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, return -EINVAL; /* When the diagnostic flags are not persistent and the transport - * is not active, then there is no need for the vendor callback. - * - * Instead just store the desired value. If needed the setting - * will be programmed when the controller gets powered on. + * is not active or in user channel operation, then there is no need + * for the vendor callback. Instead just store the desired value and + * the setting will be programmed when the controller gets powered on. */ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && - !test_bit(HCI_RUNNING, &hdev->flags)) + (!test_bit(HCI_RUNNING, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) goto done; hci_req_sync_lock(hdev); @@ -1419,6 +1419,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) * completed. */ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); -- cgit v1.2.3 From 64df6d525fcff1630098db9238bfd2b3e092d5c1 Mon Sep 17 00:00:00 2001 From: linzhang Date: Wed, 17 May 2017 12:05:07 +0800 Subject: net: x25: fix one potential use-after-free issue The function x25_init does not properly unregister related resources in its error handling path. This will result in a kernel oops if x25_init fails, so add the proper unregister calls to the error handler. Also, adjust the coding style and make x25_register_sysctl properly return failure. Signed-off-by: linzhang Signed-off-by: David S. 
Miller --- include/net/x25.h | 4 ++-- net/x25/af_x25.c | 24 ++++++++++++++++-------- net/x25/sysctl_net_x25.c | 5 ++++- 3 files changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/x25.h b/include/net/x25.h index c383aa4edbf0..6d30a01d281d 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -298,10 +298,10 @@ void x25_check_rbuf(struct sock *); /* sysctl_net_x25.c */ #ifdef CONFIG_SYSCTL -void x25_register_sysctl(void); +int x25_register_sysctl(void); void x25_unregister_sysctl(void); #else -static inline void x25_register_sysctl(void) {}; +static inline int x25_register_sysctl(void) { return 0; }; static inline void x25_unregister_sysctl(void) {}; #endif /* CONFIG_SYSCTL */ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 8b911c29860e..5a1a98df3499 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1791,32 +1791,40 @@ void x25_kill_by_neigh(struct x25_neigh *nb) static int __init x25_init(void) { - int rc = proto_register(&x25_proto, 0); + int rc; - if (rc != 0) + rc = proto_register(&x25_proto, 0); + if (rc) goto out; rc = sock_register(&x25_family_ops); - if (rc != 0) + if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); - if (rc != 0) + if (rc) goto out_sock; - pr_info("Linux Version 0.2\n"); + rc = x25_register_sysctl(); + if (rc) + goto out_dev; - x25_register_sysctl(); rc = x25_proc_init(); - if (rc != 0) - goto out_dev; + if (rc) + goto out_sysctl; + + pr_info("Linux Version 0.2\n"); + out: return rc; +out_sysctl: + x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: + dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index a06dfe143c67..ba078c85f0a1 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -73,9 +73,12 @@ static struct ctl_table x25_table[] = { { }, }; -void __init x25_register_sysctl(void) +int __init x25_register_sysctl(void) { x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table); + if (!x25_table_header) + return -ENOMEM; + return 0; } void x25_unregister_sysctl(void) -- cgit v1.2.3 From 64f5102dcb811b27d673eccc8cc0d76ce90981a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 May 2017 09:50:36 +0100 Subject: udp: make function udp_skb_dtor_locked static Function udp_skb_dtor_locked does not need to be in global scope so make it static to fix sparse warning: net/ipv4/udp.c: warning: symbol 'udp_skb_dtor_locked' was not declared. Should it be static? Fixes: 6dfb4367cd911d ("udp: keep the sk_receive_queue held when splicing") Signed-off-by: Colin Ian King Acked-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7bd56c9889b3..922a62d45714 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1218,7 +1218,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ -void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) +static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { udp_rmem_release(sk, skb->dev_scratch, 1, true); } -- cgit v1.2.3 From a285860211bf257b0e6d522dac6006794be348af Mon Sep 17 00:00:00 2001 From: Tobias Jungel Date: Wed, 17 May 2017 09:29:12 +0200 Subject: bridge: netlink: check vlan_default_pvid range Currently it is allowed to set the default pvid of a bridge to a value above VLAN_VID_MASK (0xfff). This patch adds a check to br_validate and returns -EINVAL in case the pvid is out of bounds. Reproduce by calling: [root@test ~]# ip l a type bridge [root@test ~]# ip l a type dummy [root@test ~]# ip l s bridge0 type bridge vlan_filtering 1 [root@test ~]# ip l s bridge0 type bridge vlan_default_pvid 9999 [root@test ~]# ip l s dummy0 master bridge0 [root@test ~]# bridge vlan port vlan ids bridge0 9999 PVID Egress Untagged dummy0 9999 PVID Egress Untagged Fixes: 0f963b7592ef ("bridge: netlink: add support for default_pvid") Acked-by: Nikolay Aleksandrov Signed-off-by: Tobias Jungel Acked-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c5ce7745b230..574f78824d8a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -835,6 +835,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EPROTONOSUPPORT; } } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + if (defpvid >= VLAN_VID_MASK) + return -EINVAL; + } #endif return 0; -- cgit v1.2.3 From 0cd2950357e31a96be03b531b4b11fe1df812c9f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 17 May 2017 13:30:44 +0300 Subject: net: make struct net_device::tx_queue_len unsigned int 4 billion packet queue is something unthinkable so use 32-bit value for now. Space savings on x86_64: add/remove: 0/0 grow/shrink: 3/70 up/down: 16/-131 (-115) function old new delta change_tx_queue_len 94 108 +14 qdisc_create 1176 1177 +1 alloc_netdev_mqs 1124 1125 +1 xenvif_alloc 533 532 -1 x25_asy_setup 167 166 -1 ... tun_queue_resize 945 940 -5 pfifo_fast_enqueue 167 162 -5 qfq_init_qdisc 168 158 -10 tap_queue_resize 810 799 -11 transmit 719 698 -21 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- drivers/net/wan/hdlc_raw_eth.c | 3 ++- include/linux/netdevice.h | 2 +- net/core/net-sysfs.c | 8 ++++++-- net/core/rtnetlink.c | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c index 2f11836078ab..8bd3ed905813 100644 --- a/drivers/net/wan/hdlc_raw_eth.c +++ b/drivers/net/wan/hdlc_raw_eth.c @@ -57,7 +57,8 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr) const size_t size = sizeof(raw_hdlc_proto); raw_hdlc_proto new_settings; hdlc_device *hdlc = dev_to_hdlc(dev); - int result, old_qlen; + unsigned int old_qlen; + int result; switch (ifr->ifr_settings.type) { case IF_GET_PROTO: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3f39d27decf4..0150b2dd3031 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1824,7 +1824,7 @@ struct net_device { #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif - unsigned long tx_queue_len; + unsigned int tx_queue_len; spinlock_t tx_global_lock; int watchdog_timeo; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 65ea0ff4017c..58e6cc70500d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -323,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - int res, orig_len = dev->tx_queue_len; + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; if (new_len != orig_len) { dev->tx_queue_len = new_len; @@ -349,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } -NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d7f82c3450b1..f759f22af0af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2046,8 +2046,8 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_TXQLEN]) { - unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned long orig_len = dev->tx_queue_len; + unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); + unsigned int orig_len = dev->tx_queue_len; if (dev->tx_queue_len ^ value) { dev->tx_queue_len = value; -- cgit v1.2.3 From a3f96c47c8d4c38be71fdbb440a99968c764ba62 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 May 2017 14:52:16 +0200 Subject: udp: make *udp*_queue_rcv_skb() functions static Since the udp memory accounting refactor, we don't need any more to export the *udp*_queue_rcv_skb(). Make them static and fix a couple of sparse warnings: net/ipv4/udp.c:1615:5: warning: symbol 'udp_queue_rcv_skb' was not declared. Should it be static? net/ipv6/udp.c:572:5: warning: symbol 'udpv6_queue_rcv_skb' was not declared. Should it be static? Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema") Fixes: c915fe13cbaa ("udplite: fix NULL pointer dereference") Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 4 ++-- net/ipv4/udp_impl.h | 1 - net/ipv6/udp.c | 4 ++-- net/ipv6/udp_impl.h | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ea6e4cff9faf..1d6219bf2d6b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1612,7 +1612,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1657,7 +1657,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index feb50a16398d..a8cf8c6fb60c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 04862abfe4ec..06ec39b79609 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -526,7 +526,7 @@ out: return; } -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -569,7 +569,7 @@ void udpv6_encap_enable(void) } EXPORT_SYMBOL(udpv6_encap_enable); -int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index e78bdc76dcc3..f180b3d85e31 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -26,7 +26,6 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From fdcee2cbb8438702ea1b328fb6e0ac5e9a40c7f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 May 2017 07:16:40 -0700 Subject: sctp: do not inherit ipv6_{mc|ac|fl}_list from parent SCTP needs fixes similar to 83eaddab4378 ("ipv6/dccp: do not inherit ipv6_mc_list from parent"), otherwise bad things can happen. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: David S. 
Miller --- net/sctp/ipv6.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 142b70e959af..f5b45b8b8b16 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -677,6 +677,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; rcu_read_lock(); opt = rcu_dereference(np->opt); -- cgit v1.2.3 From de321ed38471257ee45eac145bfd539254d13954 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 17 May 2017 11:39:05 -0700 Subject: net: fix __skb_try_recv_from_queue to return the old behavior This function has to return NULL on a error case, because there is a separate error variable. The offset has to be changed only if skb is returned v2: fix udp code to not use an extra variable Cc: Paolo Abeni Cc: Eric Dumazet Cc: David S. Miller Fixes: 65101aeca522 ("net/sock: factor out dequeue/peek with offset cod") Signed-off-by: Andrei Vagin Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/datagram.c | 14 ++++++++------ net/ipv4/udp.c | 12 +++--------- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index a4592b43b40d..bc46118486fe 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -170,20 +170,21 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff **last) { struct sk_buff *skb; + int _off = *off; *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (*off >= skb->len && (skb->len || *off || + if (_off >= skb->len && (skb->len || _off || skb->peeked)) { - *off -= skb->len; + _off -= skb->len; continue; } if (!skb->len) { skb = skb_set_peeked(skb); if (unlikely(IS_ERR(skb))) { *err = PTR_ERR(skb); - return skb; + return NULL; } } *peeked = 1; @@ -193,6 +194,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, if (destructor) destructor(sk, skb); } + *off = _off; return skb; } return NULL; @@ -253,8 +255,6 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, *peeked = 0; do { - int _off = *off; - /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. * @@ -263,8 +263,10 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, - peeked, &_off, err, last); + peeked, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (error) + goto no_packet; if (skb) return skb; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 922a62d45714..e7b6cfcca627 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1465,16 +1465,13 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; *peeked = 0; do { - int _off = *off; - spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_destructor, - peeked, &_off, err, + peeked, off, err, &last); if (skb) { spin_unlock_bh(&queue->lock); - *off = _off; return skb; } @@ -1488,20 +1485,17 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, * the sk_receive_queue lock if fwd memory scheduling * is needed. 
*/ - _off = *off; spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_dtor_locked, - peeked, &_off, err, + peeked, off, err, &last); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); - if (skb) { - *off = _off; + if (skb) return skb; - } busy_check: if (!sk_can_busy_loop(sk)) -- cgit v1.2.3 From ea5dd34be1cfb2400587a24d1de75e538e77ce74 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 17 May 2017 15:46:03 -0400 Subject: net: dsa: include dsa.h only once The public include/net/dsa.h file is meant for DSA drivers, while all DSA core files share a common private header net/dsa/dsa_priv.h file. Ensure that dsa_priv.h is the only DSA core file to include net/dsa.h, and add a new line to separate absolute and relative headers at the same time. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 +- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 1 + net/dsa/legacy.c | 2 +- net/dsa/slave.c | 2 +- net/dsa/switch.c | 3 ++- net/dsa/tag_brcm.c | 2 +- net/dsa/tag_dsa.c | 2 +- net/dsa/tag_edsa.c | 2 +- net/dsa/tag_lan9303.c | 2 +- net/dsa/tag_mtk.c | 2 +- net/dsa/tag_qca.c | 2 +- net/dsa/tag_trailer.c | 2 +- 13 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index c0a1307c87dd..3288a80d4d6c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -24,7 +24,7 @@ #include #include #include -#include + #include "dsa_priv.h" static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 2ac62349ba12..4301f52e4f5a 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -18,7 +18,7 @@ #include #include #include -#include + #include "dsa_priv.h" static LIST_HEAD(dsa_switch_trees); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index e9003b79cbbc..c274130e3ac9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,6 +14,7 @@ #include #include #include +#include struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index bb28b011ba5a..ac4379b8d7ac 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -22,7 +22,7 @@ #include #include #include -#include + #include "dsa_priv.h" /* switch driver registration ***********************************************/ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 77324c483d14..fb13c5d7d587 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,13 +17,13 @@ #include #include #include -#include #include #include #include #include #include #include + #include "dsa_priv.h" static bool dsa_slave_dev_check(struct net_device *dev); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index ca6e26e514f0..f477053308d2 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -12,7 +12,8 @@ #include #include -#include + +#include "dsa_priv.h" static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 658ddee63dc9..9f204f18ada3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,7 +12,7 @@ #include #include #include -#include + #include "dsa_priv.h" /* This tag length is 4 bytes, older ones were 6 bytes, we do not diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 1c6633f0de01..3b62a57956a3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -11,7 +11,7 @@ #include #include #include -#include + #include "dsa_priv.h" #define DSA_HLEN 4 diff 
--git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index d9c668aa5e54..f95cafd05702 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -11,7 +11,7 @@ #include #include #include -#include + #include "dsa_priv.h" #define DSA_HLEN 4 diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 70130ed5c21a..afd59330b5f1 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -14,7 +14,7 @@ #include #include #include -#include + #include "dsa_priv.h" /* To define the outgoing port and to discover the incoming port a regular diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 837cdddb53f0..d1258e84cd71 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -13,7 +13,7 @@ */ #include -#include + #include "dsa_priv.h" #define MTK_HDR_LEN 4 diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index be3b67750ac8..2451007699b7 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -12,7 +12,7 @@ */ #include -#include + #include "dsa_priv.h" #define QCA_HDR_LEN 2 diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index aa05e276ea22..7488ab2932ab 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -11,7 +11,7 @@ #include #include #include -#include + #include "dsa_priv.h" static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From f0c24ccf491b09de53cee32114c924551218f2bc Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 17 May 2017 15:46:04 -0400 Subject: net: dsa: include switchdev.h only once DSA drivers and core use switchdev. Include switchdev.h only once, in the dsa.h public header, so that inclusion in DSA drivers or forward declarations of switchdev structures in not necessary anymore. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 1 - drivers/net/dsa/bcm_sf2.c | 1 - drivers/net/dsa/dsa_loop.c | 1 - drivers/net/dsa/mt7530.c | 1 - drivers/net/dsa/mv88e6xxx/chip.c | 1 - drivers/net/dsa/qca8k.c | 1 - include/net/dsa.h | 7 +------ net/dsa/slave.c | 1 - 8 files changed, 1 insertion(+), 13 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 658a12c888a8..fbc3eb17c7a3 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "b53_regs.h" #include "b53_priv.h" diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 215d41c1e71f..687a8bae5d73 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include "bcm_sf2.h" diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index a19e1781e9bb..6afab16d13dd 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "dsa_loop.h" diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b070c167e70f..1bcbe15870ed 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "mt7530.h" diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index d034d8cd7d22..386d878569ed 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "mv88e6xxx.h" #include "global1.h" diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 942b9ac7f92a..149f109dbffb 100644 
--- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/dsa.h b/include/net/dsa.h index ed767beca9c6..bf6a2abb9b99 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -20,6 +20,7 @@ #include #include #include +#include struct tc_action; struct phy_device; @@ -284,12 +285,6 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) return ds->rtable[dst->cpu_dp->ds->index]; } -struct switchdev_trans; -struct switchdev_obj; -struct switchdev_obj_port_fdb; -struct switchdev_obj_port_mdb; -struct switchdev_obj_port_vlan; - #define DSA_NOTIFIER_BRIDGE_JOIN 1 #define DSA_NOTIFIER_BRIDGE_LEAVE 2 diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fb13c5d7d587..91236d602301 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From b17b8a20c5cd4a264601eacf1fda29008047d05a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 May 2017 09:15:58 -0700 Subject: tcp: fix tcp_rearm_rto() skbs in (re)transmit queue no longer have a copy of jiffies at the time of the transmit : skb->skb_mstamp is now in usec unit, with no correlation to tcp_jiffies32. We have to convert rto from jiffies to usec, compute a time difference in usec, then convert the delta to HZ units. Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9a5a9e8eda89..aa1eef150dc4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3002,14 +3002,14 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = - tcp_skb_timestamp(skb) + rto; - s32 delta = (s32)(rto_time_stamp - tcp_jiffies32); - /* delta may not be positive if the socket is locked + u64 rto_time_stamp = skb->skb_mstamp + + jiffies_to_usecs(rto); + s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta > 0) - rto = delta; + if (delta_us > 0) + rto = usecs_to_jiffies(delta_us); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); -- cgit v1.2.3 From 14e3995e63759b80eb22a3c06958d105db4d3f79 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 May 2017 14:22:47 +0000 Subject: xen/9pfs: fix return value check in xen_9pfs_front_probe() In case of error, the function xenbus_read() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). 
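As a minimal sketch of the error-pointer convention this fix relies on: kernel APIs such as xenbus_read() report failure by encoding a negative errno in the returned pointer (never by returning NULL), and callers decode it with the IS_ERR()/PTR_ERR() helpers from <linux/err.h>. The helper and callee names below are made up for illustration only; they are not part of the trans_xen.c patch.

    #include <linux/err.h>

    /* Illustrative only: handle a pointer returned by an ERR_PTR()-style API. */
    static int check_err_ptr_result(void *p)
    {
            if (IS_ERR(p))                  /* failure: pointer carries -errno */
                    return PTR_ERR(p);      /* e.g. -ENOENT, -EINVAL */

            /* success: p is a valid, dereferenceable pointer; it is never NULL */
            return 0;
    }
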
Fixes: 71ebd71921e4 ("xen/9pfs: connect to the backend") Signed-off-by: Wei Yongjun Reviewed-by: Stefano Stabellini --- net/9p/trans_xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 71e85643b3f9..83fe487f460e 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -454,8 +454,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, goto error_xenbus; } priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); - if (!priv->tag) { - ret = -EINVAL; + if (IS_ERR(priv->tag)) { + ret = PTR_ERR(priv->tag); goto error_xenbus; } ret = xenbus_transaction_end(xbt, 0); -- cgit v1.2.3 From aaf0475a0b3f445000c50f7fc75d5e846bf7ee7b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 May 2017 15:22:41 +0000 Subject: xen/9pfs: p9_trans_xen_init and p9_trans_xen_exit can be static Fixes the following sparse warnings: net/9p/trans_xen.c:528:5: warning: symbol 'p9_trans_xen_init' was not declared. Should it be static? net/9p/trans_xen.c:540:6: warning: symbol 'p9_trans_xen_exit' was not declared. Should it be static? Signed-off-by: Wei Yongjun Reviewed-by: Stefano Stabellini --- net/9p/trans_xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 83fe487f460e..6ad3e043c617 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -525,7 +525,7 @@ static struct xenbus_driver xen_9pfs_front_driver = { .otherend_changed = xen_9pfs_front_changed, }; -int p9_trans_xen_init(void) +static int p9_trans_xen_init(void) { if (!xen_domain()) return -ENODEV; @@ -537,7 +537,7 @@ int p9_trans_xen_init(void) } module_init(p9_trans_xen_init); -void p9_trans_xen_exit(void) +static void p9_trans_xen_exit(void) { v9fs_unregister_trans(&p9_xen_trans); return xenbus_unregister_driver(&xen_9pfs_front_driver); -- cgit v1.2.3 From a486cd23661c9387fb076c3f6ae8b2aa9d20d54a Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 19 May 2017 12:47:00 +0200 Subject: xfrm: fix state migration copy replay sequence numbers During xfrm migration copy replay and preplay sequence numbers from the previous state. Here is a tcpdump output showing the problem. 10.0.10.46 is running vanilla kernel, is the IKE/IPsec responder. After the migration it sent wrong sequence number, reset to 1. The migration is from 10.0.0.52 to 10.0.0.53. 
IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7cf), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7cf), length 136 IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d0), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7d0), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d1), length 136 NOTE: next sequence is wrong 0x1 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x1), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d2), length 136 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x2), length 136 Signed-off-by: Antony Antony Reviewed-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index fc3c5aa38754..2e291bc5f1fc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; x->km.seq = orig->km.seq; + x->replay = orig->replay; + x->preplay = orig->preplay; return x; -- cgit v1.2.3 From 6f5b24eed0278136c29c27f2a7b3a2b6a202ac68 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 16 May 2017 17:39:02 -0400 Subject: tcp: warn on negative reordering values Commit bafbb9c73241 ("tcp: eliminate negative reordering in tcp_clean_rtx_queue") fixes an issue for negative reordering metrics. To be resilient to such errors, warn and return when a negative metric is passed to tcp_update_reordering(). Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bbadd79815a4..2fa55f57ac06 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -887,6 +887,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, struct tcp_sock *tp = tcp_sk(sk); int mib_idx; + if (WARN_ON_ONCE(metric < 0)) + return; + if (metric > tp->reordering) { tp->reordering = min(sysctl_tcp_max_reordering, metric); -- cgit v1.2.3 From 9617813dba5b6c112922c60cd2bc57c6e11ae907 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:37 +0200 Subject: skbuff: add stub to help computing crc32c on SCTP packets sctp_compute_checksum requires crc32c symbol (provided by libcrc32c), so it can't be used in net core. Like it has been done previously with other symbols (e.g. ipv6_dst_lookup), introduce a stub struct skb_checksum_ops to allow computation of crc32c checksum in net core after sctp.ko (and thus libcrc32c) has been loaded. Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 26 ++++++++++++++++++++++++++ net/sctp/offload.c | 6 ++++++ 3 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7c0cb2ce8b01..b1f46a0d18e2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3076,6 +3076,8 @@ struct skb_checksum_ops { __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); }; +extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; + __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 346d3e85dfbc..d5c98117cbce 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2243,6 +2243,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, + int offset, int len) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static const struct skb_checksum_ops default_crc32c_ops = { + .update = warn_crc32c_csum_update, + .combine = warn_crc32c_csum_combine, +}; + +const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = + &default_crc32c_ops; +EXPORT_SYMBOL(crc32c_csum_stub); + /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 4f5a2b580aa5..b67198429db5 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -98,6 +98,11 @@ static const struct net_offload sctp6_offload = { }, }; +static const struct skb_checksum_ops crc32c_csum_ops = { + .update = sctp_csum_update, + .combine = sctp_csum_combine, +}; + int __init sctp_offload_init(void) { int ret; @@ -110,6 +115,7 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; + crc32c_csum_stub = &crc32c_csum_ops; return ret; ipv4: -- cgit v1.2.3 From b72b5bf6a8fc9065f270ae135bbd47abb9d96790 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:38 +0200 Subject: net: introduce skb_crc32c_csum_help skb_crc32c_csum_help is like skb_checksum_help, but it is designed for checksumming SCTP packets using crc32c (see RFC3309), provided that libcrc32c.ko has been loaded before. In case libcrc32c is not loaded, invoking skb_crc32c_csum_help on a skb results in one the following printouts: warn_crc32c_csum_update: attempt to compute crc32c without libcrc32c.ko warn_crc32c_csum_combine: attempt to compute crc32c without libcrc32c.ko Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 3 ++- net/core/dev.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0150b2dd3031..abbc72e09f11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3931,6 +3931,7 @@ void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev); int skb_checksum_help(struct sk_buff *skb); +int skb_crc32c_csum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b1f46a0d18e2..62d62964c743 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -193,7 +193,8 @@ * accordingly. Note the there is no indication in the skbuff that the * CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports * both IP checksum offload and SCTP CRC offload must verify which offload - * is configured for a packet presumably by inspecting packet headers. + * is configured for a packet presumably by inspecting packet headers; in + * case, skb_crc32c_csum_help is provided to compute CRC on SCTP packets. * * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of * offloading the FCOE CRC in a packet. To perform this offload the stack diff --git a/net/core/dev.c b/net/core/dev.c index acd594c56f0a..8356d5f05f89 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -143,6 +143,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2612,6 +2613,46 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +int skb_crc32c_csum_help(struct sk_buff *skb) +{ + __le32 crc32c_csum; + int ret = 0, offset, start; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; + + if (unlikely(skb_is_gso(skb))) + goto out; + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; + } + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__le32))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, + skb->len - start, ~(__u32)0, + crc32c_csum_stub)); + *(__le32 *)(skb->data + offset) = crc32c_csum; + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; -- cgit v1.2.3 From 219f1d79871257e9603f504dce0fe8ebf47aad08 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:39 +0200 Subject: sk_buff: remove support for csum_bad in sk_buff This bit was introduced with commit 5a21232983aa ("net: Support for csum_bad in skbuff") to reduce the stack workload when processing RX packets carrying a wrong Internet Checksum. Up to now, only one driver and GRO core are setting it. Suggested-by: Tom Herbert Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 2 +- include/linux/netdevice.h | 4 +--- include/linux/skbuff.h | 23 ++--------------------- net/bridge/netfilter/nft_reject_bridge.c | 5 +---- net/core/dev.c | 3 --- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 3 --- 7 files changed, 6 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 3a8a4aa13687..9a0817938eca 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -223,7 +223,7 @@ int aq_ring_rx_clean(struct aq_ring_s *self, int *work_done, int budget) skb->protocol = eth_type_trans(skb, ndev); if (unlikely(buff->is_cso_err)) { ++self->stats.rx.errors; - __skb_mark_checksum_bad(skb); + skb->ip_summed = CHECKSUM_NONE; } else { if (buff->is_ip_cso) { __skb_incr_checksum_unnecessary(skb); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index abbc72e09f11..c1611ace5336 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2573,9 +2573,7 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ - if (__ret) \ - __skb_mark_checksum_bad(skb); \ - else \ + if (!__ret) \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62d62964c743..c38f890d425e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -685,7 +685,7 @@ struct sk_buff { __u8 csum_valid:1; __u8 csum_complete_sw:1; __u8 csum_level:2; - __u8 csum_bad:1; + __u8 __csum_bad_unused:1; /* one bit hole */ __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -3336,21 +3336,6 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } -static inline void __skb_mark_checksum_bad(struct sk_buff *skb) -{ - /* Mark current checksum as bad (typically called from GRO - * path). In the case that ip_summed is CHECKSUM_NONE - * this must be the first checksum encountered in the packet. - * When ip_summed is CHECKSUM_UNNECESSARY, this is the first - * checksum after the last one validated. For UDP, a zero - * checksum can not be marked as bad. - */ - - if (skb->ip_summed == CHECKSUM_NONE || - skb->ip_summed == CHECKSUM_UNNECESSARY) - skb->csum_bad = 1; -} - /* Check if we need to perform checksum complete validation. 
* * Returns true if checksum complete is needed, false otherwise @@ -3404,9 +3389,6 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, skb->csum_valid = 1; return 0; } - } else if (skb->csum_bad) { - /* ip_summed == CHECKSUM_NONE in this case */ - return (__force __sum16)1; } skb->csum = psum; @@ -3466,8 +3448,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { - return (skb->ip_summed == CHECKSUM_NONE && - skb->csum_valid && !skb->csum_bad); + return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 346ef6b00b8f..c16dd3a47fc6 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -111,7 +111,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, __wsum csum; u8 proto; - if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) + if (!nft_bridge_iphdr_validate(oldskb)) return; /* IP header checks: fragment. */ @@ -226,9 +226,6 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) __be16 fo; u8 proto = ip6h->nexthdr; - if (skb->csum_bad) - return false; - if (skb_csum_unnecessary(skb)) return true; diff --git a/net/core/dev.c b/net/core/dev.c index 8356d5f05f89..f0281ff45e77 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4678,9 +4678,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - if (skb->csum_bad) - goto normal; - gro_list_prepare(napi, skb); rcu_read_lock(); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 7cd8d0d918f8..6f8d9e5e062b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -172,7 +172,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) struct iphdr *iph = ip_hdr(skb_in); u8 proto; - if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET)) + if (iph->frag_off & htons(IP_OFFSET)) return; if (skb_csum_unnecessary(skb_in)) { diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index eedee5d108d9..f63b18e05c69 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -220,9 +220,6 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook) __be16 fo; u8 proto; - if (skb->csum_bad) - return false; - if (skb_csum_unnecessary(skb)) return true; -- cgit v1.2.3 From dba003067a43a9699bef0c4bdbe320ece5a109b8 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:40 +0200 Subject: net: use skb->csum_not_inet to identify packets needing crc32c skb->csum_not_inet carries the indication on which algorithm is needed to compute checksum on skb in the transmit path, when skb->ip_summed is equal to CHECKSUM_PARTIAL. If skb carries a SCTP packet and crc32c hasn't been yet written in L4 header, skb->csum_not_inet is assigned to 1; otherwise, assume Internet Checksum is needed and thus set skb->csum_not_inet to 0. Suggested-by: Tom Herbert Signed-off-by: Davide Caratti Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 16 +++++++++------- net/core/dev.c | 1 + net/sched/act_csum.c | 1 + net/sctp/offload.c | 1 + net/sctp/output.c | 1 + 5 files changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c38f890d425e..a43d2086bb7f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -189,12 +189,13 @@ * * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack - * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset - * accordingly. Note the there is no indication in the skbuff that the - * CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports - * both IP checksum offload and SCTP CRC offload must verify which offload - * is configured for a packet presumably by inspecting packet headers; in - * case, skb_crc32c_csum_help is provided to compute CRC on SCTP packets. + * will set set csum_start and csum_offset accordingly, set ip_summed to + * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in + * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c. + * A driver that supports both IP checksum offload and SCTP CRC32c offload + * must verify which offload is configured for a packet by testing the + * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve + * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of * offloading the FCOE CRC in a packet. To perform this offload the stack @@ -557,6 +558,7 @@ typedef unsigned char *sk_buff_data_t; * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS + * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @dst_pending_confirm: need to confirm neighbour * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking @@ -685,7 +687,7 @@ struct sk_buff { __u8 csum_valid:1; __u8 csum_complete_sw:1; __u8 csum_level:2; - __u8 __csum_bad_unused:1; /* one bit hole */ + __u8 csum_not_inet:1; __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE diff --git a/net/core/dev.c b/net/core/dev.c index f0281ff45e77..71107d1f3051 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2649,6 +2649,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb) crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; out: return ret; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index ab6fdbd34db7..3317a2f579da 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -350,6 +350,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, sctph->checksum = sctp_compute_cksum(skb, skb_network_offset(skb) + ihl); skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; return 1; } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index b67198429db5..275925b93b29 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -35,6 +35,7 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } diff --git a/net/sctp/output.c b/net/sctp/output.c index 1409a875ad8e..e2edf2ebbade 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -538,6 +538,7 @@ merge: } else { chksum: head->ip_summed = 
CHECKSUM_PARTIAL; + head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } -- cgit v1.2.3 From 43c26a1a45938624fb9301e8bf7dfabbed293619 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:41 +0200 Subject: net: more accurate checksumming in validate_xmit_skb() skb_csum_hwoffload_help() uses netdev features and skb->csum_not_inet to determine if skb needs software computation of Internet Checksum or crc32c (or nothing, if this computation can be done by the hardware). Use it in place of skb_checksum_help() in validate_xmit_skb() to avoid corruption of non-GSO SCTP packets having skb->ip_summed equal to CHECKSUM_PARTIAL. While at it, remove references to skb_csum_off_chk* functions, since they are not present anymore in Linux _ see commit cf53b1da73bd ("Revert "net: Add driver helper functions to determine checksum offloadability""). Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- Documentation/networking/checksum-offloads.txt | 11 +++++++---- include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 13 +++++-------- net/core/dev.c | 14 ++++++++++++-- 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt index 56e36861245f..d52d191bbb0c 100644 --- a/Documentation/networking/checksum-offloads.txt +++ b/Documentation/networking/checksum-offloads.txt @@ -35,6 +35,9 @@ This interface only allows a single checksum to be offloaded. Where encapsulation is used, the packet may have multiple checksum fields in different header layers, and the rest will have to be handled by another mechanism such as LCO or RCO. +CRC32c can also be offloaded using this interface, by means of filling + skb->csum_start and skb->csum_offset as described above, and setting + skb->csum_not_inet: see skbuff.h comment (section 'D') for more details. No offloading of the IP header checksum is performed; it is always done in software. This is OK because when we build the IP header, we obviously have it in cache, so summing it isn't expensive. It's also rather short. @@ -49,9 +52,9 @@ A driver declares its offload capabilities in netdev->hw_features; see and csum_offset given in the SKB; if it tries to deduce these itself in hardware (as some NICs do) the driver should check that the values in the SKB match those which the hardware will deduce, and if not, fall back to - checksumming in software instead (with skb_checksum_help or one of the - skb_csum_off_chk* functions as mentioned in include/linux/skbuff.h). This - is a pain, but that's what you get when hardware tries to be clever. + checksumming in software instead (with skb_csum_hwoffload_help() or one of + the skb_checksum_help() / skb_crc32c_csum_help functions, as mentioned in + include/linux/skbuff.h). The stack should, for the most part, assume that checksum offload is supported by the underlying device. The only place that should check is @@ -60,7 +63,7 @@ The stack should, for the most part, assume that checksum offload is may include other offloads besides TX Checksum Offload) and, if they are not supported or enabled on the device (determined by netdev->features), performs the corresponding offload in software. In the case of TX - Checksum Offload, that means calling skb_checksum_help(skb). + Checksum Offload, that means calling skb_csum_hwoffload_help(skb, features). 
LCO: Local Checksum Offload diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1611ace5336..f8f7cd52a0a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3930,6 +3930,9 @@ void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev); int skb_checksum_help(struct sk_buff *skb); int skb_crc32c_csum_help(struct sk_buff *skb); +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features); + struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a43d2086bb7f..43d7ca07b2ff 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -162,14 +162,11 @@ * * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate - * checksum offload capability. If a device has limited checksum capabilities - * (for instance can only perform NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM as - * described above) a helper function can be called to resolve - * CHECKSUM_PARTIAL. The helper functions are skb_csum_off_chk*. The helper - * function takes a spec argument that describes the protocol layer that is - * supported for checksum offload and can be called for each packet. If a - * packet does not match the specification for offload, skb_checksum_help - * is called to resolve the checksum. + * checksum offload capability. + * skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based + * on network device checksumming capabilities: if a packet does not match + * them, skb_checksum_help or skb_crc32c_help (depending on the value of + * csum_not_inet, see item D.) is called to resolve the checksum. * * CHECKSUM_NONE: * diff --git a/net/core/dev.c b/net/core/dev.c index 71107d1f3051..bb136f726890 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2996,6 +2996,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, return skb; } +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb->csum_not_inet)) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); + + return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -3034,8 +3045,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } -- cgit v1.2.3 From 7529390d08f07fbf9b0174c5a87600b5caa1a8e8 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 15:44:42 +0200 Subject: openvswitch: more accurate checksumming in queue_userspace_packet() if skb carries an SCTP packet and ip_summed is CHECKSUM_PARTIAL, it needs CRC32c in place of Internet Checksum: use skb_csum_hwoffload_help to avoid corrupting such packets while queueing them towards userspace. Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7b17da9a94a0..9ddc9f8412a2 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -453,7 +453,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && - (err = skb_checksum_help(skb))) + (err = skb_csum_hwoffload_help(skb, 0))) goto out; /* Older versions of OVS user space enforce alignment of the last -- cgit v1.2.3 From 499350a5a6e7512d9ed369ed63a4244b6536f4f8 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 May 2017 11:22:33 -0700 Subject: tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0 When tcp_disconnect() is called, inet_csk_delack_init() sets icsk->icsk_ack.rcv_mss to 0. This could potentially cause tcp_recvmsg() => tcp_cleanup_rbuf() => __tcp_select_window() call path to have division by 0 issue. So this patch initializes rcv_mss to TCP_MIN_MSS instead of 0. Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e4c76d2b827..842b575f8fdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2320,6 +2320,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); -- cgit v1.2.3 From 34eb5fe07831458cf8238d54c1fc847dedeaf68c Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:18 -0700 Subject: arp: fixed error in a comment the is_garp code deals just with gratuitous ARP packets, not every unsolicited packet. This patch is a result of a discussion in netdev: http://marc.info/?l=linux-netdev&m=149506354216994 Suggested-by: Julian Anastasov Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d54345a06f72..053492af8a6e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -846,7 +846,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ is_garp = tip == sip && addr_type == RTN_UNICAST; - /* Unsolicited ARP _replies_ also require target hwaddr to be + /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) -- cgit v1.2.3 From 6fd05633bdafc0ae6ec0d55e61af10780d4d3530 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:19 -0700 Subject: arp: decompose is_garp logic into a separate function The code is quite involving already to earn a separate function for itself. If anything, it helps arp_process readability. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. 
Miller --- net/ipv4/arp.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 053492af8a6e..ca6e1e6c1496 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,6 +641,27 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); +static bool arp_is_garp(struct net_device *dev, int addr_type, + __be16 ar_op, + __be32 sip, __be32 tip, + unsigned char *sha, unsigned char *tha) +{ + bool is_garp = tip == sip && addr_type == RTN_UNICAST; + + /* Gratuitous ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); + + return is_garp; +} + /* * Process an arp request. */ @@ -844,18 +865,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = tip == sip && addr_type == RTN_UNICAST; - - /* Gratuitous ARP _replies_ also require target hwaddr to be - * the same as source. - */ - if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) - is_garp = - /* IPv4 over IEEE 1394 doesn't provide target - * hardware address field in its ARP payload. - */ - tha && - !memcmp(tha, sha, dev->addr_len); + is_garp = arp_is_garp(dev, addr_type, arp->ar_op, + sip, tip, sha, tha); if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && -- cgit v1.2.3 From d9ef2e7bf99f59179b89d5c1c4d5b4919375daee Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:20 -0700 Subject: arp: postpone addr_type calculation to as late as possible The addr_type retrieval can be costly, so it's worth trying to avoid its calculation as much as possible. This patch makes it calculated only for gratuitous ARP packets. This is especially important since later we may want to move is_garp calculation outside of arp_accept block, at which point the costly operation will be executed for all setups. The patch is the result of a discussion in net-dev: http://marc.info/?l=linux-netdev&m=149506354216994 Suggested-by: Julian Anastasov Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ca6e1e6c1496..c22103cec823 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,12 +641,12 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); -static bool arp_is_garp(struct net_device *dev, int addr_type, - __be16 ar_op, +static bool arp_is_garp(struct net *net, struct net_device *dev, + int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { - bool is_garp = tip == sip && addr_type == RTN_UNICAST; + bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. 
@@ -659,6 +659,11 @@ static bool arp_is_garp(struct net_device *dev, int addr_type, tha && !memcmp(tha, sha, dev->addr_len); + if (is_garp) { + *addr_type = inet_addr_type_dev_table(net, dev, sip); + if (*addr_type != RTN_UNICAST) + is_garp = false; + } return is_garp; } @@ -859,18 +864,23 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); if (IN_DEV_ARP_ACCEPT(in_dev)) { - unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + addr_type = -1; /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp_is_garp(dev, addr_type, arp->ar_op, + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); if (!n && - ((arp->ar_op == htons(ARPOP_REPLY) && - addr_type == RTN_UNICAST) || is_garp)) + (is_garp || + (arp->ar_op == htons(ARPOP_REPLY) && + (addr_type == RTN_UNICAST || + (addr_type < 0 && + /* postpone calculation to as late as possible */ + inet_addr_type_dev_table(net, dev, sip) == + RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } -- cgit v1.2.3 From 7d472a59c0e5ec117220a05de6b370447fb6cb66 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:21 -0700 Subject: arp: always override existing neigh entries with gratuitous ARP Currently, when arp_accept is 1, we always override existing neigh entries with incoming gratuitous ARP replies. Otherwise, we override them only if new replies satisfy _locktime_ conditional (packets arrive not earlier than _locktime_ seconds since the last update to the neigh entry). The idea behind locktime is to pick the very first (=> close) reply received in a unicast burst when ARP proxies are used. This helps to avoid ARP thrashing where Linux would switch back and forth from one proxy to another. This logic has nothing to do with gratuitous ARP replies that are generally not aligned in time when multiple IP address carriers send them into network. This patch enforces overriding of existing neigh entries by all incoming gratuitous ARP packets, irrespective of their time of arrival. This will make the kernel honour all incoming gratuitous ARP packets. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c22103cec823..ae96e6f3e0cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,16 +863,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IN_DEV_ARP_ACCEPT(in_dev)) { + if (n || IN_DEV_ARP_ACCEPT(in_dev)) { addr_type = -1; + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, + sip, tip, sha, tha); + } + if (IN_DEV_ARP_ACCEPT(in_dev)) { /* Unsolicited ARP is not accepted by default. 
It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, - sip, tip, sha, tha); - if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && -- cgit v1.2.3 From 6d18c732b95c0a9d35e9f978b4438bba15412284 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 19 May 2017 22:20:29 +0800 Subject: bridge: start hello_timer when enabling KERNEL_STP in br_stp_start Since commit 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers"), bridge would not start hello_timer if stp_enabled is not KERNEL_STP when br_dev_open. The problem is even if users set stp_enabled with KERNEL_STP later, the timer will still not be started. It causes that KERNEL_STP can not really work. Users have to re-ifup the bridge to avoid this. This patch is to fix it by starting br->hello_timer when enabling KERNEL_STP in br_stp_start. As an improvement, it's also to start hello_timer again only when br->stp_enabled is KERNEL_STP in br_hello_timer_expired, there is no reason to start the timer again when it's NO_STP. Fixes: 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers") Reported-by: Haidong Li Signed-off-by: Xin Long Acked-by: Nikolay Aleksandrov Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 1 + net/bridge/br_stp_timer.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 08341d2aa9c9..0db8102995a5 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -179,6 +179,7 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index c98b3e5c140a..60b6fe277a8b 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg) if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - if (br->stp_enabled != BR_USER_STP) + if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } -- cgit v1.2.3 From b8210a9e4bea6354eccc5d8a50ecc21ea7486dc9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:35 +0200 Subject: net: define receive timestamp filter for NTP Add HWTSTAMP_FILTER_NTP_ALL to the hwtstamp_rx_filters enum for timestamping of NTP packets. There is currently only one driver (phyter) that could support it directly. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. 
Miller --- include/uapi/linux/net_tstamp.h | 3 +++ net/core/dev_ioctl.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 464dcca5ed68..0749fb13e517 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -125,6 +125,9 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_PTP_V2_SYNC, /* PTP v2/802.AS1, any layer, Delay_req packet */ HWTSTAMP_FILTER_PTP_V2_DELAY_REQ, + + /* NTP, UDP, all versions and packet modes */ + HWTSTAMP_FILTER_NTP_ALL, }; #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..8f036a76b92e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -227,6 +227,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: rx_filter_valid = 1; break; + case HWTSTAMP_FILTER_NTP_ALL: + break; } if (!tx_type_valid || !rx_filter_valid) -- cgit v1.2.3 From e3412575488ac2408f737a14296cce34c9d8b4f8 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:36 +0200 Subject: net: ethernet: update drivers to handle HWTSTAMP_FILTER_NTP_ALL Include HWTSTAMP_FILTER_NTP_ALL in net_hwtstamp_validate() as a valid filter and update drivers which can timestamp all packets, or which explicitly list unsupported filters instead of using a default case, to handle the filter. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 1 + drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 1 + drivers/net/ethernet/cavium/liquidio/lio_main.c | 1 + drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 1 + drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 1 + drivers/net/ethernet/intel/e1000e/netdev.c | 1 + drivers/net/ethernet/intel/i40e/i40e_ptp.c | 1 + drivers/net/ethernet/intel/igb/igb_ptp.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 1 + drivers/net/ethernet/neterion/vxge/vxge-main.c | 1 + drivers/net/ethernet/qlogic/qede/qede_ptp.c | 1 + drivers/net/ethernet/sfc/ef10.c | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + drivers/net/ethernet/ti/cpsw.c | 1 + drivers/net/ethernet/tile/tilegx.c | 1 + net/core/dev_ioctl.c | 3 +-- 18 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index c772420fa41c..89b21d7c537b 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1268,6 +1268,7 @@ static int xgbe_set_hwtstamp_settings(struct xgbe_prv_data *pdata, case HWTSTAMP_FILTER_NONE: break; + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: XGMAC_SET_BITS(mac_tscr, MAC_TSCR, TSENALL, 1); XGMAC_SET_BITS(mac_tscr, MAC_TSCR, TSENA, 1); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 7414ffd70c90..14c236e5bdb1 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -15351,6 +15351,7 @@ int bnx2x_configure_ptp_filters(struct bnx2x *bp) break; case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_NTP_ALL: bp->rx_filter = HWTSTAMP_FILTER_NONE; break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: diff --git 
a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 649f2aaf0afb..ba012427edd6 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -3024,6 +3024,7 @@ static int hwtstamp_ioctl(struct net_device *netdev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: conf.rx_filter = HWTSTAMP_FILTER_ALL; break; default: diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index d51c8d8d9a35..31d737c22648 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -2085,6 +2085,7 @@ static int hwtstamp_ioctl(struct net_device *netdev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: conf.rx_filter = HWTSTAMP_FILTER_ALL; break; default: diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c index a2138686c605..2887bcaf6af5 100644 --- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c @@ -755,6 +755,7 @@ static int octeon_mgmt_ioctl_hwtstamp(struct net_device *netdev, case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: p->has_rx_tstamp = have_hw_timestamps; config.rx_filter = HWTSTAMP_FILTER_ALL; if (p->has_rx_tstamp) { diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index b3679728caac..0ff9295ed449 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -3680,6 +3680,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, * Delay Request messages but not both so fall-through to * time stamp all packets. 
*/ + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: is_l2 = true; is_l4 = true; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index 18c1cc08da97..0efff18ee336 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -562,6 +562,7 @@ static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf, config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; } break; + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: default: return -ERANGE; diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 7a3fd4d74592..d333d6d80194 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -941,6 +941,7 @@ static int igb_ptp_set_timestamp_mode(struct igb_adapter *adapter, is_l4 = true; break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: /* 82576 cannot timestamp all packets, which it needs to do to * support both V1 Sync and Delay_Req messages diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index ef0635e0918c..d44c728fdc0b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -883,6 +883,7 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: /* The X550 controller is capable of timestamping all packets, * which allows it to accept any filter. diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 94fab20ef146..82436742ad75 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2375,6 +2375,7 @@ static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index e706a87fc8b2..e29494464cae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -128,6 +128,7 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: /* Disable CQE compression */ netdev_warn(dev, "Disabling cqe compression"); err = mlx5e_modify_rx_cqe_compression_locked(priv, false); diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index 6a4310af5d97..50ea69d88480 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -3218,6 +3218,7 @@ static int vxge_hwtstamp_set(struct vxgedev *vdev, void __user *data) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: if (vdev->devh->config.hwts_en != VXGE_HW_HWTS_ENABLE) return -EFAULT; diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c index 24f06e2ef43e..9b2280badaf7 
100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c @@ -244,6 +244,7 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev) break; case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_NTP_ALL: ptp->rx_filter = HWTSTAMP_FILTER_NONE; rx_filter = QED_PTP_FILTER_ALL; break; diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 78efb2822b86..ad9c4ded2b90 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -6068,6 +6068,7 @@ static int efx_ef10_ptp_set_ts_config(struct efx_nic *efx, case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: init->rx_filter = HWTSTAMP_FILTER_ALL; rc = efx_ptp_change_mode(efx, true, 0); if (!rc) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a74c481401c4..cce862b81f3e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -644,6 +644,7 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr) ptp_over_ethernet = PTP_TCR_TSIPENA; break; + case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: /* time stamp any incoming packet */ config.rx_filter = HWTSTAMP_FILTER_ALL; diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index f4d7aec50479..37fc16521143 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1734,6 +1734,7 @@ static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: return -ERANGE; case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 7c634bc75615..aec95382ea5c 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -512,6 +512,7 @@ static int tile_hwtstamp_set(struct net_device *dev, struct ifreq *rq) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 8f036a76b92e..77f04e71100f 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -225,9 +225,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - rx_filter_valid = 1; - break; case HWTSTAMP_FILTER_NTP_ALL: + rx_filter_valid = 1; break; } -- cgit v1.2.3 From 90b602f80397657429373ca009f98aec4dd3c553 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:37 +0200 Subject: net: add function to retrieve original skb device using NAPI ID Since commit b68581778cd0 ("net: Make skb->skb_iif always track skb->dev") skbs don't have the original index of the interface which received the packet. This information is now needed for a new control message related to hardware timestamping. Instead of adding a new field to skb, we can find the device by the NAPI ID if it is available, i.e. CONFIG_NET_RX_BUSY_POLL is enabled and the driver is using NAPI. 
Add dev_get_by_napi_id() and also skb_napi_id() to hide the CONFIG_NET_RX_BUSY_POLL ifdef. CC: Richard Cochran Suggested-by: Willem de Bruijn Acked-by: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 9 +++++++++ net/core/dev.c | 26 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f8f7cd52a0a0..c50c9218e31e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2456,6 +2456,7 @@ static inline int dev_recursion_level(void) struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1713e4b7ea9f..8acce7143f6a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -858,6 +858,15 @@ static inline bool skb_pkt_type_ok(u32 ptype) return ptype <= PACKET_OTHERHOST; } +static inline unsigned int skb_napi_id(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + return skb->napi_id; +#else + return 0; +#endif +} + void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_tx_error(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index bb136f726890..3d98fbf4cbb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -163,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, struct netdev_notifier_info *info); +static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -866,6 +867,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index); +/** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (napi_id < MIN_NAPI_ID) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace -- cgit v1.2.3 From aad9c8c470f2a8321a99eb053630ce0e199558d6 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:38 +0200 Subject: net: add new control message for incoming HW-timestamped packets Add SOF_TIMESTAMPING_OPT_PKTINFO option to request a new control message for incoming packets with hardware timestamps. It contains the index of the real interface which received the packet and the length of the packet at layer 2. The index is useful with bonding, bridges and other interfaces, where IP_PKTINFO doesn't allow applications to determine which PHC made the timestamp. 
With the L2 length (and link speed) it is possible to transpose preamble timestamps to trailer timestamps, which are used in the NTP protocol. While this information could be provided by two new socket options independently from timestamping, it doesn't look like they would be very useful. With this option any performance impact is limited to hardware timestamping. Use dev_get_by_napi_id() to get the device and its index. On kernels with disabled CONFIG_NET_RX_BUSY_POLL or drivers not using NAPI, a zero index will be returned in the control message. CC: Richard Cochran Acked-by: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 10 ++++++++++ include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/net_tstamp.h | 11 ++++++++++- net/socket.c | 27 ++++++++++++++++++++++++++- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 96f50694a748..ce11e3a08c0d 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -193,6 +193,16 @@ SOF_TIMESTAMPING_OPT_STATS: the transmit timestamps, such as how long a certain block of data was limited by peer's receiver window. +SOF_TIMESTAMPING_OPT_PKTINFO: + + Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming + packets with hardware timestamps. The message contains struct + scm_ts_pktinfo, which supplies the index of the real interface which + received the packet and its length at layer 2. A valid (non-zero) + interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is + enabled and the driver is using NAPI. The struct contains also two + other fields, but they are reserved and undefined. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 2b488565599d..a5f6e819fafd 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -100,4 +100,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 0749fb13e517..dee74d39da94 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -9,6 +9,7 @@ #ifndef _NET_TIMESTAMPING_H #define _NET_TIMESTAMPING_H +#include #include /* for SO_TIMESTAMPING */ /* SO_TIMESTAMPING gets an integer bit field comprised of these values */ @@ -26,8 +27,9 @@ enum { SOF_TIMESTAMPING_OPT_CMSG = (1<<10), SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), + SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; @@ -130,4 +132,11 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_NTP_ALL, }; +/* SCM_TIMESTAMPING_PKTINFO control message */ +struct scm_ts_pktinfo { + __u32 if_index; + __u32 pkt_length; + __u32 reserved[2]; +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/socket.c b/net/socket.c index c2564eb25c6b..67db7d8a3b81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,27 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct scm_ts_pktinfo ts_pktinfo; + struct net_device *orig_dev; + + if (!skb_mac_header_was_set(skb)) + return; + + memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + ts_pktinfo.if_index = orig_dev->ifindex; + rcu_read_unlock(); + + ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, + sizeof(ts_pktinfo), &ts_pktinfo); +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -699,8 +720,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb); + } if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); -- cgit v1.2.3 From b50a5c70ffa4fd6b6da324ab54c84adf48fb17d9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:40 +0200 Subject: net: allow simultaneous SW and HW transmit timestamping Add SOF_TIMESTAMPING_OPT_TX_SWHW option to allow an outgoing packet to be looped to the socket's error queue with a software timestamp even when a hardware transmit timestamp is expected to be provided by the driver. Applications using this option will receive two separate messages from the error queue, one with a software timestamp and the other with a hardware timestamp. 
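A minimal application-side sketch for this option (assumed setup; the MSG_ERRQUEUE read loop itself is omitted): request both timestamps at once, then tell the two looped messages apart by which slot of scm_timestamping.ts[] is non-zero (ts[0] for software, ts[2] for raw hardware).

    #include <sys/socket.h>
    #include <linux/errqueue.h>
    #include <linux/net_tstamp.h>

    /* Ask for software and hardware TX timestamps on the same socket. */
    static int enable_tx_sw_and_hw_ts(int fd)
    {
            int flags = SOF_TIMESTAMPING_TX_HARDWARE |
                        SOF_TIMESTAMPING_TX_SOFTWARE |
                        SOF_TIMESTAMPING_RAW_HARDWARE |
                        SOF_TIMESTAMPING_SOFTWARE |
                        SOF_TIMESTAMPING_OPT_TX_SWHW;

            return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                              &flags, sizeof(flags));
    }

    /* For each SCM_TIMESTAMPING cmsg read from the error queue: */
    static const char *classify_tx_tstamp(const struct scm_timestamping *tss)
    {
            if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec)
                    return "software";
            if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec)
                    return "hardware";
            return "none";
    }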
As the hardware timestamp is saved to the shared skb info, which may happen before the first message with software timestamp is received by the application, the hardware timestamp is copied to the SCM_TIMESTAMPING control message only when the skb has no software timestamp or it is an incoming packet. While changing sw_tx_timestamp(), inline it in skb_tx_timestamp() as there are no other users. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 8 ++++++++ include/linux/skbuff.h | 10 ++-------- include/uapi/linux/net_tstamp.h | 3 ++- net/core/skbuff.c | 4 ++++ net/socket.c | 20 ++++++++++++++++++-- 5 files changed, 34 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 50eb0e554778..196ba17cc344 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -203,6 +203,14 @@ SOF_TIMESTAMPING_OPT_PKTINFO: enabled and the driver is using NAPI. The struct contains also two other fields, but they are reserved and undefined. +SOF_TIMESTAMPING_OPT_TX_SWHW: + + Request both hardware and software timestamps for outgoing packets + when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE + are enabled at the same time. If both timestamps are generated, + two separate messages will be looped to the socket's error queue, + each containing just one timestamp. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8acce7143f6a..45a59c1e0cc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3259,13 +3259,6 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); -static inline void sw_tx_timestamp(struct sk_buff *skb) -{ - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP && - !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) - skb_tstamp_tx(skb, NULL); -} - /** * skb_tx_timestamp() - Driver hook for transmit timestamping * @@ -3281,7 +3274,8 @@ static inline void sw_tx_timestamp(struct sk_buff *skb) static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); - sw_tx_timestamp(skb); + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + skb_tstamp_tx(skb, NULL); } /** diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index dee74d39da94..3d421d912193 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -28,8 +28,9 @@ enum { SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), + SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d5c98117cbce..780b7c1563d0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3901,6 +3901,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + tsonly = 
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; diff --git a/net/socket.c b/net/socket.c index 67db7d8a3b81..cb355a7ef135 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,19 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +/* On transmit, software and hardware timestamps are returned independently. + * As the two skb clones share the hardware timestamp, which may be updated + * before the software timestamp is received, a hardware TX timestamp may be + * returned only if there is no software TX timestamp. Ignore false software + * timestamps, which may be made in the __sock_recv_timestamp() call when the + * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a + * hardware timestamp. + */ +static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) +{ + return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); +} + static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct scm_ts_pktinfo ts_pktinfo; @@ -691,14 +704,16 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); struct scm_timestamping tss; - int empty = 1; + int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (need_software_tstamp && skb->tstamp == 0) + if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); + false_tstamp = 1; + } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { @@ -720,6 +735,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + !skb_is_swtx_tstamp(skb, false_tstamp) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && -- cgit v1.2.3 From 332b4fc88698dd0429924a61e09d1734940d80a0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:48 -0700 Subject: dcb: enforce minimum length on IEEE_APPS attribute Found by reviewing the warning about unused policy table. The code implies that it meant to check for size, but since it unrolled the loop for attribute validation that is never used. Instead do explicit check for attribute. Compile tested only. Needs review by original author. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 93106120f987..733f523707ac 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -178,10 +178,6 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, }; -static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { - [DCB_ATTR_IEEE_APP] = {.len = sizeof(struct dcb_app)}, -}; - /* DCB number of traffic classes nested attributes. 
*/ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, @@ -1463,8 +1459,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { struct dcb_app *app_data; + if (nla_type(attr) != DCB_ATTR_IEEE_APP) continue; + + if (nla_len(attr) < sizeof(struct dcb_app)) { + err = -ERANGE; + goto err; + } + app_data = nla_data(attr); if (ops->ieee_setapp) err = ops->ieee_setapp(netdev, app_data); -- cgit v1.2.3 From 9e7b19c51681f041af418ee87e5bc7b4b67e3318 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:49 -0700 Subject: ila: propagate error code in ila_output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This warning: net/ipv6/ila/ila_lwt.c: In function ‘ila_output’: net/ipv6/ila/ila_lwt.c:42:6: warning: variable ‘err’ set but not used [-Wunused-but-set-variable] It looks like the code attempts to set propagate different error values, but always returned -EINVAL. Compile tested only. Needs review by original author. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv6/ila/ila_lwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index b3df03e3faa0..f4a413aba423 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -91,7 +91,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); - return -EINVAL; + return err; } static int ila_input(struct sk_buff *skb) -- cgit v1.2.3 From 9691724e5658dfb19d747b00bf34ce9df0d1b20b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:51 -0700 Subject: inet: fix warning about missing prototype The prototype for inet_rcv_saddr_equal was not being included. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1054d330bf9d..82dec8825d28 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; -- cgit v1.2.3 From c718c6d66b249954d38eebe74724229f635fa655 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:52 -0700 Subject: tcpnv: do not export local function The TCP New Vegas congestion control was exporting an internal function tcpnv_get_info which is not used by any other in tree kernel code. Make it static. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_nv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 5de82a8d4d87..6d650ed3cb59 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -424,8 +424,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* Extract info for Tcp socket info provided via netlink */ -size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) +static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct tcpnv *ca = inet_csk_ca(sk); @@ -440,7 +440,6 @@ size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, } return 0; } -EXPORT_SYMBOL_GPL(tcpnv_get_info); static struct tcp_congestion_ops tcpnv __read_mostly = { .init = tcpnv_init, -- cgit v1.2.3 From 9dc621afa8d673d4bdaab2d850699cd98a50b14a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:54 -0700 Subject: fou: make local function static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The build header functions are not used by any other code. net/ipv6/fou6.c:36:5: warning: no previous prototype for ‘fou6_build_header’ [-Wmissing-prototypes] net/ipv6/fou6.c:54:5: warning: no previous prototype for ‘gue6_build_header’ [-Wmissing-prototypes] Need to do some code rearranging to satisfy different Kconfig possiblities. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fou.c | 82 ++++++++++++++++++++++++++++----------------------------- net/ipv6/fou6.c | 14 +++++----- 2 files changed, 47 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 805f6607f8d9..8e0257d01200 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -859,25 +860,6 @@ size_t gue_encap_hlen(struct ip_tunnel_encap *e) } EXPORT_SYMBOL(gue_encap_hlen); -static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, - struct flowi4 *fl4, u8 *protocol, __be16 sport) -{ - struct udphdr *uh; - - skb_push(skb, sizeof(struct udphdr)); - skb_reset_transport_header(skb); - - uh = udp_hdr(skb); - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; -} - int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -894,24 +876,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__fou_build_header); -int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) -{ - int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? 
SKB_GSO_UDP_TUNNEL_CSUM : - SKB_GSO_UDP_TUNNEL; - __be16 sport; - int err; - - err = __fou_build_header(skb, e, protocol, &sport, type); - if (err) - return err; - - fou_build_udp(skb, e, fl4, protocol, sport); - - return 0; -} -EXPORT_SYMBOL(fou_build_header); - int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -985,8 +949,46 @@ int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__gue_build_header); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; + __be16 sport; + int err; + + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} + +static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -1001,9 +1003,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(gue_build_header); -#ifdef CONFIG_NET_FOU_IP_TUNNELS static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 9ea249b9451e..6de3c04b0f30 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -14,6 +14,8 @@ #include #include +#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) + static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi6 *fl6, u8 *protocol, __be16 sport) { @@ -33,8 +35,8 @@ static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, *protocol = IPPROTO_UDP; } -int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi6 *fl6) +static int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; @@ -49,10 +51,9 @@ int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(fou6_build_header); -int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi6 *fl6) +static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; @@ -67,9 +68,6 @@ int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(gue6_build_header); - -#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, -- cgit v1.2.3 From 0a9fc39e4105350601e59f2914d445f373fcdd8b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:55:55 -0700 Subject: ipv6: drop unused variables in seg6_genl_dumphac THe seg6_pernet_data variable was set 
but never used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5f44ffed2576..15fba55e3da8 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -303,13 +303,9 @@ static int seg6_genl_dumphmac_done(struct netlink_callback *cb) static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; - struct net *net = sock_net(skb->sk); - struct seg6_pernet_data *sdata; struct seg6_hmac_info *hinfo; int ret; - sdata = seg6_pernet(net); - ret = rhashtable_walk_start(iter); if (ret && ret != -EAGAIN) goto done; -- cgit v1.2.3 From 4ab688793e086ef6d1744a0f803fe9770a1ae5d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 21 May 2017 10:39:00 -0700 Subject: tcp: fix tcp_probe_timer() for TCP_USER_TIMEOUT TCP_USER_TIMEOUT is still converted to jiffies value in icsk_user_timeout So we need to make a conversion for the cases HZ != 1000 Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 27a667bce806..c4a35ba7f8ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -341,7 +341,8 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > + jiffies_to_msecs(icsk->icsk_user_timeout)) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; -- cgit v1.2.3 From 5d4acfc1411050d9a105a04bea4915a68d41071d Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 22 May 2017 08:42:28 +0200 Subject: Bluetooth: Delete error messages for failed memory allocations in two functions Omit two extra messages for memory allocation failures in these functions. This issue was detected by using the Coccinelle software. 
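To make the unit mismatch behind the tcp_probe_timer() fix above concrete, a short worked example with assumed values (HZ=250, TCP_USER_TIMEOUT=30000 ms):

    icsk_user_timeout             = msecs_to_jiffies(30000) = 7500 jiffies
    tcp_time_stamp(tp) - start_ts = elapsed time in milliseconds (1 ms clock)

Comparing the millisecond delta directly against 7500 would trigger the abort after roughly 7.5 seconds; jiffies_to_msecs(7500) = 30000 restores the intended 30 second timeout on any kernel built with HZ != 1000.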
Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Signed-off-by: Markus Elfring Signed-off-by: Marcel Holtmann --- net/bluetooth/ecdh_helper.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index 24d4e60f8c48..c7b1a9aee579 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -89,11 +89,9 @@ bool compute_ecdh_secret(const u8 public_key[64], const u8 private_key[32], p.curve_id = ECC_CURVE_NIST_P256; buf_len = crypto_ecdh_key_len(&p); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - pr_err("alg: kpp: Failed to allocate %d bytes for buf\n", - buf_len); + if (!buf) goto free_req; - } + crypto_ecdh_encode_key(buf, buf_len, &p); /* Set A private Key */ @@ -170,11 +168,8 @@ bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]) p.key_size = 32; buf_len = crypto_ecdh_key_len(&p); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - pr_err("alg: kpp: Failed to allocate %d bytes for buf\n", - buf_len); + if (!buf) goto free_req; - } do { if (tries++ >= max_tries) -- cgit v1.2.3 From 232cd35d0804cc241eb887bb8d4d9b3b9881c64a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 May 2017 14:17:48 -0700 Subject: ipv6: fix out of bound writes in __ip6_append_data() Andrey Konovalov and idaifish@gmail.com reported crashes caused by one skb shared_info being overwritten from __ip6_append_data() Andrey program lead to following state : copy -4200 datalen 2000 fraglen 2040 maxfraglen 2040 alloclen 2048 transhdrlen 0 offset 0 fraggap 6200 The skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); is overwriting skb->head and skb_shared_info Since we apparently detect this rare condition too late, move the code earlier to even avoid allocating skb and risking crashes. Once again, many thanks to Andrey and syzkaller team. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Reported-by: Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d4a31becbd25..bf8a58a1c32d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1466,6 +1466,11 @@ alloc_new_skb: */ alloclen += sizeof(struct frag_hdr); + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + goto error; + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, @@ -1515,13 +1520,9 @@ alloc_new_skb: data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; - - if (copy < 0) { - err = -EINVAL; - kfree_skb(skb); - goto error; - } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + getfrag(from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; -- cgit v1.2.3 From 6d8422a175ccf2846d9460ed2b6228fe0b12c243 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 May 2017 10:12:02 -0600 Subject: net: ipv4: Plumb extack through route add functions Plumb extack argument down to route add functions. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 16 +++++++++------- net/ipv4/fib_lookup.h | 3 ++- net/ipv4/fib_semantics.c | 22 +++++++++++++--------- net/ipv4/fib_trie.c | 4 ++-- 5 files changed, 28 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c5758b33..42e8b8f55f7c 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -263,7 +263,8 @@ struct fib_table { int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); -int fib_table_insert(struct net *, struct fib_table *, struct fib_config *); +int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, + struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 83e3ed258467..511edff76c01 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -594,7 +594,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, + &cfg, NULL); else err = -ENOBUFS; } @@ -626,14 +627,15 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_config *cfg) + struct nlmsghdr *nlh, struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -718,7 +720,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; @@ -741,7 +743,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; @@ -751,7 +753,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, &cfg, extack); errout: return err; } @@ -845,7 +847,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - fib_table_insert(net, tb, &cfg); + fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 9c02920725db..2704e08545da 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,7 +28,8 @@ static inline void fib_alias_accessed(struct fib_alias *fa) /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); -struct fib_info *fib_create_info(struct fib_config *cfg); +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack); int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 
da449ddb8cc1..8587d1b55b53 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -454,7 +454,8 @@ static int fib_detect_death(struct fib_info *fi, int order, #ifdef CONFIG_IP_ROUTE_MULTIPATH -static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) +static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, + struct netlink_ext_ack *extack) { int nhs = 0; @@ -468,7 +469,8 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) } static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, - int remaining, struct fib_config *cfg) + int remaining, struct fib_config *cfg, + struct netlink_ext_ack *extack) { int ret; @@ -714,7 +716,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) * |-> {local prefix} (terminal node) */ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh) + struct fib_nh *nh, struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -797,7 +799,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) return -EINVAL; - rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); @@ -980,7 +981,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) return 0; } -struct fib_info *fib_create_info(struct fib_config *cfg) +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; @@ -1000,7 +1002,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { - nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); + nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } @@ -1062,7 +1064,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); if (err != 0) goto failure; if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) @@ -1129,7 +1131,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ - if (nhs != 1 || nh->nh_gw) + if (nhs != 1) + goto err_inval; + if (nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); @@ -1140,7 +1144,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh); + err = fib_check_nh(cfg, fi, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 51182ff2b441..6d0f6c79d9aa 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1101,7 +1101,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, /* Caller must hold RTNL. 
*/ int fib_table_insert(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; @@ -1125,7 +1125,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - fi = fib_create_info(cfg); + fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; -- cgit v1.2.3 From c3ab2b4ec8f7c0700bf10957171c479bf3dbca52 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 May 2017 10:12:03 -0600 Subject: net: ipv4: Add extack messages for route add failures Add messages for non-obvious errors (e.g, no need to add text for malloc failures or ENODEV failures). This mostly covers the annoying EINVAL errors Some message strings violate the 80-columns but searchable strings need to trump that rule. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netlink.h | 5 +++ net/ipv4/fib_frontend.c | 2 + net/ipv4/fib_semantics.c | 115 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 100 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 5fff5ba5964e..a68aad484c69 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -97,6 +97,11 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) +#define NL_SET_BAD_ATTR(extack, attr) do { \ + if ((extack)) \ + (extack)->bad_attr = (attr); \ +} while (0) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 511edff76c01..14d2f7bd7c76 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -656,6 +656,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } @@ -726,6 +727,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, tb = fib_get_table(net, cfg.fc_table); if (!tb) { + NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8587d1b55b53..4852e183afe0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -465,7 +466,13 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, } /* leftover implies invalid nexthop configuration, discard it */ - return remaining > 0 ? 
0 : nhs; + if (remaining > 0) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthops"); + nhs = 0; + } + + return nhs; } static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, @@ -477,11 +484,17 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, change_nexthops(fi) { int attrlen; - if (!rtnh_ok(rtnh, remaining)) + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; + } - if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; + } nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; @@ -507,8 +520,12 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); - if (!nla_entype) + if (!nla_entype) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Encap type is missing"); goto err_inval; + } ret = lwtunnel_build_state(nla_get_u16( nla_entype), @@ -729,16 +746,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) + if (cfg->fc_scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid scope"); return -EINVAL; + } dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; - if (!(dev->flags & IFF_UP)) + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, + "Nexthop device is not up"); return -ENETDOWN; + } addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); - if (addr_type != RTN_UNICAST) + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); return -EINVAL; + } if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; @@ -778,18 +804,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } if (err) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); rcu_read_unlock(); return err; } } err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; + } nh->nh_scope = res.scope; nh->nh_oif = FIB_RES_OIF(res); nh->nh_dev = dev = FIB_RES_DEV(res); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); goto out; + } dev_hold(dev); if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; @@ -797,16 +830,21 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } else { struct in_device *in_dev; - if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; + } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); if (!in_dev) goto out; err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; + } nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; @@ -994,11 +1032,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto err_inval; /* Fast check to catch the most weird cases */ - if (fib_props[cfg->fc_type].scope > cfg->fc_scope) + if 
(fib_props[cfg->fc_type].scope > cfg->fc_scope) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } - if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { @@ -1067,15 +1110,26 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); if (err != 0) goto failure; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) + if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { + NL_SET_ERR_MSG(extack, + "Nexthop device index does not match RTA_OIF"); goto err_inval; - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) + } + if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + NL_SET_ERR_MSG(extack, + "Nexthop class id does not match RTA_FLOW"); goto err_inval; + } #endif #else + NL_SET_ERR_MSG(extack, + "Multipath support not enabled in kernel"); goto err_inval; #endif } else { @@ -1084,8 +1138,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; - if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, + "LWT encap type not specified"); goto err_inval; + } err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, &lwtstate); @@ -1108,8 +1165,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) + if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Gateway, device and multipath can not be specified for this route type"); goto err_inval; + } goto link_it; } else { switch (cfg->fc_type) { @@ -1120,21 +1180,30 @@ struct fib_info *fib_create_info(struct fib_config *cfg, case RTN_MULTICAST: break; default: + NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } - if (cfg->fc_scope > RT_SCOPE_HOST) + if (cfg->fc_scope > RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. 
*/ - if (nhs != 1) + if (nhs != 1) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have multiple nexthops"); goto err_inval; - if (nh->nh_gw) + } + if (nh->nh_gw) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); goto err_inval; + } nh->nh_scope = RT_SCOPE_NOWHERE; nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); err = -ENODEV; @@ -1154,8 +1223,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { + NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; + } change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); -- cgit v1.2.3 From 333c430167c21b96de81a674fa6cbe84b09475dc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 May 2017 10:12:04 -0600 Subject: net: ipv6: Plumb extack through route add functions Plumb extack argument down to route add functions. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- include/net/ip6_route.h | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6_fib.c | 14 +++++++----- net/ipv6/route.c | 57 +++++++++++++++++++++++++++---------------------- 5 files changed, 46 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c979c878df1c..aa50e2e6fa2a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -277,7 +277,8 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc); + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f5e625f53367..f3da9dd2a8db 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -90,7 +90,7 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg); +int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a4fb1e629fb..25443fd946a8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2280,7 +2280,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); } @@ -2335,7 +2335,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d4bf2c68a545..c1197e167d3e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -473,7 +473,8 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum) + int replace_required, int sernum, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -964,7 +965,8 @@ void fib6_force_start_gc(struct net *net) */ int 
fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; @@ -987,7 +989,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum); + replace_required, sernum, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1028,7 +1030,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum); + allow_create, replace_required, sernum, + extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1047,7 +1050,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum); + allow_create, replace_required, sernum, + extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc61b0b5e64e..ca754ec4054a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -938,14 +938,15 @@ EXPORT_SYMBOL(rt6_lookup); */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc) + struct mx6_config *mxc, + struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc); + err = fib6_add(&table->tb6_root, rt, info, mxc, extack); write_unlock_bh(&table->tb6_lock); return err; @@ -956,7 +957,7 @@ int ip6_ins_rt(struct rt6_info *rt) struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; struct mx6_config mxc = { .mx = NULL, }; - return __ip6_ins_rt(rt, &info, &mxc); + return __ip6_ins_rt(rt, &info, &mxc, NULL); } static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, @@ -1844,7 +1845,8 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; @@ -2111,13 +2113,14 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_add(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct mx6_config mxc = { .mx = NULL, }; struct rt6_info *rt; int err; - rt = ip6_route_info_create(cfg); + rt = ip6_route_info_create(cfg, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2128,7 +2131,7 @@ int ip6_route_add(struct fib6_config *cfg) if (err) goto out; - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); kfree(mxc.mx); @@ -2222,7 +2225,8 @@ out_put: return err; } -static int ip6_route_del(struct fib6_config *cfg) +static int ip6_route_del(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_node *fn; @@ -2483,7 +2487,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } @@ -2529,7 +2533,7 @@ struct rt6_info 
*rt6_add_dflt_router(const struct in6_addr *gwaddr, cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg)) { + if (!ip6_route_add(&cfg, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -2622,10 +2626,10 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg); + err = ip6_route_add(&cfg, NULL); break; case SIOCDELRT: - err = ip6_route_del(&cfg); + err = ip6_route_del(&cfg, NULL); break; default: err = -EINVAL; @@ -2903,7 +2907,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, - struct fib6_config *cfg) + struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; @@ -3097,7 +3102,8 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } -static int ip6_route_multipath_add(struct fib6_config *cfg) +static int ip6_route_multipath_add(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct rt6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; @@ -3145,7 +3151,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) r_cfg.fc_encap_type = nla_get_u16(nla); } - rt = ip6_route_info_create(&r_cfg); + rt = ip6_route_info_create(&r_cfg, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -3170,7 +3176,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc); + err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -3212,7 +3218,7 @@ add_errout: list_for_each_entry(nh, &rt6_nh_list, next) { if (err_nh == nh) break; - ip6_route_del(&nh->r_cfg); + ip6_route_del(&nh->r_cfg, extack); } cleanup: @@ -3227,7 +3233,8 @@ cleanup: return err; } -static int ip6_route_multipath_del(struct fib6_config *cfg) +static int ip6_route_multipath_del(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; @@ -3254,7 +3261,7 @@ static int ip6_route_multipath_del(struct fib6_config *cfg) r_cfg.fc_flags |= RTF_GATEWAY; } } - err = ip6_route_del(&r_cfg); + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; @@ -3270,15 +3277,15 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config cfg; int err; - err = rtm_to_fib6_config(skb, nlh, &cfg); + err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_mp) - return ip6_route_multipath_del(&cfg); + return ip6_route_multipath_del(&cfg, extack); else { cfg.fc_delete_all_nh = 1; - return ip6_route_del(&cfg); + return ip6_route_del(&cfg, extack); } } @@ -3288,14 +3295,14 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config cfg; int err; - err = rtm_to_fib6_config(skb, nlh, &cfg); + err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_mp) - return ip6_route_multipath_add(&cfg); + return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg); + return ip6_route_add(&cfg, extack); } static size_t rt6_nlmsg_size(struct rt6_info *rt) -- cgit v1.2.3 From d5d531cb50a848b9f6767fcd7ef0c7767b3e9b21 Mon Sep 17 00:00:00 2001 From: David Ahern Date: 
Sun, 21 May 2017 10:12:05 -0600 Subject: net: ipv6: Add extack messages for route add failures Add messages for non-obvious errors (e.g, no need to add text for malloc failures or ENODEV failures). This mostly covers the annoying EINVAL errors Some message strings violate the 80-columns but searchable strings need to trump that rule. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++++ net/ipv6/route.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c1197e167d3e..deea901746c8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -498,6 +498,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } @@ -544,6 +546,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, * That would keep IPv6 consistent with IPv4 */ if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ca754ec4054a..80bda31ffbbe 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1857,14 +1857,25 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, int err = -EINVAL; /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto out; + } - if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) + if (cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto out; + } + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); goto out; + } #ifndef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len) + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); goto out; + } #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -2015,9 +2026,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, err = -EINVAL; if (ipv6_chk_addr_and_flags(net, gw_addr, gwa_type & IPV6_ADDR_LINKLOCAL ? 
- dev : NULL, 0, 0)) + dev : NULL, 0, 0)) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; - + } rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { @@ -2033,8 +2045,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | - IPV6_ADDR_MAPPED))) + IPV6_ADDR_MAPPED))) { + NL_SET_ERR_MSG(extack, + "Invalid gateway address"); goto out; + } if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); @@ -2074,8 +2089,14 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } err = -EINVAL; - if (!dev || (dev->flags & IFF_LOOPBACK)) + if (!dev) { + NL_SET_ERR_MSG(extack, "Egress device not specified"); + goto out; + } else if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, + "Egress device can not be loopback device for this route"); goto out; + } } err = -ENODEV; @@ -2084,6 +2105,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_addr_any(&cfg->fc_prefsrc)) { if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out; } @@ -2234,8 +2256,10 @@ static int ip6_route_del(struct fib6_config *cfg, int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); - if (!table) + if (!table) { + NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; + } read_lock_bh(&table->tb6_lock); -- cgit v1.2.3 From 241c4667fcf3b64f84a892e2b656027d85e73e6b Mon Sep 17 00:00:00 2001 From: "Rosen, Rami" Date: Sun, 21 May 2017 22:12:38 +0300 Subject: net: socket: fix a typo in sockfd_lookup(). This patch fixes a typo in sockfd_lookup() in net/socket.c. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index cb355a7ef135..8f9dab330d57 100644 --- a/net/socket.c +++ b/net/socket.c @@ -461,7 +461,7 @@ EXPORT_SYMBOL(sock_from_file); * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound - * too is returned. If an error occurs the err pointer is overwritten + * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * -- cgit v1.2.3 From a777f715caf0ff95652a872d1af24942cf5002fc Mon Sep 17 00:00:00 2001 From: Rohit Chavan Date: Mon, 22 May 2017 11:59:15 +0530 Subject: net: ipv4: tcp: fixed comment coding style issue Fixed a coding style issue Signed-off-by: Rohit Chavan Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b5d18484746d..9a56077eafea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2183,7 +2183,7 @@ adjudge_to_death: /* Now socket is owned by kernel and we acquire BH lock - to finish close. No need to check for user refs. + * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); @@ -2471,7 +2471,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. 
However * at the point when this call is done we typically don't yet - * know which interface is going to be used */ + * know which interface is going to be used + */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; -- cgit v1.2.3 From 499fde662f1957e3cb8d192a94a099ebe19c714b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 19 May 2017 11:21:59 -0700 Subject: vsock: use new wait API for vsock_stream_sendmsg() As reported by Michal, vsock_stream_sendmsg() could still sleep at vsock_stream_has_space() after prepare_to_wait(): vsock_stream_has_space vmci_transport_stream_has_space vmci_qpair_produce_free_space qp_lock qp_acquire_queue_mutex mutex_lock Just switch to the new wait API like we did for commit d9dc8b0f8b4e ("net: fix sleeping for sk_wait_event()"). Reported-by: Michal Kubecek Cc: Stefan Hajnoczi Cc: Jorgen Hansen Cc: "Michael S. Tsirkin" Cc: Claudio Imbrenda Signed-off-by: Cong Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 6f7f6757ceef..dfc8c51e4d74 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1540,8 +1540,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, long timeout; int err; struct vsock_transport_send_notify_data send_data; - - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); @@ -1584,11 +1583,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - while (total_written < len) { ssize_t written; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1597,33 +1595,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); - timeout = schedule_timeout(timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after -- cgit v1.2.3 From bd080488a6cfd37135becedfdc87643b139c2345 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 19 May 2017 19:30:43 +0200 Subject: bridge: fix hello and hold timers starting/stopping Current bridge code incorrectly handles starting/stopping of hello and hold timers during STP enable/disable. 1. Timers are stopped in br_stp_start() during NO_STP->USER_STP transition. The timers are already stopped in NO_STP state so this is confusing no-op. 2. 
During USER_STP->NO_STP transition the timers are started. This does not make sense and is confusing because the timer should not be active in NO_STP state. Cc: davem@davemloft.net Cc: sashok@cumulusnetworks.com Cc: stephen@networkplumber.org Cc: bridge@lists.linux-foundation.org Cc: lucien.xin@gmail.com Cc: nikolay@cumulusnetworks.com Signed-off-by: Ivan Vecera Reviewed-by: Xin Long Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 08341d2aa9c9..a05027027513 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -150,7 +150,6 @@ static int br_stp_call_user(struct net_bridge *br, char *arg) static void br_stp_start(struct net_bridge *br) { - struct net_bridge_port *p; int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) @@ -169,11 +168,6 @@ static void br_stp_start(struct net_bridge *br) if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); - - /* Stop hello and hold timers */ - del_timer(&br->hello_timer); - list_for_each_entry(p, &br->port_list, list) - del_timer(&p->hold_timer); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); @@ -187,7 +181,6 @@ static void br_stp_start(struct net_bridge *br) static void br_stp_stop(struct net_bridge *br) { - struct net_bridge_port *p; int err; if (br->stp_enabled == BR_USER_STP) { @@ -196,10 +189,6 @@ static void br_stp_stop(struct net_bridge *br) br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ - mod_timer(&br->hello_timer, jiffies + br->hello_time); - list_for_each_entry(p, &br->port_list, list) - mod_timer(&p->hold_timer, - round_jiffies(jiffies + BR_HOLD_TIME)); spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); -- cgit v1.2.3 From 2d76b2f8b54abd16225cd80afca36ed43f113c41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 22 May 2017 16:46:13 +0200 Subject: net: sched: cls_matchall: fix null pointer dereference Since the head is guaranteed by the check above to be null, the call_rcu would explode. Remove the previously logically dead code that was made logically very much alive and kicking. Fixes: 985538eee06f ("net/sched: remove redundant null check on head") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index dee469fed967..51859b8edd7e 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,7 +203,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: -- cgit v1.2.3 From fd364541319749c9880bf3ad613eac80fc6ea91e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:36 -0400 Subject: net: dsa: change scope of STP state setter Instead of having multiple STP state helpers scoping a slave device supporting both the DSA logic and the switchdev binding, provide a single dsa_port_set_state helper scoping a DSA port, as well as its dsa_port_set_state_now wrapper which skips the prepare phase. This allows us to better separate the DSA logic from the slave device handling. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- net/dsa/slave.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 91236d602301..403d1dfe7f50 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -85,13 +85,15 @@ static inline bool dsa_port_is_bridged(struct dsa_port *dp) return !!dp->bridge_dev; } -static void dsa_slave_set_state(struct net_device *dev, u8 state) +static int dsa_port_set_state(struct dsa_port *dp, u8 state, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; struct dsa_switch *ds = dp->ds; int port = dp->index; + if (switchdev_trans_ph_prepare(trans)) + return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; + if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); @@ -110,6 +112,17 @@ static void dsa_slave_set_state(struct net_device *dev, u8 state) } dp->stp_state = state; + + return 0; +} + +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +{ + int err; + + err = dsa_port_set_state(dp, state, NULL); + if (err) + pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } static int dsa_slave_open(struct net_device *dev) @@ -147,7 +160,7 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } - dsa_slave_set_state(dev, stp_state); + dsa_port_set_state_now(p->dp, stp_state); if (p->phy) phy_start(p->phy); @@ -189,7 +202,7 @@ static int dsa_slave_close(struct net_device *dev) if (ds->ops->port_disable) ds->ops->port_disable(ds, p->dp->index, p->phy); - dsa_slave_set_state(dev, BR_STATE_DISABLED); + dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); return 0; } @@ -386,21 +399,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static int dsa_slave_stp_state_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_stp_state_set ? 
0 : -EOPNOTSUPP; - - dsa_slave_set_state(dev, attr->u.stp_state); - - return 0; -} - static int dsa_slave_vlan_filtering(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -465,11 +463,13 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int ret; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ret = dsa_slave_stp_state_set(dev, attr, trans); + ret = dsa_port_set_state(dp, attr->u.stp_state, trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_slave_vlan_filtering(dev, attr, trans); @@ -621,7 +621,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev, /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_slave_set_state(dev, BR_STATE_FORWARDING); + dsa_port_set_state_now(p->dp, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, -- cgit v1.2.3 From a93ecdd9484a51e830249ee9877ac1c1544e75ac Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:37 -0400 Subject: net: dsa: change scope of notifier call chain Change the scope of the fabric notification helper from the DSA slave to the DSA port, since this is a DSA layer specific notion, that can be used by non-slave ports (CPU and DSA). Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 403d1dfe7f50..371f6d267917 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -27,10 +27,9 @@ static bool dsa_slave_dev_check(struct net_device *dev); -static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v) +static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct raw_notifier_head *nh = &p->dp->ds->dst->nh; + struct raw_notifier_head *nh = &dp->ds->dst->nh; int err; err = raw_notifier_call_chain(nh, e, v); @@ -589,7 +588,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, */ p->dp->bridge_dev = br; - err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info); + err = dsa_port_notify(p->dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); /* The bridging is rolled back on error */ if (err) @@ -614,7 +613,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev, */ p->dp->bridge_dev = NULL; - err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + err = dsa_port_notify(p->dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); -- cgit v1.2.3 From 17d7802b7772ddcf505581fe22cffcd2e8b5120e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:38 -0400 Subject: net: dsa: change scope of bridging code Now that the bridge join and leave functions only deal with a DSA port, change their scope from the DSA slave net_device to the DSA generic dsa_port. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 371f6d267917..1ad62ef8c261 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -572,13 +572,11 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, return err; } -static int dsa_slave_bridge_port_join(struct net_device *dev, - struct net_device *br) +static int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { - struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_notifier_bridge_info info = { - .sw_index = p->dp->ds->index, - .port = p->dp->index, + .sw_index = dp->ds->index, + .port = dp->index, .br = br, }; int err; @@ -586,24 +584,22 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, /* Here the port is already bridged. Reflect the current configuration * so that drivers can program their chips accordingly. */ - p->dp->bridge_dev = br; + dp->bridge_dev = br; - err = dsa_port_notify(p->dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); /* The bridging is rolled back on error */ if (err) - p->dp->bridge_dev = NULL; + dp->bridge_dev = NULL; return err; } -static void dsa_slave_bridge_port_leave(struct net_device *dev, - struct net_device *br) +static void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { - struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_notifier_bridge_info info = { - .sw_index = p->dp->ds->index, - .port = p->dp->index, + .sw_index = dp->ds->index, + .port = dp->index, .br = br, }; int err; @@ -611,16 +607,16 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev, /* Here the port is already unbridged. Reflect the current configuration * so that drivers can program their chips accordingly. */ - p->dp->bridge_dev = NULL; + dp->bridge_dev = NULL; - err = dsa_port_notify(p->dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) - netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); + pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_port_set_state_now(p->dp, BR_STATE_FORWARDING); + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, @@ -1526,14 +1522,16 @@ static bool dsa_slave_dev_check(struct net_device *dev) static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err = NOTIFY_DONE; if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { - err = dsa_slave_bridge_port_join(dev, info->upper_dev); + err = dsa_port_bridge_join(dp, info->upper_dev); err = notifier_from_errno(err); } else { - dsa_slave_bridge_port_leave(dev, info->upper_dev); + dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } -- cgit v1.2.3 From 3fdb023b5e2bad17d0b66e8903de6e38b0c16ca2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:39 -0400 Subject: net: dsa: change scope of FDB handlers Change the scope of the switchdev FDB object handlers from the DSA slave device to the generic DSA port, so that the future port-wide API can also be used for other port types, such as CPU and DSA links. 
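For readers following the refactor, here is a minimal sketch of the calling pattern this commit (and the rest of the series) converges on: the switchdev entry point resolves the struct dsa_port from the slave net_device once, then dispatches to a port-scoped helper that only deals with struct dsa_port and the driver ops. The helper body mirrors the hunks in the patch below; the condensed dsa_slave_port_obj_add shown here is an assumption-laden simplification that handles only the FDB case and omits the MDB and VLAN object types, so treat it as an illustration rather than the exact patch. It also assumes the kernel-internal headers (net/dsa.h, net/switchdev.h, dsa_priv.h) that these files already include.

static int dsa_port_fdb_add(struct dsa_port *dp,
			    const struct switchdev_obj_port_fdb *fdb,
			    struct switchdev_trans *trans)
{
	struct dsa_switch *ds = dp->ds;

	if (switchdev_trans_ph_prepare(trans)) {
		/* Prepare phase: only report whether the driver can do it */
		if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
			return -EOPNOTSUPP;

		return ds->ops->port_fdb_prepare(ds, dp->index, fdb, trans);
	}

	/* Commit phase: the prepare phase already validated the ops */
	ds->ops->port_fdb_add(ds, dp->index, fdb, trans);

	return 0;
}

static int dsa_slave_port_obj_add(struct net_device *dev,
				  const struct switchdev_obj *obj,
				  struct switchdev_trans *trans)
{
	struct dsa_slave_priv *p = netdev_priv(dev);
	struct dsa_port *dp = p->dp;	/* resolve the port once */

	switch (obj->id) {
	case SWITCHDEV_OBJ_ID_PORT_FDB:
		return dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj), trans);
	default:
		return -EOPNOTSUPP;
	}
}

The two-phase switchdev transaction is preserved: the prepare call can still veto the operation per port, while the port-scoped helper no longer needs to know anything about the slave net_device, which is what later lets CPU and DSA ports reuse the same code.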
Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1ad62ef8c261..e9c3ea09cc09 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -299,47 +299,44 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, return -EOPNOTSUPP; } -static int dsa_slave_port_fdb_add(struct net_device *dev, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) +static int dsa_port_fdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans); + return ds->ops->port_fdb_prepare(ds, dp->index, fdb, trans); } - ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans); + ds->ops->port_fdb_add(ds, dp->index, fdb, trans); return 0; } -static int dsa_slave_port_fdb_del(struct net_device *dev, - const struct switchdev_obj_port_fdb *fdb) +static int dsa_port_fdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; if (ds->ops->port_fdb_del) - ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb); + ret = ds->ops->port_fdb_del(ds, dp->index, fdb); return ret; } -static int dsa_slave_port_fdb_dump(struct net_device *dev, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) +static int dsa_port_fdb_dump(struct dsa_port *dp, + struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (ds->ops->port_fdb_dump) - return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb); + return ds->ops->port_fdb_dump(ds, dp->index, fdb, cb); return -EOPNOTSUPP; } @@ -488,6 +485,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; /* For the prepare phase, ensure the full set of changes is feasable in @@ -497,9 +496,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_add(dev, - SWITCHDEV_OBJ_PORT_FDB(obj), - trans); + err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj), @@ -521,12 +518,13 @@ static int dsa_slave_port_obj_add(struct net_device *dev, static int dsa_slave_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_del(dev, - SWITCHDEV_OBJ_PORT_FDB(obj)); + err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); @@ -547,13 +545,13 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, struct 
switchdev_obj *obj, switchdev_obj_dump_cb_t *cb) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_dump(dev, - SWITCHDEV_OBJ_PORT_FDB(obj), - cb); + err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj), -- cgit v1.2.3 From bcebb976ec433e1f8a81d1a70db26420e85386aa Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:40 -0400 Subject: net: dsa: change scope of MDB handlers Change the scope of the switchdev MDB object handlers from the DSA slave device to the generic DSA port, so that the future port-wide API can also be used for other port types, such as CPU and DSA links. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e9c3ea09cc09..0921d306aedf 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -341,46 +341,43 @@ static int dsa_port_fdb_dump(struct dsa_port *dp, return -EOPNOTSUPP; } -static int dsa_slave_port_mdb_add(struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) +static int dsa_port_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) return -EOPNOTSUPP; - return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans); + return ds->ops->port_mdb_prepare(ds, dp->index, mdb, trans); } - ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans); + ds->ops->port_mdb_add(ds, dp->index, mdb, trans); return 0; } -static int dsa_slave_port_mdb_del(struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (ds->ops->port_mdb_del) - return ds->ops->port_mdb_del(ds, p->dp->index, mdb); + return ds->ops->port_mdb_del(ds, dp->index, mdb); return -EOPNOTSUPP; } -static int dsa_slave_port_mdb_dump(struct net_device *dev, - struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb) +static int dsa_port_mdb_dump(struct dsa_port *dp, + struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (ds->ops->port_mdb_dump) - return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb); + return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb); return -EOPNOTSUPP; } @@ -499,8 +496,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_add(dev, @@ -527,7 +523,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_fdb_del(dp, 
SWITCHDEV_OBJ_PORT_FDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_del(dev, @@ -554,8 +550,7 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj), - cb); + err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_dump(dev, -- cgit v1.2.3 From 01676d129c1a7645879a104cbe5ac43bfa3c25a4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:41 -0400 Subject: net: dsa: change scope of VLAN handlers Change the scope of the switchdev VLAN object handlers from the DSA slave device to the generic DSA port, so that the future port-wide API can also be used for other port types, such as CPU and DSA links. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0921d306aedf..de39da69fd33 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -254,12 +254,10 @@ out: return 0; } -static int dsa_slave_port_vlan_add(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) +static int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; struct dsa_switch *ds = dp->ds; if (switchdev_trans_ph_prepare(trans)) { @@ -274,27 +272,25 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, return 0; } -static int dsa_slave_port_vlan_del(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan) +static int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - return ds->ops->port_vlan_del(ds, p->dp->index, vlan); + return ds->ops->port_vlan_del(ds, dp->index, vlan); } -static int dsa_slave_port_vlan_dump(struct net_device *dev, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb) +static int dsa_port_vlan_dump(struct dsa_port *dp, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (ds->ops->port_vlan_dump) - return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb); + return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb); return -EOPNOTSUPP; } @@ -499,9 +495,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_port_vlan_add(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), + trans); break; default: err = -EOPNOTSUPP; @@ -526,8 +521,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = 
dsa_slave_port_vlan_del(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj)); + err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: err = -EOPNOTSUPP; @@ -553,9 +547,7 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_port_vlan_dump(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - cb); + err = dsa_port_vlan_dump(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), cb); break; default: err = -EOPNOTSUPP; -- cgit v1.2.3 From c02c4175cb9f434af8f20045dd43ae9e573c8da2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:42 -0400 Subject: net: dsa: change scope of VLAN filtering setter Change the scope of the switchdev VLAN filtering attribute setter from the DSA slave device to the generic DSA port, so that the future port-wide API can also be used for other port types, such as CPU and DSA links. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index de39da69fd33..216eb38a847d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -388,20 +388,18 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static int dsa_slave_vlan_filtering(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) +static int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ if (switchdev_trans_ph_prepare(trans)) return 0; if (ds->ops->port_vlan_filtering) - return ds->ops->port_vlan_filtering(ds, p->dp->index, - attr->u.vlan_filtering); + return ds->ops->port_vlan_filtering(ds, dp->index, + vlan_filtering); return 0; } @@ -461,7 +459,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = dsa_port_set_state(dp, attr->u.stp_state, trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ret = dsa_slave_vlan_filtering(dev, attr, trans); + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, + trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: ret = dsa_slave_ageing_time(dev, attr, trans); -- cgit v1.2.3 From 072bb1903a0ff810c6091d2f6bf7c80e76dab0e6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:43 -0400 Subject: net: dsa: change scope of ageing time setter Change the scope of the switchdev bridge ageing time attribute setter from the DSA slave device to the generic DSA port, so that the future port-wide API can also be used for other port types, such as CPU and DSA links. Also ds->ports is now a contiguous array of dsa_port structures, thus their addresses cannot be NULL. Remove the useless check in dsa_fastest_ageing_time. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 216eb38a847d..b0150f79dcdd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -412,21 +412,19 @@ static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, for (i = 0; i < ds->num_ports; ++i) { struct dsa_port *dp = &ds->ports[i]; - if (dp && dp->ageing_time && dp->ageing_time < ageing_time) + if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; } return ageing_time; } -static int dsa_slave_ageing_time(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) +static int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, + struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); + unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + struct dsa_switch *ds = dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) @@ -437,7 +435,7 @@ static int dsa_slave_ageing_time(struct net_device *dev, } /* Keep the fastest ageing time in case of multiple bridges */ - p->dp->ageing_time = ageing_time; + dp->ageing_time = ageing_time; ageing_time = dsa_fastest_ageing_time(ds, ageing_time); if (ds->ops->set_ageing_time) @@ -463,7 +461,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - ret = dsa_slave_ageing_time(dev, attr, trans); + ret = dsa_port_ageing_time(dp, attr->u.ageing_time, trans); break; default: ret = -EOPNOTSUPP; -- cgit v1.2.3 From a40c175b4a4a2c6f7e111ed6dc0186c75287dff0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:44 -0400 Subject: net: dsa: move port state setters Add a new port.c file to hold all DSA port-wide logic. This patch moves in the code which sets a port state. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/Makefile | 2 +- net/dsa/dsa_priv.h | 5 +++++ net/dsa/port.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 40 --------------------------------------- 4 files changed, 61 insertions(+), 41 deletions(-) create mode 100644 net/dsa/port.c (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index f8c0251d1f43..90e5aa6f7d0f 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o dsa2.o switch.o legacy.o +dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c274130e3ac9..cda218cd9b05 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -60,6 +60,11 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); int dsa_legacy_register(void); void dsa_legacy_unregister(void); +/* port.c */ +int dsa_port_set_state(struct dsa_port *dp, u8 state, + struct switchdev_trans *trans); +void dsa_port_set_state_now(struct dsa_port *dp, u8 state); + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); diff --git a/net/dsa/port.c b/net/dsa/port.c new file mode 100644 index 000000000000..6cc4704190fd --- /dev/null +++ b/net/dsa/port.c @@ -0,0 +1,55 @@ +/* + * Handling of a single switch port + * + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +#include "dsa_priv.h" + +int dsa_port_set_state(struct dsa_port *dp, u8 state, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (switchdev_trans_ph_prepare(trans)) + return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; + + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; + + return 0; +} + +void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +{ + int err; + + err = dsa_port_set_state(dp, state, NULL); + if (err) + pr_err("DSA: failed to set STP state %u (%d)\n", state, err); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b0150f79dcdd..2c57c7205aa3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -84,46 +84,6 @@ static inline bool dsa_port_is_bridged(struct dsa_port *dp) return !!dp->bridge_dev; } -static int dsa_port_set_state(struct dsa_port *dp, u8 state, - struct switchdev_trans *trans) -{ - struct dsa_switch *ds = dp->ds; - int port = dp->index; - - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_stp_state_set ? 
0 : -EOPNOTSUPP; - - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, port, state); - - if (ds->ops->port_fast_age) { - /* Fast age FDB entries or flush appropriate forwarding database - * for the given port, if we are moving it from Learning or - * Forwarding state, to Disabled or Blocking or Listening state. - */ - - if ((dp->stp_state == BR_STATE_LEARNING || - dp->stp_state == BR_STATE_FORWARDING) && - (state == BR_STATE_DISABLED || - state == BR_STATE_BLOCKING || - state == BR_STATE_LISTENING)) - ds->ops->port_fast_age(ds, port); - } - - dp->stp_state = state; - - return 0; -} - -static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) -{ - int err; - - err = dsa_port_set_state(dp, state, NULL); - if (err) - pr_err("DSA: failed to set STP state %u (%d)\n", state, err); -} - static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); -- cgit v1.2.3 From cfbed329beb2e44562c2c6b292142e3c9adc3203 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:45 -0400 Subject: net: dsa: move bridging routines Move the DSA port code which bridges a port in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 57 ----------------------------------------------------- 3 files changed, 60 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index cda218cd9b05..f0b6cd3c8a65 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -64,6 +64,8 @@ void dsa_legacy_unregister(void); int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); void dsa_port_set_state_now(struct dsa_port *dp, u8 state); +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); +void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 6cc4704190fd..da8577fb3d07 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -11,9 +11,20 @@ */ #include +#include #include "dsa_priv.h" +static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) +{ + struct raw_notifier_head *nh = &dp->ds->dst->nh; + int err; + + err = raw_notifier_call_chain(nh, e, v); + + return notifier_to_errno(err); +} + int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans) { @@ -53,3 +64,50 @@ void dsa_port_set_state_now(struct dsa_port *dp, u8 state) if (err) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } + +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) +{ + struct dsa_notifier_bridge_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .br = br, + }; + int err; + + /* Here the port is already bridged. Reflect the current configuration + * so that drivers can program their chips accordingly. + */ + dp->bridge_dev = br; + + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + + /* The bridging is rolled back on error */ + if (err) + dp->bridge_dev = NULL; + + return err; +} + +void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) +{ + struct dsa_notifier_bridge_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .br = br, + }; + int err; + + /* Here the port is already unbridged. 
Reflect the current configuration + * so that drivers can program their chips accordingly. + */ + dp->bridge_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); + + /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, + * so allow it to be in BR_STATE_FORWARDING to be kept functional + */ + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2c57c7205aa3..ab298c41b8e7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -27,16 +27,6 @@ static bool dsa_slave_dev_check(struct net_device *dev); -static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) -{ - struct raw_notifier_head *nh = &dp->ds->dst->nh; - int err; - - err = raw_notifier_call_chain(nh, e, v); - - return notifier_to_errno(err); -} - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -514,53 +504,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, return err; } -static int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) -{ - struct dsa_notifier_bridge_info info = { - .sw_index = dp->ds->index, - .port = dp->index, - .br = br, - }; - int err; - - /* Here the port is already bridged. Reflect the current configuration - * so that drivers can program their chips accordingly. - */ - dp->bridge_dev = br; - - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); - - /* The bridging is rolled back on error */ - if (err) - dp->bridge_dev = NULL; - - return err; -} - -static void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) -{ - struct dsa_notifier_bridge_info info = { - .sw_index = dp->ds->index, - .port = dp->index, - .br = br, - }; - int err; - - /* Here the port is already unbridged. Reflect the current configuration - * so that drivers can program their chips accordingly. - */ - dp->bridge_dev = NULL; - - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); - if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); - - /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, - * so allow it to be in BR_STATE_FORWARDING to be kept functional - */ - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); -} - static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { -- cgit v1.2.3 From 4d61d3043bef7b61e7c30276488ff310bee0d897 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:46 -0400 Subject: net: dsa: move VLAN filtering setter Move the DSA port code which sets VLAN filtering on a port in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 16 ++++++++++++++++ net/dsa/slave.c | 16 ---------------- 3 files changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f0b6cd3c8a65..c145223247c5 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,6 +66,8 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, void dsa_port_set_state_now(struct dsa_port *dp, u8 state); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct switchdev_trans *trans); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index da8577fb3d07..c9f95aaf25f1 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -111,3 +111,19 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } + +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + if (ds->ops->port_vlan_filtering) + return ds->ops->port_vlan_filtering(ds, dp->index, + vlan_filtering); + + return 0; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ab298c41b8e7..32e7e78313ba 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -338,22 +338,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct switchdev_trans *trans) -{ - struct dsa_switch *ds = dp->ds; - - /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ - if (switchdev_trans_ph_prepare(trans)) - return 0; - - if (ds->ops->port_vlan_filtering) - return ds->ops->port_vlan_filtering(ds, dp->index, - vlan_filtering); - - return 0; -} - static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) { -- cgit v1.2.3 From d87bd94e1c2006c1bb1d717020116940f9d0735a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:47 -0400 Subject: net: dsa: move ageing time setter Move the DSA port code which sets a port ageing time in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 40 ---------------------------------------- 3 files changed, 42 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c145223247c5..b0f9837bf5ed 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -68,6 +68,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, + struct switchdev_trans *trans); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index c9f95aaf25f1..3382fdc07a11 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -127,3 +127,43 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return 0; } + +static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < ds->num_ports; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, + struct switchdev_trans *trans) +{ + unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); + unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + struct dsa_switch *ds = dp->ds; + + if (switchdev_trans_ph_prepare(trans)) { + if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) + return -ERANGE; + if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) + return -ERANGE; + return 0; + } + + /* Keep the fastest ageing time in case of multiple bridges */ + dp->ageing_time = ageing_time; + ageing_time = dsa_fastest_ageing_time(ds, ageing_time); + + if (ds->ops->set_ageing_time) + return ds->ops->set_ageing_time(ds, ageing_time); + + return 0; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 32e7e78313ba..1b0f396c4314 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -338,46 +338,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, - unsigned int ageing_time) -{ - int i; - - for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; - - if (dp->ageing_time && dp->ageing_time < ageing_time) - ageing_time = dp->ageing_time; - } - - return ageing_time; -} - -static int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, - struct switchdev_trans *trans) -{ - unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); - unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) - return -ERANGE; - if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) - return -ERANGE; - return 0; - } - - /* Keep the fastest ageing time in case of multiple bridges */ - dp->ageing_time = ageing_time; - ageing_time = dsa_fastest_ageing_time(ds, ageing_time); - - if (ds->ops->set_ageing_time) - return ds->ops->set_ageing_time(ds, ageing_time); - - return 0; -} - static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans 
*trans) -- cgit v1.2.3 From d1cffff008dc2f238bfd0700c0f5027980089510 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:48 -0400 Subject: net: dsa: move FDB handlers Move the DSA port code which handles FDB objects in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 7 +++++++ net/dsa/port.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 42 ------------------------------------------ 3 files changed, 47 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b0f9837bf5ed..d003a2554c7a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -70,6 +70,13 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); +int dsa_port_fdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans); +int dsa_port_fdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb); +int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 3382fdc07a11..18ec6d432152 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -167,3 +167,43 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, return 0; } + +int dsa_port_fdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_prepare(ds, dp->index, fdb, trans); + } + + ds->ops->port_fdb_add(ds, dp->index, fdb, trans); + + return 0; +} + +int dsa_port_fdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_del(ds, dp->index, fdb); +} + +int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_fdb_dump) + return ds->ops->port_fdb_dump(ds, dp->index, fdb, cb); + + return -EOPNOTSUPP; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1b0f396c4314..d9b7bf759f44 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -245,48 +245,6 @@ static int dsa_port_vlan_dump(struct dsa_port *dp, return -EOPNOTSUPP; } -static int dsa_port_fdb_add(struct dsa_port *dp, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) -{ - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_fdb_prepare(ds, dp->index, fdb, trans); - } - - ds->ops->port_fdb_add(ds, dp->index, fdb, trans); - - return 0; -} - -static int dsa_port_fdb_del(struct dsa_port *dp, - const struct switchdev_obj_port_fdb *fdb) -{ - struct dsa_switch *ds = dp->ds; - int ret = -EOPNOTSUPP; - - if (ds->ops->port_fdb_del) - ret = ds->ops->port_fdb_del(ds, dp->index, fdb); - - return ret; -} - -static int dsa_port_fdb_dump(struct dsa_port *dp, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) -{ - struct 
dsa_switch *ds = dp->ds; - - if (ds->ops->port_fdb_dump) - return ds->ops->port_fdb_dump(ds, dp->index, fdb, cb); - - return -EOPNOTSUPP; -} - static int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) -- cgit v1.2.3 From 3a9afea37e298f4989629553d44b9bf50dc46125 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:49 -0400 Subject: net: dsa: move MDB handlers Move the DSA port code which handles MDB objects in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 7 +++++++ net/dsa/port.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 41 ----------------------------------------- 3 files changed, 47 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d003a2554c7a..c2a595036746 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -77,6 +77,13 @@ int dsa_port_fdb_del(struct dsa_port *dp, const struct switchdev_obj_port_fdb *fdb); int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, switchdev_obj_dump_cb_t *cb); +int dsa_port_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans); +int dsa_port_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 18ec6d432152..4ed0124a8d4b 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -207,3 +207,43 @@ int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, return -EOPNOTSUPP; } + +int dsa_port_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_prepare(ds, dp->index, mdb, trans); + } + + ds->ops->port_mdb_add(ds, dp->index, mdb, trans); + + return 0; +} + +int dsa_port_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_mdb_del) + return ds->ops->port_mdb_del(ds, dp->index, mdb); + + return -EOPNOTSUPP; +} + +int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_mdb_dump) + return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb); + + return -EOPNOTSUPP; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d9b7bf759f44..9adcb8267d9a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -245,47 +245,6 @@ static int dsa_port_vlan_dump(struct dsa_port *dp, return -EOPNOTSUPP; } -static int dsa_port_mdb_add(struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) -{ - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_mdb_prepare(ds, dp->index, mdb, trans); - } - - ds->ops->port_mdb_add(ds, dp->index, mdb, trans); - - return 0; -} - -static int dsa_port_mdb_del(struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb) -{ - struct dsa_switch *ds = dp->ds; 
- - if (ds->ops->port_mdb_del) - return ds->ops->port_mdb_del(ds, dp->index, mdb); - - return -EOPNOTSUPP; -} - -static int dsa_port_mdb_dump(struct dsa_port *dp, - struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_mdb_dump) - return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb); - - return -EOPNOTSUPP; -} - static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); -- cgit v1.2.3 From 076e713365c9f4ca1ff3eca9122664a5359b94da Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:50 -0400 Subject: net: dsa: move VLAN handlers Move the DSA port code which handles VLAN objects in port.c, where it belongs. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 8 ++++++++ net/dsa/port.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 41 ----------------------------------------- 3 files changed, 49 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c2a595036746..16021a891095 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -84,6 +84,14 @@ int dsa_port_mdb_del(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, switchdev_obj_dump_cb_t *cb); +int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans); +int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan); +int dsa_port_vlan_dump(struct dsa_port *dp, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 4ed0124a8d4b..f211b0dfb12d 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -247,3 +247,44 @@ int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, return -EOPNOTSUPP; } + +int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans); + } + + ds->ops->port_vlan_add(ds, dp->index, vlan, trans); + + return 0; +} + +int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan) +{ + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_vlan_del) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_del(ds, dp->index, vlan); +} + +int dsa_port_vlan_dump(struct dsa_port *dp, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_vlan_dump) + return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb); + + return -EOPNOTSUPP; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9adcb8267d9a..887e26695519 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -204,47 +204,6 @@ out: return 0; } -static int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) -{ - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) - return -EOPNOTSUPP; - - return 
ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans); - } - - ds->ops->port_vlan_add(ds, dp->index, vlan, trans); - - return 0; -} - -static int dsa_port_vlan_del(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan) -{ - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->port_vlan_del) - return -EOPNOTSUPP; - - return ds->ops->port_vlan_del(ds, dp->index, vlan); -} - -static int dsa_port_vlan_dump(struct dsa_port *dp, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_vlan_dump) - return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb); - - return -EOPNOTSUPP; -} - static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); -- cgit v1.2.3 From 52c96f9d7003c74c7fbec7438c0ed78df0cc1c79 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:51 -0400 Subject: net: dsa: move notifier info to private header The DSA notifier events and info structure definitions are not meant for DSA drivers and users, but only used internally by the DSA core files. Move them from the public net/dsa.h file to the private dsa_priv.h file. Also use this opportunity to turn the events into an anonymous enum, because we don't care about the values, and this will prevent future conflicts when adding (and sorting) new events. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ---------- net/dsa/dsa_priv.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 791fed62fb16..c0e567c0c824 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -285,16 +285,6 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) return ds->rtable[dst->cpu_dp->ds->index]; } -#define DSA_NOTIFIER_BRIDGE_JOIN 1 -#define DSA_NOTIFIER_BRIDGE_LEAVE 2 - -/* DSA_NOTIFIER_BRIDGE_* */ -struct dsa_notifier_bridge_info { - struct net_device *br; - int sw_index; - int port; -}; - struct dsa_switch_ops { /* * Legacy probing. diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 16021a891095..c19241eb094b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -16,6 +16,18 @@ #include #include +enum { + DSA_NOTIFIER_BRIDGE_JOIN, + DSA_NOTIFIER_BRIDGE_LEAVE, +}; + +/* DSA_NOTIFIER_BRIDGE_* */ +struct dsa_notifier_bridge_info { + struct net_device *br; + int sw_index; + int port; +}; + struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 1faabf7440f17999f41973e91878c13ad9f080b2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:52 -0400 Subject: net: dsa: add notifier for ageing time This patch keeps the port-wide ageing time handling code in dsa_port_ageing_time, pushes the requested ageing time value in a new switch fabric notification, and moves the switch-wide ageing time handling code in dsa_switch_ageing_time. This has the effect that now not only the switch that the target port belongs to can be programmed, but all switches composing the switch fabric. For the moment, keep the current behavior and ignore other switches. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 8 ++++++++ net/dsa/port.c | 37 ++++++++----------------------------- net/dsa/switch.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c19241eb094b..becaf8a61b13 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,10 +17,18 @@ #include enum { + DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, DSA_NOTIFIER_BRIDGE_LEAVE, }; +/* DSA_NOTIFIER_AGEING_TIME */ +struct dsa_notifier_ageing_time_info { + struct switchdev_trans *trans; + unsigned int ageing_time; + int sw_index; +}; + /* DSA_NOTIFIER_BRIDGE_* */ struct dsa_notifier_bridge_info { struct net_device *br; diff --git a/net/dsa/port.c b/net/dsa/port.c index f211b0dfb12d..59328a35394d 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -128,44 +128,23 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return 0; } -static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, - unsigned int ageing_time) -{ - int i; - - for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; - - if (dp->ageing_time && dp->ageing_time < ageing_time) - ageing_time = dp->ageing_time; - } - - return ageing_time; -} - int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans) { unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - struct dsa_switch *ds = dp->ds; + struct dsa_notifier_ageing_time_info info = { + .ageing_time = ageing_time, + .sw_index = dp->ds->index, + .trans = trans, + }; - if (switchdev_trans_ph_prepare(trans)) { - if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) - return -ERANGE; - if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) - return -ERANGE; - return 0; - } + if (switchdev_trans_ph_prepare(trans)) + return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); - /* Keep the fastest ageing time in case of multiple bridges */ dp->ageing_time = ageing_time; - ageing_time = dsa_fastest_ageing_time(ds, ageing_time); - if (ds->ops->set_ageing_time) - return ds->ops->set_ageing_time(ds, ageing_time); - - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); } int dsa_port_fdb_add(struct dsa_port *dp, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index f477053308d2..540770ecc8b0 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -12,9 +12,52 @@ #include #include +#include #include "dsa_priv.h" +static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < ds->num_ports; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +static int dsa_switch_ageing_time(struct dsa_switch *ds, + struct dsa_notifier_ageing_time_info *info) +{ + unsigned int ageing_time = info->ageing_time; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) + return -ERANGE; + if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) + return -ERANGE; + return 0; + } + + /* Program the fastest ageing time in case of multiple bridges */ + ageing_time = dsa_switch_fastest_ageing_time(ds, 
ageing_time); + + if (ds->ops->set_ageing_time) + return ds->ops->set_ageing_time(ds, ageing_time); + + return 0; +} + static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { @@ -48,6 +91,9 @@ static int dsa_switch_event(struct notifier_block *nb, int err; switch (event) { + case DSA_NOTIFIER_AGEING_TIME: + err = dsa_switch_ageing_time(ds, info); + break; case DSA_NOTIFIER_BRIDGE_JOIN: err = dsa_switch_bridge_join(ds, info); break; -- cgit v1.2.3 From 685fb6a40ddace10a0bc8a680ab6ba65c6cdfdaf Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:53 -0400 Subject: net: dsa: add FDB notifier Add two new DSA_NOTIFIER_FDB_ADD and DSA_NOTIFIER_FDB_DEL events to notify not only a single switch, but all switches of a the fabric when an FDB entry is added or removed. For the moment, keep the current behavior and ignore other switches. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 10 ++++++++++ net/dsa/port.c | 29 +++++++++++++---------------- net/dsa/switch.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index becaf8a61b13..6a7d0d7d0489 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -20,6 +20,8 @@ enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, DSA_NOTIFIER_BRIDGE_LEAVE, + DSA_NOTIFIER_FDB_ADD, + DSA_NOTIFIER_FDB_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -36,6 +38,14 @@ struct dsa_notifier_bridge_info { int port; }; +/* DSA_NOTIFIER_FDB_* */ +struct dsa_notifier_fdb_info { + const struct switchdev_obj_port_fdb *fdb; + struct switchdev_trans *trans; + int sw_index; + int port; +}; + struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/port.c b/net/dsa/port.c index 59328a35394d..ed88d8381642 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -151,29 +151,26 @@ int dsa_port_fdb_add(struct dsa_port *dp, const struct switchdev_obj_port_fdb *fdb, struct switchdev_trans *trans) { - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_fdb_prepare(ds, dp->index, fdb, trans); - } - - ds->ops->port_fdb_add(ds, dp->index, fdb, trans); + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .fdb = fdb, + }; - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info); } int dsa_port_fdb_del(struct dsa_port *dp, const struct switchdev_obj_port_fdb *fdb) { - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_fdb_del) - return -EOPNOTSUPP; + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .fdb = fdb, + }; - return ds->ops->port_fdb_del(ds, dp->index, fdb); + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 540770ecc8b0..e71cc860d32c 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -84,6 +84,43 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, return 0; } +static int dsa_switch_fdb_add(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + const struct switchdev_obj_port_fdb *fdb = info->fdb; + struct 
switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_prepare(ds, info->port, fdb, trans); + } + + ds->ops->port_fdb_add(ds, info->port, fdb, trans); + + return 0; +} + +static int dsa_switch_fdb_del(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + const struct switchdev_obj_port_fdb *fdb = info->fdb; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_del(ds, info->port, fdb); +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -100,6 +137,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_BRIDGE_LEAVE: err = dsa_switch_bridge_leave(ds, info); break; + case DSA_NOTIFIER_FDB_ADD: + err = dsa_switch_fdb_add(ds, info); + break; + case DSA_NOTIFIER_FDB_DEL: + err = dsa_switch_fdb_del(ds, info); + break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From 8ae5bcdc5d98a99e59f194101e7acd2e9d055758 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:54 -0400 Subject: net: dsa: add MDB notifier Add two new DSA_NOTIFIER_MDB_ADD and DSA_NOTIFIER_MDB_DEL events to notify not only a single switch, but all switches of a the fabric when an MDB entry is added or removed. For the moment, keep the current behavior and ignore other switches. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 10 ++++++++++ net/dsa/port.c | 29 +++++++++++++---------------- net/dsa/switch.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6a7d0d7d0489..2b60293b325c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -22,6 +22,8 @@ enum { DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_MDB_ADD, + DSA_NOTIFIER_MDB_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -46,6 +48,14 @@ struct dsa_notifier_fdb_info { int port; }; +/* DSA_NOTIFIER_MDB_* */ +struct dsa_notifier_mdb_info { + const struct switchdev_obj_port_mdb *mdb; + struct switchdev_trans *trans; + int sw_index; + int port; +}; + struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/port.c b/net/dsa/port.c index ed88d8381642..c7c4920e7bc9 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -188,29 +188,26 @@ int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) { - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_mdb_prepare(ds, dp->index, mdb, trans); - } - - ds->ops->port_mdb_add(ds, dp->index, mdb, trans); + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .mdb = mdb, + }; - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info); } int dsa_port_mdb_del(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb) { - struct dsa_switch *ds = dp->ds; 
- - if (ds->ops->port_mdb_del) - return ds->ops->port_mdb_del(ds, dp->index, mdb); + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; - return -EOPNOTSUPP; + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); } int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index e71cc860d32c..b7e8e45869fc 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -121,6 +121,43 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, return ds->ops->port_fdb_del(ds, info->port, fdb); } +static int dsa_switch_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + const struct switchdev_obj_port_mdb *mdb = info->mdb; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_prepare(ds, info->port, mdb, trans); + } + + ds->ops->port_mdb_add(ds, info->port, mdb, trans); + + return 0; +} + +static int dsa_switch_mdb_del(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + const struct switchdev_obj_port_mdb *mdb = info->mdb; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_mdb_del) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_del(ds, info->port, mdb); +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -143,6 +180,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_MDB_ADD: + err = dsa_switch_mdb_add(ds, info); + break; + case DSA_NOTIFIER_MDB_DEL: + err = dsa_switch_mdb_del(ds, info); + break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From d0c627b8740ca6243054263fbc98981a36ac5618 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 19 May 2017 17:00:55 -0400 Subject: net: dsa: add VLAN notifier Add two new DSA_NOTIFIER_VLAN_ADD and DSA_NOTIFIER_VLAN_DEL events to notify not only a single switch, but all switches of a the fabric when an VLAN entry is added or removed. For the moment, keep the current behavior and ignore other switches. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 10 ++++++++++ net/dsa/port.c | 29 +++++++++++++---------------- net/dsa/switch.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2b60293b325c..1d52f9051d0e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -24,6 +24,8 @@ enum { DSA_NOTIFIER_FDB_DEL, DSA_NOTIFIER_MDB_ADD, DSA_NOTIFIER_MDB_DEL, + DSA_NOTIFIER_VLAN_ADD, + DSA_NOTIFIER_VLAN_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -56,6 +58,14 @@ struct dsa_notifier_mdb_info { int port; }; +/* DSA_NOTIFIER_VLAN_* */ +struct dsa_notifier_vlan_info { + const struct switchdev_obj_port_vlan *vlan; + struct switchdev_trans *trans; + int sw_index; + int port; +}; + struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/port.c b/net/dsa/port.c index c7c4920e7bc9..c88c0cec8454 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -225,29 +225,26 @@ int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) - return -EOPNOTSUPP; - - return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans); - } - - ds->ops->port_vlan_add(ds, dp->index, vlan, trans); + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .vlan = vlan, + }; - return 0; + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); } int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan) { - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->port_vlan_del) - return -EOPNOTSUPP; + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .vlan = vlan, + }; - return ds->ops->port_vlan_del(ds, dp->index, vlan); + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } int dsa_port_vlan_dump(struct dsa_port *dp, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b7e8e45869fc..c1e4b2d5a3ae 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -158,6 +158,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, return ds->ops->port_mdb_del(ds, info->port, mdb); } +static int dsa_switch_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + const struct switchdev_obj_port_vlan *vlan = info->vlan; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_prepare(ds, info->port, vlan, trans); + } + + ds->ops->port_vlan_add(ds, info->port, vlan, trans); + + return 0; +} + +static int dsa_switch_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + const struct switchdev_obj_port_vlan *vlan = info->vlan; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_vlan_del) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_del(ds, info->port, vlan); +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -186,6 +223,12 @@ static int dsa_switch_event(struct 
notifier_block *nb, case DSA_NOTIFIER_MDB_DEL: err = dsa_switch_mdb_del(ds, info); break; + case DSA_NOTIFIER_VLAN_ADD: + err = dsa_switch_vlan_add(ds, info); + break; + case DSA_NOTIFIER_VLAN_DEL: + err = dsa_switch_vlan_del(ds, info); + break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From 1b57b6210f4e52904393be97c62122aae69bc8aa Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 23 May 2017 09:58:07 +0100 Subject: cfg80211: make cfg80211_sched_scan_results() work from atomic context Drivers should be able to call cfg80211_sched_scan_results() from atomic context. However, with the introduction of multiple scheduled scan feature this requirement was not taken into account resulting in regression shown below. [ 119.021594] BUG: scheduling while atomic: irq/47-iwlwifi/517/0x00000200 [ 119.021604] Modules linked in: [...] [ 119.021759] CPU: 1 PID: 517 Comm: irq/47-iwlwifi Not tainted 4.12.0-rc2-t440s-20170522+ #1 [ 119.021763] Hardware name: LENOVO 20AQS03H00/20AQS03H00, BIOS GJET91WW (2.41 ) 09/21/2016 [ 119.021766] Call Trace: [ 119.021778] ? dump_stack+0x5c/0x84 [ 119.021784] ? __schedule_bug+0x4c/0x70 [ 119.021792] ? __schedule+0x496/0x5c0 [ 119.021798] ? schedule+0x2d/0x80 [ 119.021804] ? schedule_preempt_disabled+0x5/0x10 [ 119.021810] ? __mutex_lock.isra.0+0x18e/0x4c0 [ 119.021817] ? __wake_up+0x2f/0x50 [ 119.021833] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021844] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021859] ? iwl_mvm_rx_lmac_scan_iter_complete_notif+0x17/0x30 [iwlmvm] [ 119.021869] ? iwl_pcie_rx_handle+0x2a9/0x7e0 [iwlwifi] [ 119.021878] ? iwl_pcie_irq_handler+0x17c/0x730 [iwlwifi] [ 119.021884] ? irq_forced_thread_fn+0x60/0x60 [ 119.021887] ? irq_thread_fn+0x16/0x40 [ 119.021892] ? irq_thread+0x109/0x180 [ 119.021896] ? wake_threads_waitq+0x30/0x30 [ 119.021901] ? kthread+0xf2/0x130 [ 119.021905] ? irq_thread_dtor+0x90/0x90 [ 119.021910] ? kthread_create_on_node+0x40/0x40 [ 119.021915] ? ret_from_fork+0x26/0x40 Fixes: b34939b98369 ("cfg80211: add request id to cfg80211_sched_scan_*() api") Reported-by: Sander Eikelenboom Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- net/wireless/scan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 14d5f0c8c45f..9f0901f3e42b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -322,9 +322,9 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; - ASSERT_RTNL(); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list) { if (pos->reqid == reqid) return pos; } @@ -398,13 +398,13 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ - rtnl_lock(); + rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } - rtnl_unlock(); + rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); -- cgit v1.2.3 From 31efcc250a1dea96edca6595a9639d898cf99ae5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 20 May 2017 15:01:31 +0200 Subject: net/sched: properly assign RCU pointer in tcf_chain_tp_insert/remove *p_filter_chain is rcu-dereferenced on reader path. 
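For background, a minimal generic sketch of the pairing this refers to (illustrative names only, not the actual tc filter-chain data structures): a pointer that readers fetch with rcu_dereference() must be published by writers with rcu_assign_pointer(), which orders the object's initialization before the pointer store; RCU_INIT_POINTER() omits that barrier and is reserved for cases where no such ordering is needed, e.g. storing NULL or an already-published object.

#include <linux/rcupdate.h>

/* Illustrative only -- generic names, not the tc code below. */
struct foo {
	int a;
};

static struct foo __rcu *gp;

static void writer_publish(struct foo *p)
{
	p->a = 1;
	rcu_assign_pointer(gp, p);	/* init above is visible before the pointer */
}

static int reader_get_a(void)
{
	struct foo *p;
	int ret = 0;

	rcu_read_lock();
	p = rcu_dereference(gp);	/* pairs with rcu_assign_pointer() */
	if (p)
		ret = p->a;
	rcu_read_unlock();
	return ret;
}
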
So here in the writer, properly assign the pointer. Fixes: 2190d1d0944f ("net: sched: introduce helpers to work with filter chains") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4020b8d932a1..85088ed07f6a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,7 +351,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, { if (chain->p_filter_chain && *chain_info->pprev == chain->filter_chain) - *chain->p_filter_chain = tp; + rcu_assign_pointer(*chain->p_filter_chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); } @@ -363,7 +363,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_proto *next = rtnl_dereference(chain_info->next); if (chain->p_filter_chain && tp == chain->filter_chain) - *chain->p_filter_chain = next; + RCU_INIT_POINTER(*chain->p_filter_chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); } -- cgit v1.2.3 From f93e1cdcf42c1218e2a73be477d8ac21135e7f56 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 20 May 2017 15:01:32 +0200 Subject: net/sched: fix filter flushing When the user instructs us to remove all filters from a chain, we cannot destroy the chain, as other actions may hold a reference. Also, the put in errout would try to destroy it again. So instead, just walk the chain and remove all existing filters. Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 85088ed07f6a..01a8b8b4bab8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -201,15 +201,22 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, return chain; } -static void tcf_chain_destroy(struct tcf_chain *chain) +static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - list_del(&chain->list); + if (*chain->p_filter_chain) + RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); } +} + +static void tcf_chain_destroy(struct tcf_chain *chain) +{ + list_del(&chain->list); + tcf_chain_flush(chain); kfree(chain); } @@ -510,7 +517,7 @@ replay: if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); - tcf_chain_destroy(chain); + tcf_chain_flush(chain); err = 0; goto errout; } -- cgit v1.2.3 From 8fafda77762df659e578e349c4a67196d94957dc Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Tue, 23 May 2017 13:21:05 +0800 Subject: net: ieee802154: remove explicit set skb->sk Explicitly setting skb->sk is needless; sock_alloc_send_skb() already sets it.
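For reference, a rough sketch of why the explicit assignment is redundant (simplified; the real helper goes through sock_alloc_send_pskb(), but the ownership step is the relevant part): the allocator charges the skb to the socket with skb_set_owner_w(), which already fills in skb->sk along with the destructor and write-memory accounting.

/* Simplified sketch, not the exact kernel implementation. */
static struct sk_buff *alloc_send_skb_sketch(struct sock *sk, unsigned int size)
{
	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);

	if (skb)
		skb_set_owner_w(skb, sk);	/* sets skb->sk, destructor, wmem */
	return skb;
}
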
Signed-off-by: Lin Zhang Acked-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index eedba7670b51..b01a1f04626b 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -301,7 +301,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out_skb; skb->dev = dev; - skb->sk = sk; skb->protocol = htons(ETH_P_IEEE802154); dev_put(dev); @@ -690,7 +689,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out_skb; skb->dev = dev; - skb->sk = sk; skb->protocol = htons(ETH_P_IEEE802154); dev_put(dev); -- cgit v1.2.3 From a611c58b3d42a92e6b23423e166dd17c0c7fffce Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Tue, 23 May 2017 13:29:39 +0800 Subject: net: ieee802154: fix net_device reference release too early This patch fixes the kernel oops when release net_device reference in advance. In function raw_sendmsg(i think the dgram_sendmsg has the same problem), there is a race condition between dev_put and dev_queue_xmit when the device is gong that maybe lead to dev_queue_ximt to see an illegal net_device pointer. My test kernel is 3.13.0-32 and because i am not have a real 802154 device, so i change lowpan_newlink function to this: /* find and hold real wpan device */ real_dev = dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; // if (real_dev->type != ARPHRD_IEEE802154) { // dev_put(real_dev); // return -EINVAL; // } lowpan_dev_info(dev)->real_dev = real_dev; lowpan_dev_info(dev)->fragment_tag = 0; mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); Also, in order to simulate preempt, i change the raw_sendmsg function to this: skb->dev = dev; skb->sk = sk; skb->protocol = htons(ETH_P_IEEE802154); dev_put(dev); //simulate preempt schedule_timeout_uninterruptible(30 * HZ); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); and this is my userspace test code named test_send_data: int main(int argc, char **argv) { char buf[127]; int sockfd; sockfd = socket(AF_IEEE802154, SOCK_RAW, 0); if (sockfd < 0) { printf("create sockfd error: %s\n", strerror(errno)); return -1; } send(sockfd, buf, sizeof(buf), 0); return 0; } This is my test case: root@zhanglin-x-computer:~/develop/802154# uname -a Linux zhanglin-x-computer 3.13.0-32-generic #57-Ubuntu SMP Tue Jul 15 03:51:08 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux root@zhanglin-x-computer:~/develop/802154# ip link add link eth0 name lowpan0 type lowpan root@zhanglin-x-computer:~/develop/802154# //keep the lowpan0 device down root@zhanglin-x-computer:~/develop/802154# ./test_send_data & //wait a while root@zhanglin-x-computer:~/develop/802154# ip link del link dev lowpan0 //the device is gone //oops [381.303307] general protection fault: 0000 [#1]SMP [381.303407] Modules linked in: af_802154 6lowpan bnep rfcomm bluetooth nls_iso8859_1 snd_hda_codec_hdmi snd_hda_codec_realtek rts5139(C) snd_hda_intel snd_had_codec snd_hwdep snd_pcm snd_page_alloc snd_seq_midi snd_seq_midi_event snd_rawmidi snd_req intel_rapl snd_seq_device coretemp i915 kvm_intel kvm snd_timer snd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel cypted drm_kms_helper drm i2c_algo_bit soundcore video mac_hid parport_pc ppdev ip parport hid_generic usbhid hid ahci r8169 mii libahdi [381.304286] CPU:1 PID: 2524 Commm: 1 Tainted: G C 0 3.13.0-32-generic [381.304409] Hardware name: Haier Haier DT Computer/Haier DT Codputer, BIOS 
FIBT19H02_X64 06/09/2014 [381.304546] tasks: ffff000096965fc0 ti: ffffB0013779c000 task.ti: ffffB8013779c000 [381.304659] RIP: 0010:[] [] __dev_queue_ximt+0x61/0x500 [381.304798] RSP: 0018:ffffB8013779dca0 EFLAGS: 00010202 [381.304880] RAX: 272b031d57565351 RBX: 0000000000000000 RCX: ffff8800968f1a00 [381.304987] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800968f1a00 [381.305095] RBP: ffff8e013773dce0 R08: 0000000000000266 R09: 0000000000000004 [381.305202] R10: 0000000000000004 R11: 0000000000000005 R12: ffff88013902e000 [381.305310] R13: 000000000000007f R14: 000000000000007f R15: ffff8800968f1a00 [381.305418] FS: 00007fc57f50f740(0000) GS: ffff88013fc80000(0000) knlGS: 0000000000000000 [381.305540] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [381.305627] CR2: 00007fad0841c000 CR3: 00000001368dd000 CR4: 00000000001007e0 [361.905734] Stack: [381.305768] 00000000002052d0 000000003facb30a ffff88013779dcc0 ffff880137764000 [381.305898] ffff88013779de70 000000000000007f 000000000000007f ffff88013902e000 [381.306026] ffff88013779dcf0 ffffffff81622490 ffff88013779dd39 ffffffffa03af9f1 [381.306155] Call Trace: [381.306202] [] dev_queue_xmit+0x10/0x20 [381.306294] [] raw_sendmsg+0x1b1/0x270 [af_802154] [381.306396] [] ieee802154_sock_sendmsg+0x14/0x20 [af_802154] [381.306512] [] sock_sendmsg+0x8b/0xc0 [381.306600] [] ? __d_alloc+0x25/0x180 [381.306687] [] ? kmem_cache_alloc_trace+0x1c6/0x1f0 [381.306791] [] SYSC_sendto+0x121/0x1c0 [381.306878] [] ? vtime_account_user+x54/0x60 [381.306975] [] ? syscall_trace_enter+0x145/0x250 [381.307073] [] SyS_sendto+0xe/0x10 [381.307156] [] tracesys+0xe1/0xe6 [381.307233] Code: c6 a1 a4 ff 41 8b 57 78 49 8b 47 20 85 d2 48 8b 80 78 07 00 00 75 21 49 8b 57 18 48 85 d2 74 18 48 85 c0 74 13 8b 92 ac 01 00 00 <3b> 50 10 73 08 8b 44 90 14 41 89 47 78 41 f6 84 24 d5 00 00 00 [381.307801] RIP [] _dev_queue_xmit+0x61/0x500 [381.307901] RSP [381.347512] Kernel panic - not syncing: Fatal exception in interrupt [381.347747] drm_kms_helper: panic occurred, switching back to text console In my opinion, there is always exist a chance that the device is gong before call dev_queue_xmit. I think the latest kernel is have the same problem and that dev_put should be behind of the dev_queue_xmit. 
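Condensed, the race being described is the following (an illustrative timeline, not the literal call chain):

/*
 * sender (raw_sendmsg)              deleter (ip link del)
 * --------------------              ---------------------
 * dev = dev_get_by_index(...);
 * skb->dev = dev;
 * dev_put(dev);                     unregister_netdevice(dev);
 *                                   ... last reference gone, the
 *                                       device may now be torn down ...
 * dev_queue_xmit(skb);              <-- can dereference a stale net_device
 */

Holding the reference across dev_queue_xmit() and calling dev_put() only afterwards closes the window, which is what the hunks below do for both raw_sendmsg() and dgram_sendmsg().
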
Signed-off-by: Lin Zhang Acked-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b01a1f04626b..a60658c85a9a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -303,12 +303,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); - dev_put(dev); - err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); + dev_put(dev); + return err ?: size; out_skb: @@ -691,12 +691,12 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); - dev_put(dev); - err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); + dev_put(dev); + return err ?: size; out_skb: -- cgit v1.2.3 From 6f4dbd149d2a151b89d1a5bbf7530ee5546c7908 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:33:16 +0200 Subject: libceph: use kbasename() and kill ceph_file_part() Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_debug.h | 6 +++--- net/ceph/ceph_common.c | 13 ------------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index aa2e19182d99..51c5bd64bd00 100644 --- a/include/linux/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,6 +3,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* @@ -12,12 +14,10 @@ */ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ pr_debug("%.*s %12.12s:%-4d : " fmt, \ 8 - (int)sizeof(KBUILD_MODNAME), " ", \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) + kbasename(__FILE__), __LINE__, ##__VA_ARGS__) # else /* faux printk call just to see any compiler warnings. */ # define dout(fmt, ...) do { \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4fd02831beed..47e94b560ba0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -56,19 +56,6 @@ static const struct kernel_param_ops param_ops_supported_features = { module_param_cb(supported_features, ¶m_ops_supported_features, NULL, S_IRUGO); -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} -EXPORT_SYMBOL(ceph_file_part); - const char *ceph_msg_type_name(int type) { switch (type) { -- cgit v1.2.3 From 1759f7b0e3fab1d1882d7c680af9d12c5c111b0e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:38:17 +0200 Subject: libceph: make ceph_msg_data_advance() return void Both callers ignore the returned bool. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5766a6c896c4..d7ab481b2508 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1174,8 +1174,8 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. 
*/ -static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) { bool new_piece; @@ -1207,8 +1207,6 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, new_piece = true; } cursor->need_crc = new_piece; - - return new_piece; } static size_t sizeof_footer(struct ceph_connection *con) @@ -1577,7 +1575,6 @@ static int write_partial_message_data(struct ceph_connection *con) size_t page_offset; size_t length; bool last_piece; - bool need_crc; int ret; page = ceph_msg_data_next(cursor, &page_offset, &length, @@ -1592,7 +1589,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2299,7 +2296,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; -- cgit v1.2.3 From f3b4e55ded9b3c52831a7d2ab9e511293c99fc11 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:59:22 +0200 Subject: libceph: drop version variable from ceph_monmap_decode() It's set but not used: CEPH_FEATURE_MONNAMES feature bit isn't advertised, which guarantees a v1 MonMap. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/mon_client.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 29a0ef351c5e..250f11f78609 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -43,15 +43,13 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) int i, err = -EINVAL; struct ceph_fsid fsid; u32 epoch, num_mon; - u16 version; u32 len; ceph_decode_32_safe(&p, end, len, bad); ceph_decode_need(&p, end, len, bad); dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); - - ceph_decode_16_safe(&p, end, version, bad); + p += sizeof(u16); /* skip version */ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); -- cgit v1.2.3 From d18a1247c4070390fc0c2d83d89a72afe921882e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 12:21:56 +0200 Subject: libceph: validate blob_struct_v in process_one_ticket() None of these are validated in userspace, but since we do validate reply_struct_v in ceph_x_proc_ticket_reply(), tkt_struct_v (first) and CephXServiceTicket struct_v (second) in process_one_ticket(), validate CephXTicketBlob struct_v as well. 
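The check follows the usual pattern for decoding a versioned blob from an untrusted buffer; a minimal sketch with a made-up function name (not the exact auth_x code):

static int decode_versioned_blob_sketch(void **p, void *end, u64 *secret_id)
{
	u8 struct_v;

	/* bounds-check before reading, refuse struct versions we cannot parse */
	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
	struct_v = ceph_decode_8(p);
	if (struct_v != 1)
		goto bad;
	*secret_id = ceph_decode_64(p);
	return 0;
bad:
	return -EINVAL;
}
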
Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 2034fb926670..d0126df33f1f 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -215,6 +215,9 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad); blob_struct_v = ceph_decode_8(ptp); + if (blob_struct_v != 1) + goto bad; + new_secret_id = ceph_decode_64(ptp); ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend); if (ret) -- cgit v1.2.3 From b51456a6096ebf9f4ceb2cc7e176b471d4b70af0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 14:24:36 +0200 Subject: libceph: fix error handling in process_one_ticket() Don't leak key internals after new_session_key is populated. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index d0126df33f1f..8757fb87dab8 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -151,7 +151,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, struct timespec validity; void *tp, *tpend; void **ptp; - struct ceph_crypto_key new_session_key; + struct ceph_crypto_key new_session_key = { 0 }; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; u64 new_secret_id; @@ -237,13 +237,13 @@ static int process_one_ticket(struct ceph_auth_client *ac, type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); xi->have_keys |= th->service; - -out: - return ret; + return 0; bad: ret = -EINVAL; - goto out; +out: + ceph_crypto_key_destroy(&new_session_key); + return ret; } static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, -- cgit v1.2.3 From 293dffaad8d500e1a5336eeb90d544cf40d4fbd8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 17:25:10 +0300 Subject: libceph: NULL deref on crush_decode() error path If there is not enough space then ceph_decode_32_safe() does a goto bad. We need to return an error code in that situation. The current code returns ERR_PTR(0) which is NULL. The callers are not expecting that and it results in a NULL dereference. Fixes: f24e9980eb86 ("ceph: OSD client") Signed-off-by: Dan Carpenter Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ffe9e904d4d1..55e3a477f92d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -317,6 +317,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) u32 yes; struct crush_rule *r; + err = -EINVAL; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", -- cgit v1.2.3 From f3c0eb05e258c6a48c2d1ef2fa71ffb6ff63cd18 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 18 May 2017 18:01:43 +0200 Subject: netfilter: conntrack: fix false CRC32c mismatch using paged skb sctp_compute_cksum() implementation assumes that at least the SCTP header is in the linear part of skb: modify conntrack error callback to avoid false CRC32c mismatch, if the transport header is partially/entirely paged. 
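The underlying issue, in brief (an illustrative note on the old code path; the description of sctp_compute_cksum()'s internals is paraphrased, not quoted):

/*
 * The old code took the header through a potentially on-stack copy:
 *
 *	struct sctphdr _sctph;
 *	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
 *
 * but sctp_compute_cksum(skb, dataoff) assumes the SCTP header sits at
 * skb->data + dataoff in the linear area (it temporarily clears the
 * checksum field there).  If the header actually lives in a page
 * fragment, the two sides look at different bytes and a correct packet
 * can be flagged as a CRC mismatch.  Pulling the header into the
 * writable linear area first -- skb_make_writable() in the hunk below --
 * makes the comparison consistent.
 */
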
Fixes: cf6e007eef83 ("netfilter: conntrack: validate SCTP crc32c in PREROUTING") Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 13875d599a85..1c5b14a6cab3 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -512,16 +512,19 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, u8 pf, unsigned int hooknum) { const struct sctphdr *sh; - struct sctphdr _sctph; const char *logmsg; - sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); - if (!sh) { + if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && skb->ip_summed == CHECKSUM_NONE) { + if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + logmsg = "nf_ct_sctp: failed to read header "; + goto out_invalid; + } + sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; -- cgit v1.2.3 From d2df92e98a34a5619dadd29c6291113c009181e7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 May 2017 00:37:10 +0200 Subject: netfilter: nft_set_rbtree: handle element re-addition after deletion The existing code selects no next branch to be inspected when re-inserting an inactive element into the rb-tree, looping endlessly. This patch restricts the check for active elements to the EEXIST case only. Fixes: e701001e7cbe ("netfilter: nft_rbtree: allow adjacent intervals with dynamic updates") Reported-by: Wolfgang Bumiller Tested-by: Wolfgang Bumiller Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e97e2fb53f0a..fbdbaa00dd5f 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -116,17 +116,17 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) { - if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) - p = &parent->rb_left; - else if (!nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_end(new)) - p = &parent->rb_right; - else { - *ext = &rbe->ext; - return -EEXIST; - } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) { + p = &parent->rb_left; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) { + p = &parent->rb_right; + } else if (nft_set_elem_active(&rbe->ext, genmask)) { + *ext = &rbe->ext; + return -EEXIST; + } else { + p = &parent->rb_left; } } } -- cgit v1.2.3 From 124dffea9e8e372509e055aebd118e85518fd644 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 21 May 2017 22:38:11 +0800 Subject: netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT We need to clear the IPS_SRC_NAT_DONE_BIT to indicate that the ct has been removed from nat_bysource table. But unfortunately, we use the non-atomic bit operation: "ct->status &= ~IPS_NAT_DONE_MASK". So there's a race condition that we may clear the _DYING_BIT set by another CPU unexpectedly. 
Since we don't care about the IPS_DST_NAT_DONE_BIT, so just using clear_bit to clear the IPS_SRC_NAT_DONE_BIT is enough. Also note, this is the last user which use the non-atomic bit operation to update the confirmed ct->status. Reported-by: Florian Westphal Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ef0be325a0c6..6c72922d20ca 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -566,7 +566,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - ct->status &= ~IPS_NAT_DONE_MASK; + clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, nf_nat_bysource_params); -- cgit v1.2.3 From fefa92679dbe0c613e62b6c27235dcfbe9640ad1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 21 May 2017 07:22:49 +0800 Subject: netfilter: ctnetlink: fix incorrect nf_ct_put during hash resize If nf_conntrack_htable_size was adjusted by the user during the ct dump operation, we may invoke nf_ct_put twice for the same ct, i.e. the "last" ct. This will cause the ct will be freed but still linked in hash buckets. It's very easy to reproduce the problem by the following commands: # while : ; do echo $RANDOM > /proc/sys/net/netfilter/nf_conntrack_buckets done # while : ; do conntrack -L done # iperf -s 127.0.0.1 & # iperf -c 127.0.0.1 -P 60 -t 36000 After a while, the system will hang like this: NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [bash:20184] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [iperf:20382] ... So at last if we find cb->args[1] is equal to "last", this means hash resize happened, then we can set cb->args[1] to 0 to fix the above issue. Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9799a50bc604..a8be9b72e6cd 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -890,8 +890,13 @@ restart: } out: local_bh_enable(); - if (last) + if (last) { + /* nf ct hash resize happened, now clear the leftover. */ + if ((struct nf_conn *)cb->args[1] == last) + cb->args[1] = 0; + nf_ct_put(last); + } while (i) { i--; -- cgit v1.2.3 From d2c23c0075d7091bf749411bd2ee757cf4ec356c Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 22 May 2017 22:18:28 +0200 Subject: xprtrdma: Delete an error message for a failed memory allocation in xprt_rdma_bc_setup() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. 
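The rule being applied, roughly (a sketch with made-up names, not the hunk itself): a failed GFP_KERNEL slab allocation already logs a warning with a backtrace unless __GFP_NOWARN is set, so a caller-side error message only duplicates information.

/* Sketch: the allocator's own failure warning makes this pr_err() redundant. */
ptr = kzalloc(size, GFP_KERNEL);
if (!ptr) {
	pr_err("Failed to allocate ...\n");	/* drop this */
	return -ENOMEM;
}
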
Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Signed-off-by: Markus Elfring Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/backchannel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 24fedd4b117e..03f6b5840764 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -119,11 +119,9 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) for (i = 0; i < (reqs << 1); i++) { rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); - if (!rqst) { - pr_err("RPC: %s: Failed to create bc rpc_rqst\n", - __func__); + if (!rqst) goto out_free; - } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; -- cgit v1.2.3 From 0a2ad541071f99eaf4589c3551176fca191c1ee2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 May 2017 18:47:37 +0800 Subject: libceph: cleanup old messages according to reconnect seq when reopen a connection, use 'reconnect seq' to clean up messages that have already been received by peer. Link: http://tracker.ceph.com/issues/18690 Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d7ab481b2508..588a91930051 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2228,10 +2228,18 @@ static void process_ack(struct ceph_connection *con) struct ceph_msg *m; u64 ack = le64_to_cpu(con->in_temp_ack); u64 seq; + bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); + struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - while (!list_empty(&con->out_sent)) { - m = list_first_entry(&con->out_sent, struct ceph_msg, - list_head); + /* + * In the reconnect case, con_fault() has requeued messages + * in out_sent. We should cleanup old messages according to + * the reconnect seq. + */ + while (!list_empty(list)) { + m = list_first_entry(list, struct ceph_msg, list_head); + if (reconnect && m->needs_out_seq) + break; seq = le64_to_cpu(m->hdr.seq); if (seq > ack) break; @@ -2240,6 +2248,7 @@ static void process_ack(struct ceph_connection *con) m->ack_stamp = jiffies; ceph_msg_remove(m); } + prepare_read_tag(con); } -- cgit v1.2.3 From 3ab2137915aea0ce7b3ec02e0f260ecc0f1c289d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:54 +0800 Subject: sctp: fix stream update when processing dupcookie Since commit 3dbcc105d556 ("sctp: alloc stream info when initializing asoc"), stream and stream.out info are always alloced when creating an asoc. So it's not correct to check !asoc->stream before updating stream info when processing dupcookie, but would be better to check asoc state instead. Fixes: 3dbcc105d556 ("sctp: alloc stream info when initializing asoc") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/associola.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a9708da28eb5..95238284c422 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->stream) { + + if (sctp_state(asoc, COOKIE_WAIT)) { + sctp_stream_free(asoc->stream); asoc->stream = new->stream; new->stream = NULL; } -- cgit v1.2.3 From 7e06297768886337707f5833942b3bd524a6d3d5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:55 +0800 Subject: sctp: set new_asoc temp when processing dupcookie After sctp changed to use transport hashtable, a transport would be added into global hashtable when adding the peer to an asoc, then the asoc can be got by searching the transport in the hashtbale. The problem is when processing dupcookie in sctp_sf_do_5_2_4_dupcook, a new asoc would be created. A peer with the same addr and port as the one in the old asoc might be added into the new asoc, but fail to be added into the hashtable, as they also belong to the same sk. It causes that sctp's dupcookie processing can not really work. Since the new asoc will be freed after copying it's information to the old asoc, it's more like a temp asoc. So this patch is to fix it by setting it as a temp asoc to avoid adding it's any transport into the hashtable and also avoid allocing assoc_id. An extra thing it has to do is to also alloc stream info for any temp asoc, as sctp dupcookie process needs it to update old asoc. But I don't think it would hurt something, as a temp asoc would always be freed after finishing processing cookie echo packet. Reported-by: Jianwen Ji Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 13 ++++--------- net/sctp/sm_statefuns.c | 3 +++ 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8a08f13469c4..92e332e17391 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * stream sequence number shall be set to 0. */ - /* Allocate storage for the negotiated streams if it is not a temporary - * association. - */ - if (!asoc->temp) { - if (sctp_stream_init(asoc, gfp)) - goto clean_up; + if (sctp_stream_init(asoc, gfp)) + goto clean_up; - if (sctp_assoc_set_id(asoc, gfp)) - goto clean_up; - } + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) + goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4f5e6cfc7f60..f863b5573e42 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } } + /* Set temp so that it won't be added into hashtable */ + new_asoc->temp = 1; + /* Compare the tie_tag in cookie with the verification tag of * current association. 
*/ -- cgit v1.2.3 From 0ff50e83b5122e836ca492fefb11656b225ac29c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 23 May 2017 13:20:28 +0200 Subject: net: rtnetlink: bail out from rtnl_fdb_dump() on parse error rtnl_fdb_dump() failed to check the result of nlmsg_parse(), which led to contents of |ifm| being uninitialized because nlh->nlmsglen was too small to accommodate |ifm|. The uninitialized data may affect some branches and result in unwanted effects, although kernel data doesn't seem to leak to the userspace directly. The bug has been detected with KMSAN and syzkaller. For the record, here is the KMSAN report: ================================================================== BUG: KMSAN: use of unitialized memory in rtnl_fdb_dump+0x5dc/0x1000 CPU: 0 PID: 1039 Comm: probe Not tainted 4.11.0-rc5+ #2727 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x143/0x1b0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:1007 __kmsan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:491 rtnl_fdb_dump+0x5dc/0x1000 net/core/rtnetlink.c:3230 netlink_dump+0x84f/0x1190 net/netlink/af_netlink.c:2168 __netlink_dump_start+0xc97/0xe50 net/netlink/af_netlink.c:2258 netlink_dump_start ./include/linux/netlink.h:165 rtnetlink_rcv_msg+0xae9/0xb40 net/core/rtnetlink.c:4094 netlink_rcv_skb+0x339/0x5a0 net/netlink/af_netlink.c:2339 rtnetlink_rcv+0x83/0xa0 net/core/rtnetlink.c:4110 netlink_unicast_kernel net/netlink/af_netlink.c:1272 netlink_unicast+0x13b7/0x1480 net/netlink/af_netlink.c:1298 netlink_sendmsg+0x10b8/0x10f0 net/netlink/af_netlink.c:1844 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x401300 RSP: 002b:00007ffc3b0e6d58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000401300 RDX: 0000000000000000 RSI: 00007ffc3b0e6d80 RDI: 0000000000000003 RBP: 00007ffc3b0e6e00 R08: 000000000000000b R09: 0000000000000004 R10: 000000000000000d R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004065a0 R14: 0000000000406630 R15: 0000000000000000 origin: 000000008fe00056 save_stack_trace+0x59/0x60 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:352 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:247 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:260 slab_alloc_node mm/slub.c:2743 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4349 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 netlink_alloc_large_skb net/netlink/af_netlink.c:1144 netlink_sendmsg+0x934/0x10f0 net/netlink/af_netlink.c:1819 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== and the reproducer: ================================================================== #include #include #include #include int main() { int sock = 
socket(PF_NETLINK, SOCK_DGRAM | SOCK_NONBLOCK, 0); struct msghdr msg; memset(&msg, 0, sizeof(msg)); char nlmsg_buf[32]; memset(nlmsg_buf, 0, sizeof(nlmsg_buf)); struct nlmsghdr *nlmsg = nlmsg_buf; nlmsg->nlmsg_len = 0x11; nlmsg->nlmsg_type = 0x1e; // RTM_NEWROUTE = RTM_BASE + 0x0e // type = 0x0e = 1110b // kind = 2 nlmsg->nlmsg_flags = 0x101; // NLM_F_ROOT | NLM_F_REQUEST nlmsg->nlmsg_seq = 0; nlmsg->nlmsg_pid = 0; nlmsg_buf[16] = (char)7; struct iovec iov; iov.iov_base = nlmsg_buf; iov.iov_len = 17; msg.msg_iov = &iov; msg.msg_iovlen = 1; sendmsg(sock, &msg, 0); return 0; } ================================================================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49a279a7cc15..9e2c0a7cb325 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3231,8 +3231,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL) == 0) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } -- cgit v1.2.3 From ac4bb5de27010e41f027c635dedca1393e7ebf55 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 23 May 2017 18:40:44 +0200 Subject: net: flow_dissector: add support for dissection of tcp flags Add support for dissection of tcp flags. Uses similar function call to tcp dissection function as arp, mpls and others. Signed-off-by: Jiri Pirko Acked-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 9 +++++++++ net/core/flow_dissector.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8d21d448daa9..efe34eec61dc 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -157,6 +157,14 @@ struct flow_dissector_key_eth_addrs { unsigned char src[ETH_ALEN]; }; +/** + * struct flow_dissector_key_tcp: + * @flags: flags + */ +struct flow_dissector_key_tcp { + __be16 flags; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ @@ -177,6 +185,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ + FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 28d94bce4df8..5a45943081f5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -342,6 +343,30 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, return FLOW_DISSECT_RET_OUT_PROTO_AGAIN; } +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int thoff, int hlen) +{ + struct flow_dissector_key_tcp *key_tcp; + struct tcphdr *th, _th; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) + return; + + th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); + if (!th) + return; + + if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) + return; + + key_tcp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TCP, + target_container); + key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -683,6 +708,10 @@ ip_proto_again: case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); goto mpls; + case IPPROTO_TCP: + __skb_flow_dissect_tcp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } -- cgit v1.2.3 From fdfc7dd6ca39b117c709dceee8d32ac4447294d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 23 May 2017 18:40:45 +0200 Subject: net/sched: flower: add support for matching on tcp flags Benefit from the support of tcp flags dissection and allow user to insert rules matching on tcp flags. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1b9aa9e6b4fd..c6e8cf5e9c40 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -451,6 +451,9 @@ enum { TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ca526c0881bd..fb74a47830f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -49,6 +49,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; + struct flow_dissector_key_tcp tcp; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -424,6 +425,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -596,6 +599,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); + fl_set_key_val(tb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, @@ -766,6 +772,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1215,7 +1223,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, - sizeof(key->tp.dst)))) + sizeof(key->tp.dst)) || + fl_dump_key_val(skb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, -- cgit v1.2.3 From 64dba236a15770af1e07f90f23a54789c9f9a3ba Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 May 2017 15:20:59 -0400 Subject: net: dsa: support cross-chip ageing time Now that the switchdev bridge ageing time attribute is propagated to all switch chips of the fabric, each switch can check if the requested value is valid and program itself, so that the whole fabric shares a common ageing time setting. This is especially needed for switch chips in between others, containing no bridge port members but evidently used in the data path. To achieve that, remove the condition which skips the other switches. 
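To make the cross-chip flow concrete, here is a minimal sketch of the two-phase, all-switches handling described above. The sw_sketch/fabric_ageing_time_sketch names, types and bound fields are simplified stand-ins invented for illustration only; the authoritative change is the dsa_switch_ageing_time() hunk in the diff below.

/* Illustration only: simplified stand-ins for the DSA/switchdev types. */
#include <errno.h>
#include <stddef.h>

struct sw_sketch {
	unsigned int ageing_time_min;	/* 0 = no lower bound */
	unsigned int ageing_time_max;	/* 0 = no upper bound */
	int (*set_ageing_time)(struct sw_sketch *sw, unsigned int msecs);
};

/* Called for *every* switch of the fabric now -- no sw_index check. */
static int sw_ageing_time_sketch(struct sw_sketch *sw, unsigned int msecs,
				 int prepare)
{
	if (prepare) {
		if (sw->ageing_time_min && msecs < sw->ageing_time_min)
			return -ERANGE;
		if (sw->ageing_time_max && msecs > sw->ageing_time_max)
			return -ERANGE;
		return 0;
	}
	return sw->set_ageing_time ? sw->set_ageing_time(sw, msecs) : 0;
}

/* Two-phase notification: every switch validates in the prepare phase,
 * then every switch programs itself in the commit phase, so the whole
 * fabric ends up sharing one ageing time. */
static int fabric_ageing_time_sketch(struct sw_sketch **fabric, size_t n,
				     unsigned int msecs)
{
	size_t i;
	int err;

	for (i = 0; i < n; i++) {		/* prepare phase */
		err = sw_ageing_time_sketch(fabric[i], msecs, 1);
		if (err)
			return err;
	}
	for (i = 0; i < n; i++)			/* commit phase */
		sw_ageing_time_sketch(fabric[i], msecs, 0);
	return 0;
}
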
We also don't need to identify the target switch anymore, thus remove the sw_index member of the dsa_notifier_ageing_time_info notifier structure. On ZII Dev Rev B (with two 88E6352 and one 88E6185) and ZII Dev Rev C (with two 88E6390X), we have the following hardware configuration: # ip link add name br0 type bridge # ip link set master br0 dev lan6 br0: port 1(lan6) entered blocking state br0: port 1(lan6) entered disabled state # echo 2000 > /sys/class/net/br0/bridge/ageing_time Before this patch: zii-rev-b# cat /sys/kernel/debug/mv88e6xxx/sw*/age_time 300000 300000 15000 zii-rev-c# cat /sys/kernel/debug/mv88e6xxx/sw*/age_time 300000 18750 After this patch: zii-rev-b# cat /sys/kernel/debug/mv88e6xxx/sw*/age_time 15000 15000 15000 zii-rev-c# cat /sys/kernel/debug/mv88e6xxx/sw*/age_time 18750 18750 Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 - net/dsa/port.c | 1 - net/dsa/switch.c | 4 ---- 3 files changed, 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1d52f9051d0e..c1d4180651af 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -32,7 +32,6 @@ enum { struct dsa_notifier_ageing_time_info { struct switchdev_trans *trans; unsigned int ageing_time; - int sw_index; }; /* DSA_NOTIFIER_BRIDGE_* */ diff --git a/net/dsa/port.c b/net/dsa/port.c index c88c0cec8454..efc3bce3a89d 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -135,7 +135,6 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); struct dsa_notifier_ageing_time_info info = { .ageing_time = ageing_time, - .sw_index = dp->ds->index, .trans = trans, }; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index c1e4b2d5a3ae..d8e5c311ee7c 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -37,10 +37,6 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, unsigned int ageing_time = info->ageing_time; struct switchdev_trans *trans = info->trans; - /* Do not care yet about other switch chips of the fabric */ - if (ds->index != info->sw_index) - return 0; - if (switchdev_trans_ph_prepare(trans)) { if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) return -ERANGE; -- cgit v1.2.3 From ce682ef6e3e019f98cafbdc7058668e0ea8f4a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 May 2017 12:38:35 -0700 Subject: tcp: fix TCP_SYNCNT flakes After the mentioned commit, some of our packetdrill tests became flaky. TCP_SYNCNT socket option can limit the number of SYN retransmits. retransmits_timed_out() has to compare times computations based on local_clock() while timers are based on jiffies. With NTP adjustments and roundings we can observe 999 ms delay for 1000 ms timers. We end up sending one extra SYN packet. Gimmick added in commit 6fa12c850314 ("Revert Backoff [v3]: Calculate TCP's connection close threshold as a time value") makes no real sense for TCP_SYN_SENT sockets where no RTO backoff can happen at all. Lets use a simpler logic for TCP_SYN_SENT sockets and remove @syn_set parameter from retransmits_timed_out() Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_timer.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c4a35ba7f8ed..c0feeeef962a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -139,21 +139,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. - * @syn_set: true if the SYN Bit was set. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. - * + * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, - unsigned int timeout, - bool syn_set) + unsigned int timeout) { - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + const unsigned int rto_base = TCP_RTO_MIN; unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) @@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + bool expired, do_reset; int retry_until; - bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { @@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk) sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; - syn_set = true; + expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts after @@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until, 0, 0); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } + expired = retransmits_timed_out(sk, retry_until, + icsk->icsk_user_timeout); } - - if (retransmits_timed_out(sk, retry_until, - syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -540,7 +536,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; -- cgit v1.2.3 From ee538dcea28930bd95606fe00a834935d6fb5613 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 23 May 2017 09:11:59 +0200 Subject: net: sched: cls_api: make reclassify return all the way back to the original tp With the introduction of chain goto action, the reclassification would cause the re-iteration of the actual chain. It makes more sense to restart the whole thing and re-iterate starting from the original tp - start of chain 0. 
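For readers following the control flow, here is a minimal, self-contained sketch of the restart semantics described above. struct tp_sketch, classify_sketch() and the action values are simplified stand-ins invented for illustration, not the kernel's definitions; the authoritative code is the tcf_classify() hunk in the diff below.

/* Illustration only: simplified stand-ins, not the kernel's real types
 * or action values. */
#define TC_ACT_UNSPEC      (-1)		/* no verdict yet, try the next classifier */
#define TC_ACT_RECLASSIFY    1		/* restart from the start of chain 0 */
#define TC_ACT_SHOT          2		/* drop */
#define TC_ACT_GOTO_CHAIN    0x100	/* continue from another chain */

struct tp_sketch {
	int (*classify)(struct tp_sketch *tp);
	struct tp_sketch *next;		/* next classifier in the same chain */
	struct tp_sketch *goto_tp;	/* head of the chain targeted by goto_chain */
};

static int classify_sketch(struct tp_sketch *chain0_head)
{
	const int max_reclassify_loop = 4;
	struct tp_sketch *first_tp = chain0_head;	/* where a restart begins */
	struct tp_sketch *tp;
	int limit = 0;

reclassify:
	for (tp = first_tp; tp; tp = tp->next) {
		int err = tp->classify(tp);

		if (err == TC_ACT_RECLASSIFY) {
			first_tp = chain0_head;		/* all the way back to chain 0 */
			goto reset;
		}
		if (err == TC_ACT_GOTO_CHAIN) {
			first_tp = tp->goto_tp;		/* jump to the target chain */
			goto reset;
		}
		if (err != TC_ACT_UNSPEC)
			return err;			/* a real verdict was reached */
	}
	return TC_ACT_UNSPEC;				/* nothing matched */

reset:
	if (++limit >= max_reclassify_loop)
		return TC_ACT_SHOT;			/* loop guard, as in tcf_classify() */
	goto reclassify;
}
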
Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_api.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 01a8b8b4bab8..89fbb35bc666 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -300,7 +300,8 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, __be16 protocol = tc_skb_protocol(skb); #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 4; - const struct tcf_proto *old_tp = tp; + const struct tcf_proto *orig_tp = tp; + const struct tcf_proto *first_tp; int limit = 0; reclassify: @@ -315,9 +316,10 @@ reclassify: err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { + first_tp = orig_tp; goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { - old_tp = res->goto_tp; + first_tp = res->goto_tp; goto reset; } #endif @@ -335,7 +337,7 @@ reset: return TC_ACT_SHOT; } - tp = old_tp; + tp = first_tp; protocol = tc_skb_protocol(skb); goto reclassify; #endif -- cgit v1.2.3 From 367a8ce896f14018cc2c6cf2681aa440fff274f4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 23 May 2017 09:42:37 -0700 Subject: net_sched: only create filter chains for new filters/actions tcf_chain_get() always creates a new filter chain if not found in existing ones. This is totally unnecessary when we get or delete filters, new chain should be only created for new filters (or new actions). Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++- net/sched/act_api.c | 2 +- net/sched/cls_api.c | 13 +++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2c213a69c196..f7762295b7b8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -18,7 +18,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index); +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 0ecf2a858767..aed6cf2e9fd8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -34,7 +34,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index); + a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); if (!a->goto_chain) return -ENOMEM; return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 89fbb35bc666..39da0c5801c9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -220,7 +220,8 @@ static void tcf_chain_destroy(struct tcf_chain *chain) kfree(chain); } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) { struct tcf_chain *chain; @@ -230,7 +231,10 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) return chain; } } - return tcf_chain_create(block, chain_index); + if (create) + return 
tcf_chain_create(block, chain_index); + else + return NULL; } EXPORT_SYMBOL(tcf_chain_get); @@ -511,9 +515,10 @@ replay: err = -EINVAL; goto errout; } - chain = tcf_chain_get(block, chain_index); + chain = tcf_chain_get(block, chain_index, + n->nlmsg_type == RTM_NEWTFILTER); if (!chain) { - err = -ENOMEM; + err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL; goto errout; } -- cgit v1.2.3 From d0e1a1b5a833b625c93d3d49847609350ebd79db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 May 2017 15:24:46 -0700 Subject: tcp: better validation of received ack sequences Paul Fiterau Brostean reported : Linux TCP stack we analyze exhibits behavior that seems odd to me. The scenario is as follows (all packets have empty payloads, no window scaling, rcv/snd window size should not be a factor): TEST HARNESS (CLIENT) LINUX SERVER 1. - LISTEN (server listen, then accepts) 2. - --> --> SYN-RECEIVED 3. - <-- <-- SYN-RECEIVED 4. - --> --> ESTABLISHED 5. - <-- <-- FIN WAIT-1 (server opts to close the data connection calling "close" on the connection socket) 6. - --> --> CLOSING (client sends FIN,ACK with not yet sent acknowledgement number) 7. - <-- <-- CLOSING (ACK is 102 instead of 101, why?) ... (silence from CLIENT) 8. - <-- <-- CLOSING (retransmission, again ACK is 102) Now, note that packet 6 while having the expected sequence number, acknowledges something that wasn't sent by the server. So I would expect the packet to maybe prompt an ACK response from the server, and then be ignored. Yet it is not ignored and actually leads to an increase of the acknowledgement number in the server's retransmission of the FIN,ACK packet. The explanation I found is that the FIN in packet 6 was processed, despite the acknowledgement number being unacceptable. Further experiments indeed show that the server processes this FIN, transitioning to CLOSING, then on receiving an ACK for the FIN it had send in packet 5, the server (or better said connection) transitions from CLOSING to TIME_WAIT (as signaled by netstat). Indeed, tcp_rcv_state_process() calls tcp_ack() but does not exploit the @acceptable status but for TCP_SYN_RECV state. What we want here is to send a challenge ACK, if not in TCP_SYN_RECV state. TCP_FIN_WAIT1 state is not the only state we should fix. Add a FLAG_NO_CHALLENGE_ACK so that tcp_rcv_state_process() can choose to send a challenge ACK and discard the packet instead of wrongly change socket state. With help from Neal Cardwell. Signed-off-by: Eric Dumazet Reported-by: Paul Fiterau Brostean Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fa55f57ac06..9f4380662196 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -112,6 +112,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -3568,7 +3569,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk, skb); + if (!(flag & FLAG_NO_CHALLENGE_ACK)) + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -5951,13 +5953,17 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) /* step 5: check the ACK field */ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; + FLAG_UPDATE_TS_RECENT | + FLAG_NO_CHALLENGE_ACK) > 0; + if (!acceptable) { + if (sk->sk_state == TCP_SYN_RECV) + return 1; /* send one RST */ + tcp_send_challenge_ack(sk, skb); + goto discard; + } switch (sk->sk_state) { case TCP_SYN_RECV: - if (!acceptable) - return 1; - if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); @@ -6026,14 +6032,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * our SYNACK so stop the SYNACK timer. */ if (req) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) - return 1; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); -- cgit v1.2.3 From 7c3f1875c66fbc19762760097cabc91849ea0bbb Mon Sep 17 00:00:00 2001 From: Roman Kapl Date: Wed, 24 May 2017 10:22:22 +0200 Subject: net: move somaxconn init from sysctl code The default value for somaxconn is set in sysctl_core_net_init(), but this function is not called when kernel is configured without CONFIG_SYSCTL. This results in the kernel not being able to accept TCP connections, because the backlog has zero size. Usually, the user ends up with: "TCP: request_sock_TCP: Possible SYN flooding on port 7. Dropping request. Check SNMP counters." If SYN cookies are not enabled the connection is rejected. Before ef547f2ac16 (tcp: remove max_qlen_log), the effects were less severe, because the backlog was always at least eight slots long. Signed-off-by: Roman Kapl Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 19 +++++++++++++++++++ net/core/sysctl_net_core.c | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1934efd4a9d4..26bbfababff2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,6 +315,25 @@ out_undo: goto out; } +static int __net_init net_defaults_init_net(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + return 0; +} + +static struct pernet_operations net_defaults_ops = { + .init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ + if (register_pernet_subsys(&net_defaults_ops)) + panic("Cannot initialize net default settings"); + + return 0; +} + +core_initcall(net_defaults_init); #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ea23254b2457..b7cd9aafe99e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; - net->core.sysctl_somaxconn = SOMAXCONN; - tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); -- cgit v1.2.3 From 878cd3ba37f77ded9c85e9857e3182a7fe8f5dc3 Mon Sep 17 00:00:00 2001 From: "Rosen, Rami" Date: Wed, 24 May 2017 18:34:11 +0300 Subject: net/packet: remove unused parameter in prb_curr_blk_in_use(). This patch removes unused parameter from prb_curr_blk_in_use() method in net/packet/af_packet.c. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/packet/af_packet.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e3eeed19cc7a..82ca49fba336 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -196,8 +196,7 @@ static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); -static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, - struct tpacket_block_desc *); +static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, @@ -721,7 +720,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) /* Case 1. Queue was frozen because user-space was * lagging behind. */ - if (prb_curr_blk_in_use(pkc, pbd)) { + if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. @@ -972,8 +971,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, } } -static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, - struct tpacket_block_desc *pbd) +static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } @@ -1064,7 +1062,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po, * Check if that last block which caused the queue to freeze, * is still in_use by user-space. 
*/ - if (prb_curr_blk_in_use(pkc, pbd)) { + if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { -- cgit v1.2.3 From 2baec2c3f854d1f79c7bb28386484e144e864a14 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 May 2017 17:02:32 +0100 Subject: rxrpc: Support network namespacing Support network namespacing in AF_RXRPC with the following changes: (1) All the local endpoint, peer and call lists, locks, counters, etc. are moved into the per-namespace record. (2) All the connection tracking is moved into the per-namespace record with the exception of the client connection ID tree, which is kept global so that connection IDs are kept unique per-machine. (3) Each namespace gets its own epoch. This allows each network namespace to pretend to be a separate client machine. (4) The /proc/net/rxrpc_xxx files are now called /proc/net/rxrpc/xxx and the contents reflect the namespace. fs/afs/ should be okay with this patch as it explicitly requires the current net namespace to be init_net to permit a mount to proceed at the moment. It will, however, need updating so that cells, IP addresses and DNS records are per-namespace also. Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/Makefile | 1 + net/rxrpc/af_rxrpc.c | 35 +++++------ net/rxrpc/ar-internal.h | 65 ++++++++++++++++---- net/rxrpc/call_accept.c | 14 +++-- net/rxrpc/call_object.c | 39 ++++++------ net/rxrpc/conn_client.c | 153 +++++++++++++++++++++++------------------------ net/rxrpc/conn_object.c | 55 ++++++++--------- net/rxrpc/conn_service.c | 11 ++-- net/rxrpc/local_object.c | 48 +++++++-------- net/rxrpc/net_ns.c | 85 ++++++++++++++++++++++++++ net/rxrpc/peer_object.c | 26 ++++---- net/rxrpc/proc.c | 40 +++++++++---- 12 files changed, 356 insertions(+), 216 deletions(-) create mode 100644 net/rxrpc/net_ns.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index b9da4d6b914f..9c68d2f8ba39 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -19,6 +19,7 @@ rxrpc-y := \ local_event.o \ local_object.o \ misc.o \ + net_ns.o \ output.o \ peer_event.o \ peer_object.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7fb59c3f1542..cd34ffbff1d1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -38,9 +38,6 @@ MODULE_PARM_DESC(debug, "RxRPC debugging mask"); static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; -/* local epoch for detecting local-end reset */ -u32 rxrpc_epoch; - /* current debugging ID */ atomic_t rxrpc_debug_id; @@ -155,7 +152,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) memcpy(&rx->srx, srx, sizeof(rx->srx)); - local = rxrpc_lookup_local(&rx->srx); + local = rxrpc_lookup_local(sock_net(sock->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; @@ -434,7 +431,7 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) ret = -EAFNOSUPPORT; goto error_unlock; } - local = rxrpc_lookup_local(&rx->srx); + local = rxrpc_lookup_local(sock_net(sock->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; @@ -582,9 +579,6 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, _enter("%p,%d", sock, protocol); - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - /* we support transport protocol UDP/UDP6 only */ if (protocol != PF_INET && IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6) @@ -780,8 +774,6 @@ static int __init af_rxrpc_init(void) 
BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); - get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); - rxrpc_epoch |= RXRPC_RANDOM_EPOCH; get_random_bytes(&tmp, sizeof(tmp)); tmp &= 0x3fffffff; if (tmp == 0) @@ -809,6 +801,10 @@ static int __init af_rxrpc_init(void) goto error_security; } + ret = register_pernet_subsys(&rxrpc_net_ops); + if (ret) + goto error_pernet; + ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { pr_crit("Cannot register protocol\n"); @@ -839,11 +835,6 @@ static int __init af_rxrpc_init(void) goto error_sysctls; } -#ifdef CONFIG_PROC_FS - proc_create("rxrpc_calls", 0, init_net.proc_net, &rxrpc_call_seq_fops); - proc_create("rxrpc_conns", 0, init_net.proc_net, - &rxrpc_connection_seq_fops); -#endif return 0; error_sysctls: @@ -855,6 +846,8 @@ error_key_type: error_sock: proto_unregister(&rxrpc_proto); error_proto: + unregister_pernet_subsys(&rxrpc_net_ops); +error_pernet: rxrpc_exit_security(); error_security: destroy_workqueue(rxrpc_workqueue); @@ -875,14 +868,16 @@ static void __exit af_rxrpc_exit(void) unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); proto_unregister(&rxrpc_proto); - rxrpc_destroy_all_calls(); - rxrpc_destroy_all_connections(); + unregister_pernet_subsys(&rxrpc_net_ops); ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); - rxrpc_destroy_all_locals(); - remove_proc_entry("rxrpc_conns", init_net.proc_net); - remove_proc_entry("rxrpc_calls", init_net.proc_net); + /* Make sure the local and peer records pinned by any dying connections + * are released. + */ + rcu_barrier(); + rxrpc_destroy_client_conn_ids(); + destroy_workqueue(rxrpc_workqueue); rxrpc_exit_security(); kmem_cache_destroy(rxrpc_call_jar); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7486926e60a8..067dbb3121d0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -64,6 +66,37 @@ enum { RXRPC_CLOSE, /* socket is being closed */ }; +/* + * Per-network namespace data. + */ +struct rxrpc_net { + struct proc_dir_entry *proc_net; /* Subdir in /proc/net */ + u32 epoch; /* Local epoch for detecting local-end reset */ + struct list_head calls; /* List of calls active in this namespace */ + rwlock_t call_lock; /* Lock for ->calls */ + + struct list_head conn_proc_list; /* List of conns in this namespace for proc */ + struct list_head service_conns; /* Service conns in this namespace */ + rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ + struct delayed_work service_conn_reaper; + + unsigned int nr_client_conns; + unsigned int nr_active_client_conns; + bool kill_all_client_conns; + spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ + spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ + struct list_head waiting_client_conns; + struct list_head active_client_conns; + struct list_head idle_client_conns; + struct delayed_work client_conn_reaper; + + struct list_head local_endpoints; + struct mutex local_mutex; /* Lock for ->local_endpoints */ + + spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ + DECLARE_HASHTABLE (peer_hash, 10); +}; + /* * Service backlog preallocation. 
* @@ -211,6 +244,7 @@ struct rxrpc_security { struct rxrpc_local { struct rcu_head rcu; atomic_t usage; + struct rxrpc_net *rxnet; /* The network ns in which this resides */ struct list_head link; struct socket *socket; /* my UDP socket */ struct work_struct processor; @@ -601,7 +635,6 @@ struct rxrpc_ack_summary { * af_rxrpc.c */ extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; -extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -634,8 +667,6 @@ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; -extern struct list_head rxrpc_calls; -extern rwlock_t rxrpc_call_lock; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_alloc_call(gfp_t); @@ -653,7 +684,7 @@ void rxrpc_see_call(struct rxrpc_call *); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); -void __exit rxrpc_destroy_all_calls(void); +void rxrpc_destroy_all_calls(struct rxrpc_net *); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) { @@ -773,7 +804,8 @@ int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_call *); void rxrpc_put_client_conn(struct rxrpc_connection *); -void __exit rxrpc_destroy_all_client_connections(void); +void rxrpc_discard_expired_client_conns(struct work_struct *); +void rxrpc_destroy_all_client_connections(struct rxrpc_net *); /* * conn_event.c @@ -784,9 +816,6 @@ void rxrpc_process_connection(struct work_struct *); * conn_object.c */ extern unsigned int rxrpc_connection_expiry; -extern struct list_head rxrpc_connections; -extern struct list_head rxrpc_connection_proc_list; -extern rwlock_t rxrpc_connection_lock; int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); @@ -800,7 +829,8 @@ void rxrpc_see_connection(struct rxrpc_connection *); void rxrpc_get_connection(struct rxrpc_connection *); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); void rxrpc_put_service_conn(struct rxrpc_connection *); -void __exit rxrpc_destroy_all_connections(void); +void rxrpc_service_connection_reaper(struct work_struct *); +void rxrpc_destroy_all_connections(struct rxrpc_net *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { @@ -828,7 +858,7 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn) */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); -struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); +struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); @@ -861,9 +891,9 @@ extern void rxrpc_process_local_events(struct rxrpc_local *); /* * local_object.c */ -struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *); +struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); void __rxrpc_put_local(struct rxrpc_local *); -void __exit rxrpc_destroy_all_locals(void); +void 
rxrpc_destroy_all_locals(struct rxrpc_net *); static inline void rxrpc_get_local(struct rxrpc_local *local) { @@ -901,6 +931,17 @@ extern unsigned int rxrpc_resend_timeout; extern const s8 rxrpc_ack_priority[]; +/* + * net_ns.c + */ +extern unsigned int rxrpc_net_id; +extern struct pernet_operations rxrpc_net_ops; + +static inline struct rxrpc_net *rxrpc_net(struct net *net) +{ + return net_generic(net, rxrpc_net_id); +} + /* * output.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 1752fcf8e8f1..a8515b0d4717 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -38,6 +38,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, { const void *here = __builtin_return_address(0); struct rxrpc_call *call; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); int max, tmp; unsigned int size = RXRPC_BACKLOG_MAX; unsigned int head, tail, call_head, call_tail; @@ -79,7 +80,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, if (CIRC_CNT(head, tail, size) < max) { struct rxrpc_connection *conn; - conn = rxrpc_prealloc_service_connection(gfp); + conn = rxrpc_prealloc_service_connection(rxnet, gfp); if (!conn) return -ENOMEM; b->conn_backlog[head] = conn; @@ -136,9 +137,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); - write_lock(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock(&rxrpc_call_lock); + write_lock(&rxnet->call_lock); + list_add_tail(&call->link, &rxnet->calls); + write_unlock(&rxnet->call_lock); b->call_backlog[call_head] = call; smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); @@ -185,6 +186,7 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) void rxrpc_discard_prealloc(struct rxrpc_sock *rx) { struct rxrpc_backlog *b = rx->backlog; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); unsigned int size = RXRPC_BACKLOG_MAX, head, tail; if (!b) @@ -209,10 +211,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->conn_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_connection *conn = b->conn_backlog[tail]; - write_lock(&rxrpc_connection_lock); + write_lock(&rxnet->conn_lock); list_del(&conn->link); list_del(&conn->proc_link); - write_unlock(&rxrpc_connection_lock); + write_unlock(&rxnet->conn_lock); kfree(conn); tail = (tail + 1) & (size - 1); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 47f7f4205653..692110808baa 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -44,8 +44,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { }; struct kmem_cache *rxrpc_call_jar; -LIST_HEAD(rxrpc_calls); -DEFINE_RWLOCK(rxrpc_call_lock); static void rxrpc_call_timer_expired(unsigned long _call) { @@ -207,6 +205,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call, *xcall; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); struct rb_node *parent, **pp; const void *here = __builtin_return_address(0); int ret; @@ -255,9 +254,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); - write_lock(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock(&rxrpc_call_lock); + write_lock(&rxnet->call_lock); + list_add_tail(&call->link, &rxnet->calls); + write_unlock(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. 
*/ release_sock(&rx->sk); @@ -508,6 +507,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { + struct rxrpc_net *rxnet; const void *here = __builtin_return_address(0); int n; @@ -520,9 +520,12 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) _debug("call %d dead", call->debug_id); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - write_lock(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock(&rxrpc_call_lock); + if (!list_empty(&call->link)) { + rxnet = rxrpc_net(sock_net(&call->socket->sk)); + write_lock(&rxnet->call_lock); + list_del_init(&call->link); + write_unlock(&rxnet->call_lock); + } rxrpc_cleanup_call(call); } @@ -570,21 +573,23 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) } /* - * Make sure that all calls are gone. + * Make sure that all calls are gone from a network namespace. To reach this + * point, any open UDP sockets in that namespace must have been closed, so any + * outstanding calls cannot be doing I/O. */ -void __exit rxrpc_destroy_all_calls(void) +void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) { struct rxrpc_call *call; _enter(""); - if (list_empty(&rxrpc_calls)) + if (list_empty(&rxnet->calls)) return; - write_lock(&rxrpc_call_lock); + write_lock(&rxnet->call_lock); - while (!list_empty(&rxrpc_calls)) { - call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); + while (!list_empty(&rxnet->calls)) { + call = list_entry(rxnet->calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); rxrpc_see_call(call); @@ -595,10 +600,10 @@ void __exit rxrpc_destroy_all_calls(void) rxrpc_call_states[call->state], call->flags, call->events); - write_unlock(&rxrpc_call_lock); + write_unlock(&rxnet->call_lock); cond_resched(); - write_lock(&rxrpc_call_lock); + write_lock(&rxnet->call_lock); } - write_unlock(&rxrpc_call_lock); + write_unlock(&rxnet->call_lock); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index e8dea0d49e7f..c86f3202f967 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -31,7 +31,7 @@ * may freely grant available channels to new calls and calls may be * waiting on it for channels to become available. * - * The connection is on the rxrpc_active_client_conns list which is kept + * The connection is on the rxnet->active_client_conns list which is kept * in activation order for culling purposes. * * rxrpc_nr_active_client_conns is held incremented also. @@ -46,7 +46,7 @@ * expires, the EXPOSED flag is cleared and the connection transitions to * the INACTIVE state. * - * The connection is on the rxrpc_idle_client_conns list which is kept in + * The connection is on the rxnet->idle_client_conns list which is kept in * order of how soon they'll expire. * * There are flags of relevance to the cache: @@ -85,27 +85,13 @@ __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ; -static unsigned int rxrpc_nr_client_conns; -static unsigned int rxrpc_nr_active_client_conns; -static __read_mostly bool rxrpc_kill_all_client_conns; - -static DEFINE_SPINLOCK(rxrpc_client_conn_cache_lock); -static DEFINE_SPINLOCK(rxrpc_client_conn_discard_mutex); -static LIST_HEAD(rxrpc_waiting_client_conns); -static LIST_HEAD(rxrpc_active_client_conns); -static LIST_HEAD(rxrpc_idle_client_conns); - /* * We use machine-unique IDs for our client connections. 
*/ DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); -static void rxrpc_cull_active_client_conns(void); -static void rxrpc_discard_expired_client_conns(struct work_struct *); - -static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, - rxrpc_discard_expired_client_conns); +static void rxrpc_cull_active_client_conns(struct rxrpc_net *); /* * Get a connection ID and epoch for a client connection from the global pool. @@ -116,6 +102,7 @@ static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { + struct rxrpc_net *rxnet = conn->params.local->rxnet; int id; _enter(""); @@ -131,7 +118,7 @@ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, spin_unlock(&rxrpc_conn_id_lock); idr_preload_end(); - conn->proto.epoch = rxrpc_epoch; + conn->proto.epoch = rxnet->epoch; conn->proto.cid = id << RXRPC_CIDSHIFT; set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); _leave(" [CID %x]", conn->proto.cid); @@ -183,6 +170,7 @@ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) { struct rxrpc_connection *conn; + struct rxrpc_net *rxnet = cp->local->rxnet; int ret; _enter(""); @@ -213,9 +201,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) if (ret < 0) goto error_2; - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); - write_unlock(&rxrpc_connection_lock); + write_lock(&rxnet->conn_lock); + list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); + write_unlock(&rxnet->conn_lock); /* We steal the caller's peer ref. */ cp->peer = NULL; @@ -243,12 +231,13 @@ error_0: */ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) { + struct rxrpc_net *rxnet = conn->params.local->rxnet; int id_cursor, id, distance, limit; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; - if (conn->proto.epoch != rxrpc_epoch) + if (conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; /* The IDR tree gets very expensive on memory if the connection IDs are @@ -440,12 +429,13 @@ error: /* * Activate a connection. */ -static void rxrpc_activate_conn(struct rxrpc_connection *conn) +static void rxrpc_activate_conn(struct rxrpc_net *rxnet, + struct rxrpc_connection *conn) { trace_rxrpc_client(conn, -1, rxrpc_client_to_active); conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; - rxrpc_nr_active_client_conns++; - list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); + rxnet->nr_active_client_conns++; + list_move_tail(&conn->cache_link, &rxnet->active_client_conns); } /* @@ -460,7 +450,8 @@ static void rxrpc_activate_conn(struct rxrpc_connection *conn) * channels if it has been culled to make space and then re-requested by a new * call. 
*/ -static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) +static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet, + struct rxrpc_connection *conn) { unsigned int nr_conns; @@ -469,12 +460,12 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE) goto out; - spin_lock(&rxrpc_client_conn_cache_lock); + spin_lock(&rxnet->client_conn_cache_lock); - nr_conns = rxrpc_nr_client_conns; + nr_conns = rxnet->nr_client_conns; if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { trace_rxrpc_client(conn, -1, rxrpc_client_count); - rxrpc_nr_client_conns = nr_conns + 1; + rxnet->nr_client_conns = nr_conns + 1; } switch (conn->cache_state) { @@ -494,21 +485,21 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) } out_unlock: - spin_unlock(&rxrpc_client_conn_cache_lock); + spin_unlock(&rxnet->client_conn_cache_lock); out: _leave(" [%d]", conn->cache_state); return; activate_conn: _debug("activate"); - rxrpc_activate_conn(conn); + rxrpc_activate_conn(rxnet, conn); goto out_unlock; wait_for_capacity: _debug("wait"); trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; - list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); + list_move_tail(&conn->cache_link, &rxnet->waiting_client_conns); goto out_unlock; } @@ -660,18 +651,19 @@ int rxrpc_connect_call(struct rxrpc_call *call, struct sockaddr_rxrpc *srx, gfp_t gfp) { + struct rxrpc_net *rxnet = cp->local->rxnet; int ret; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - rxrpc_discard_expired_client_conns(NULL); - rxrpc_cull_active_client_conns(); + rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper.work); + rxrpc_cull_active_client_conns(rxnet); ret = rxrpc_get_client_conn(call, cp, srx, gfp); if (ret < 0) return ret; - rxrpc_animate_client_conn(call->conn); + rxrpc_animate_client_conn(rxnet, call->conn); rxrpc_activate_channels(call->conn); ret = rxrpc_wait_for_channel(call, gfp); @@ -729,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&call->socket->sk)); trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; @@ -750,7 +743,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) /* We must deactivate or idle the connection if it's now * waiting for nothing. */ - spin_lock(&rxrpc_client_conn_cache_lock); + spin_lock(&rxnet->client_conn_cache_lock); if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING && list_empty(&conn->waiting_calls) && !conn->active_chans) @@ -787,14 +780,14 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) * list. It might even get moved back to the active list whilst we're * waiting for the lock. 
*/ - spin_lock(&rxrpc_client_conn_cache_lock); + spin_lock(&rxnet->client_conn_cache_lock); switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: if (list_empty(&conn->waiting_calls)) { rxrpc_deactivate_one_channel(conn, channel); if (!conn->active_chans) { - rxrpc_nr_active_client_conns--; + rxnet->nr_active_client_conns--; goto idle_connection; } goto out; @@ -820,7 +813,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) } out: - spin_unlock(&rxrpc_client_conn_cache_lock); + spin_unlock(&rxnet->client_conn_cache_lock); out_2: spin_unlock(&conn->channel_lock); rxrpc_put_connection(conn); @@ -835,11 +828,11 @@ idle_connection: trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; conn->cache_state = RXRPC_CONN_CLIENT_IDLE; - list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); - if (rxrpc_idle_client_conns.next == &conn->cache_link && - !rxrpc_kill_all_client_conns) + list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); + if (rxnet->idle_client_conns.next == &conn->cache_link && + !rxnet->kill_all_client_conns) queue_delayed_work(rxrpc_workqueue, - &rxrpc_client_conn_reap, + &rxnet->client_conn_reaper, rxrpc_conn_idle_client_expiry); } else { trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); @@ -857,6 +850,7 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) { struct rxrpc_connection *next = NULL; struct rxrpc_local *local = conn->params.local; + struct rxrpc_net *rxnet = local->rxnet; unsigned int nr_conns; trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); @@ -875,18 +869,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { trace_rxrpc_client(conn, -1, rxrpc_client_uncount); - spin_lock(&rxrpc_client_conn_cache_lock); - nr_conns = --rxrpc_nr_client_conns; + spin_lock(&rxnet->client_conn_cache_lock); + nr_conns = --rxnet->nr_client_conns; if (nr_conns < rxrpc_max_client_connections && - !list_empty(&rxrpc_waiting_client_conns)) { - next = list_entry(rxrpc_waiting_client_conns.next, + !list_empty(&rxnet->waiting_client_conns)) { + next = list_entry(rxnet->waiting_client_conns.next, struct rxrpc_connection, cache_link); rxrpc_get_connection(next); - rxrpc_activate_conn(next); + rxrpc_activate_conn(rxnet, next); } - spin_unlock(&rxrpc_client_conn_cache_lock); + spin_unlock(&rxnet->client_conn_cache_lock); } rxrpc_kill_connection(conn); @@ -921,10 +915,10 @@ void rxrpc_put_client_conn(struct rxrpc_connection *conn) /* * Kill the longest-active client connections to make room for new ones. 
*/ -static void rxrpc_cull_active_client_conns(void) +static void rxrpc_cull_active_client_conns(struct rxrpc_net *rxnet) { struct rxrpc_connection *conn; - unsigned int nr_conns = rxrpc_nr_client_conns; + unsigned int nr_conns = rxnet->nr_client_conns; unsigned int nr_active, limit; _enter(""); @@ -936,12 +930,12 @@ static void rxrpc_cull_active_client_conns(void) } limit = rxrpc_reap_client_connections; - spin_lock(&rxrpc_client_conn_cache_lock); - nr_active = rxrpc_nr_active_client_conns; + spin_lock(&rxnet->client_conn_cache_lock); + nr_active = rxnet->nr_active_client_conns; while (nr_active > limit) { - ASSERT(!list_empty(&rxrpc_active_client_conns)); - conn = list_entry(rxrpc_active_client_conns.next, + ASSERT(!list_empty(&rxnet->active_client_conns)); + conn = list_entry(rxnet->active_client_conns.next, struct rxrpc_connection, cache_link); ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); @@ -953,14 +947,14 @@ static void rxrpc_cull_active_client_conns(void) trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, - &rxrpc_waiting_client_conns); + &rxnet->waiting_client_conns); } nr_active--; } - rxrpc_nr_active_client_conns = nr_active; - spin_unlock(&rxrpc_client_conn_cache_lock); + rxnet->nr_active_client_conns = nr_active; + spin_unlock(&rxnet->client_conn_cache_lock); ASSERTCMP(nr_active, >=, 0); _leave(" [culled]"); } @@ -972,22 +966,25 @@ static void rxrpc_cull_active_client_conns(void) * This may be called from conn setup or from a work item so cannot be * considered non-reentrant. */ -static void rxrpc_discard_expired_client_conns(struct work_struct *work) +void rxrpc_discard_expired_client_conns(struct work_struct *work) { struct rxrpc_connection *conn; + struct rxrpc_net *rxnet = + container_of(to_delayed_work(work), + struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; bool did_discard = false; - _enter("%c", work ? 'w' : 'n'); + _enter(""); - if (list_empty(&rxrpc_idle_client_conns)) { + if (list_empty(&rxnet->idle_client_conns)) { _leave(" [empty]"); return; } /* Don't double up on the discarding */ - if (!spin_trylock(&rxrpc_client_conn_discard_mutex)) { + if (!spin_trylock(&rxnet->client_conn_discard_lock)) { _leave(" [already]"); return; } @@ -995,19 +992,19 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *work) /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ - nr_conns = rxrpc_nr_client_conns; + nr_conns = rxnet->nr_client_conns; next: - spin_lock(&rxrpc_client_conn_cache_lock); + spin_lock(&rxnet->client_conn_cache_lock); - if (list_empty(&rxrpc_idle_client_conns)) + if (list_empty(&rxnet->idle_client_conns)) goto out; - conn = list_entry(rxrpc_idle_client_conns.next, + conn = list_entry(rxnet->idle_client_conns.next, struct rxrpc_connection, cache_link); ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags)); - if (!rxrpc_kill_all_client_conns) { + if (!rxnet->kill_all_client_conns) { /* If the number of connections is over the reap limit, we * expedite discard by reducing the expiry timeout. 
We must, * however, have at least a short grace period to be able to do @@ -1030,7 +1027,7 @@ next: conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); - spin_unlock(&rxrpc_client_conn_cache_lock); + spin_unlock(&rxnet->client_conn_cache_lock); /* When we cleared the EXPOSED flag, we took on responsibility for the * reference that that had on the usage count. We deal with that here. @@ -1050,14 +1047,14 @@ not_yet_expired: * then things get messier. */ _debug("not yet"); - if (!rxrpc_kill_all_client_conns) + if (!rxnet->kill_all_client_conns) queue_delayed_work(rxrpc_workqueue, - &rxrpc_client_conn_reap, + &rxnet->client_conn_reaper, conn_expires_at - now); out: - spin_unlock(&rxrpc_client_conn_cache_lock); - spin_unlock(&rxrpc_client_conn_discard_mutex); + spin_unlock(&rxnet->client_conn_cache_lock); + spin_unlock(&rxnet->client_conn_discard_lock); _leave(""); } @@ -1065,17 +1062,17 @@ out: * Preemptively destroy all the client connection records rather than waiting * for them to time out */ -void __exit rxrpc_destroy_all_client_connections(void) +void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) { _enter(""); - spin_lock(&rxrpc_client_conn_cache_lock); - rxrpc_kill_all_client_conns = true; - spin_unlock(&rxrpc_client_conn_cache_lock); + spin_lock(&rxnet->client_conn_cache_lock); + rxnet->kill_all_client_conns = true; + spin_unlock(&rxnet->client_conn_cache_lock); - cancel_delayed_work(&rxrpc_client_conn_reap); + cancel_delayed_work(&rxnet->client_conn_reaper); - if (!queue_delayed_work(rxrpc_workqueue, &rxrpc_client_conn_reap, 0)) + if (!queue_delayed_work(rxrpc_workqueue, &rxnet->client_conn_reaper, 0)) _debug("destroy: queue failed"); _leave(""); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index b0ecb770fdce..ade4d3d0b2a7 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -22,13 +22,6 @@ */ unsigned int rxrpc_connection_expiry = 10 * 60; -static void rxrpc_connection_reaper(struct work_struct *work); - -LIST_HEAD(rxrpc_connections); -LIST_HEAD(rxrpc_connection_proc_list); -DEFINE_RWLOCK(rxrpc_connection_lock); -static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); - static void rxrpc_destroy_connection(struct rcu_head *); /* @@ -222,15 +215,17 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) */ void rxrpc_kill_connection(struct rxrpc_connection *conn) { + struct rxrpc_net *rxnet = conn->params.local->rxnet; + ASSERT(!rcu_access_pointer(conn->channels[0].call) && !rcu_access_pointer(conn->channels[1].call) && !rcu_access_pointer(conn->channels[2].call) && !rcu_access_pointer(conn->channels[3].call)); ASSERT(list_empty(&conn->cache_link)); - write_lock(&rxrpc_connection_lock); + write_lock(&rxnet->conn_lock); list_del_init(&conn->proc_link); - write_unlock(&rxrpc_connection_lock); + write_unlock(&rxnet->conn_lock); /* Drain the Rx queue. 
Note that even though we've unpublished, an * incoming packet could still be being added to our Rx queue, so we @@ -309,14 +304,17 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) */ void rxrpc_put_service_conn(struct rxrpc_connection *conn) { + struct rxrpc_net *rxnet; const void *here = __builtin_return_address(0); int n; n = atomic_dec_return(&conn->usage); trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); - if (n == 0) - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + if (n == 0) { + rxnet = conn->params.local->rxnet; + rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0); + } } /* @@ -348,9 +346,12 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) /* * reap dead service connections */ -static void rxrpc_connection_reaper(struct work_struct *work) +void rxrpc_service_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; + struct rxrpc_net *rxnet = + container_of(to_delayed_work(work), + struct rxrpc_net, service_conn_reaper); unsigned long reap_older_than, earliest, idle_timestamp, now; LIST_HEAD(graveyard); @@ -361,8 +362,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) reap_older_than = now - rxrpc_connection_expiry * HZ; earliest = ULONG_MAX; - write_lock(&rxrpc_connection_lock); - list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + write_lock(&rxnet->conn_lock); + list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { ASSERTCMP(atomic_read(&conn->usage), >, 0); if (likely(atomic_read(&conn->usage) > 1)) continue; @@ -393,12 +394,12 @@ static void rxrpc_connection_reaper(struct work_struct *work) list_move_tail(&conn->link, &graveyard); } - write_unlock(&rxrpc_connection_lock); + write_unlock(&rxnet->conn_lock); if (earliest != ULONG_MAX) { _debug("reschedule reaper %ld", (long) earliest - now); ASSERT(time_after(earliest, now)); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, + rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, earliest - now); } @@ -418,36 +419,30 @@ static void rxrpc_connection_reaper(struct work_struct *work) * preemptively destroy all the service connection records rather than * waiting for them to time out */ -void __exit rxrpc_destroy_all_connections(void) +void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) { struct rxrpc_connection *conn, *_p; bool leak = false; _enter(""); - rxrpc_destroy_all_client_connections(); + rxrpc_destroy_all_client_connections(rxnet); rxrpc_connection_expiry = 0; - cancel_delayed_work(&rxrpc_connection_reap); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + cancel_delayed_work(&rxnet->client_conn_reaper); + rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0); flush_workqueue(rxrpc_workqueue); - write_lock(&rxrpc_connection_lock); - list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + write_lock(&rxnet->conn_lock); + list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { pr_err("AF_RXRPC: Leaked conn %p {%d}\n", conn, atomic_read(&conn->usage)); leak = true; } - write_unlock(&rxrpc_connection_lock); + write_unlock(&rxnet->conn_lock); BUG_ON(leak); - ASSERT(list_empty(&rxrpc_connection_proc_list)); - - /* Make sure the local and peer records pinned by any dying connections - * are released. 
- */ - rcu_barrier(); - rxrpc_destroy_client_conn_ids(); + ASSERT(list_empty(&rxnet->conn_proc_list)); _leave(""); } diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index eef551f40dc2..edfc633f7d5e 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -121,7 +121,8 @@ replace_old_connection: * Preallocate a service connection. The connection is placed on the proc and * reap lists so that we don't have to get the lock from BH context. */ -struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) +struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxnet, + gfp_t gfp) { struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); @@ -132,10 +133,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) conn->state = RXRPC_CONN_SERVICE_PREALLOC; atomic_set(&conn->usage, 2); - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); - write_unlock(&rxrpc_connection_lock); + write_lock(&rxnet->conn_lock); + list_add_tail(&conn->link, &rxnet->service_conns); + list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); + write_unlock(&rxnet->conn_lock); trace_rxrpc_conn(conn, rxrpc_conn_new_service, atomic_read(&conn->usage), diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index ff4864d550b8..17d79fd73ade 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -25,9 +25,6 @@ static void rxrpc_local_processor(struct work_struct *); static void rxrpc_local_rcu(struct rcu_head *); -static DEFINE_MUTEX(rxrpc_local_mutex); -static LIST_HEAD(rxrpc_local_endpoints); - /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. @@ -77,13 +74,15 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, /* * Allocate a new local endpoint. */ -static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) +static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, + const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { atomic_set(&local->usage, 1); + local->rxnet = rxnet; INIT_LIST_HEAD(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); init_rwsem(&local->defrag_sem); @@ -105,7 +104,7 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) * create the local socket * - must be called with rxrpc_local_mutex locked */ -static int rxrpc_open_socket(struct rxrpc_local *local) +static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *sock; int ret, opt; @@ -114,7 +113,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local) local, local->srx.transport_type, local->srx.transport.family); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(&init_net, local->srx.transport.family, + ret = sock_create_kern(net, local->srx.transport.family, local->srx.transport_type, 0, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); @@ -172,9 +171,11 @@ error: /* * Look up or create a new local endpoint using the specified local address. 
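/* [Editorial aside; not part of the patch.  The hunk above switches the
 * transport socket from sock_create_kern(&init_net, ...) to the caller's
 * namespace.  A minimal sketch of that namespace-aware creation follows;
 * the function name is invented and error handling is trimmed, so treat it
 * as illustrative only. */
static int example_open_socket(struct net *net, struct socket **sockp)
{
	int ret;

	/* Create a kernel UDP socket owned by 'net' rather than init_net. */
	ret = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, sockp);
	if (ret < 0)
		return ret;

	/* ... kernel_bind() to the local endpoint and set options here ... */
	return 0;
}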
*/ -struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) +struct rxrpc_local *rxrpc_lookup_local(struct net *net, + const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; + struct rxrpc_net *rxnet = rxrpc_net(net); struct list_head *cursor; const char *age; long diff; @@ -183,10 +184,10 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) _enter("{%d,%d,%pISp}", srx->transport_type, srx->transport.family, &srx->transport); - mutex_lock(&rxrpc_local_mutex); + mutex_lock(&rxnet->local_mutex); - for (cursor = rxrpc_local_endpoints.next; - cursor != &rxrpc_local_endpoints; + for (cursor = rxnet->local_endpoints.next; + cursor != &rxnet->local_endpoints; cursor = cursor->next) { local = list_entry(cursor, struct rxrpc_local, link); @@ -220,11 +221,11 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) goto found; } - local = rxrpc_alloc_local(srx); + local = rxrpc_alloc_local(rxnet, srx); if (!local) goto nomem; - ret = rxrpc_open_socket(local); + ret = rxrpc_open_socket(local, net); if (ret < 0) goto sock_error; @@ -232,7 +233,7 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) age = "new"; found: - mutex_unlock(&rxrpc_local_mutex); + mutex_unlock(&rxnet->local_mutex); _net("LOCAL %s %d {%pISp}", age, local->debug_id, &local->srx.transport); @@ -243,13 +244,13 @@ found: nomem: ret = -ENOMEM; sock_error: - mutex_unlock(&rxrpc_local_mutex); + mutex_unlock(&rxnet->local_mutex); kfree(local); _leave(" = %d", ret); return ERR_PTR(ret); addr_in_use: - mutex_unlock(&rxrpc_local_mutex); + mutex_unlock(&rxnet->local_mutex); _leave(" = -EADDRINUSE"); return ERR_PTR(-EADDRINUSE); } @@ -273,6 +274,7 @@ void __rxrpc_put_local(struct rxrpc_local *local) static void rxrpc_local_destroyer(struct rxrpc_local *local) { struct socket *socket = local->socket; + struct rxrpc_net *rxnet = local->rxnet; _enter("%d", local->debug_id); @@ -286,9 +288,9 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) } local->dead = true; - mutex_lock(&rxrpc_local_mutex); + mutex_lock(&rxnet->local_mutex); list_del_init(&local->link); - mutex_unlock(&rxrpc_local_mutex); + mutex_unlock(&rxnet->local_mutex); ASSERT(RB_EMPTY_ROOT(&local->client_conns)); ASSERT(!local->service); @@ -357,7 +359,7 @@ static void rxrpc_local_rcu(struct rcu_head *rcu) /* * Verify the local endpoint list is empty by this point. */ -void __exit rxrpc_destroy_all_locals(void) +void rxrpc_destroy_all_locals(struct rxrpc_net *rxnet) { struct rxrpc_local *local; @@ -365,15 +367,13 @@ void __exit rxrpc_destroy_all_locals(void) flush_workqueue(rxrpc_workqueue); - if (!list_empty(&rxrpc_local_endpoints)) { - mutex_lock(&rxrpc_local_mutex); - list_for_each_entry(local, &rxrpc_local_endpoints, link) { + if (!list_empty(&rxnet->local_endpoints)) { + mutex_lock(&rxnet->local_mutex); + list_for_each_entry(local, &rxnet->local_endpoints, link) { pr_err("AF_RXRPC: Leaked local %p {%d}\n", local, atomic_read(&local->usage)); } - mutex_unlock(&rxrpc_local_mutex); + mutex_unlock(&rxnet->local_mutex); BUG(); } - - rcu_barrier(); } diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c new file mode 100644 index 000000000000..26449a6bb076 --- /dev/null +++ b/net/rxrpc/net_ns.c @@ -0,0 +1,85 @@ +/* rxrpc network namespace handling. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "ar-internal.h" + +unsigned int rxrpc_net_id; + +/* + * Initialise a per-network namespace record. + */ +static __net_init int rxrpc_init_net(struct net *net) +{ + struct rxrpc_net *rxnet = rxrpc_net(net); + int ret; + + get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); + rxnet->epoch |= RXRPC_RANDOM_EPOCH; + + INIT_LIST_HEAD(&rxnet->calls); + rwlock_init(&rxnet->call_lock); + + INIT_LIST_HEAD(&rxnet->conn_proc_list); + INIT_LIST_HEAD(&rxnet->service_conns); + rwlock_init(&rxnet->conn_lock); + INIT_DELAYED_WORK(&rxnet->service_conn_reaper, + rxrpc_service_connection_reaper); + + rxnet->nr_client_conns = 0; + rxnet->nr_active_client_conns = 0; + rxnet->kill_all_client_conns = false; + spin_lock_init(&rxnet->client_conn_cache_lock); + spin_lock_init(&rxnet->client_conn_discard_lock); + INIT_LIST_HEAD(&rxnet->waiting_client_conns); + INIT_LIST_HEAD(&rxnet->active_client_conns); + INIT_LIST_HEAD(&rxnet->idle_client_conns); + INIT_DELAYED_WORK(&rxnet->client_conn_reaper, + rxrpc_discard_expired_client_conns); + + INIT_LIST_HEAD(&rxnet->local_endpoints); + mutex_init(&rxnet->local_mutex); + hash_init(rxnet->peer_hash); + spin_lock_init(&rxnet->peer_hash_lock); + + ret = -ENOMEM; + rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); + if (!rxnet->proc_net) + goto err_proc; + + proc_create("calls", 0444, rxnet->proc_net, &rxrpc_call_seq_fops); + proc_create("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_fops); + return 0; + + proc_remove(rxnet->proc_net); +err_proc: + return ret; +} + +/* + * Clean up a per-network namespace record. + */ +static __net_exit void rxrpc_exit_net(struct net *net) +{ + struct rxrpc_net *rxnet = rxrpc_net(net); + + rxrpc_destroy_all_calls(rxnet); + rxrpc_destroy_all_connections(rxnet); + rxrpc_destroy_all_locals(rxnet); + proc_remove(rxnet->proc_net); +} + +struct pernet_operations rxrpc_net_ops = { + .init = rxrpc_init_net, + .exit = rxrpc_exit_net, + .id = &rxrpc_net_id, + .size = sizeof(struct rxrpc_net), +}; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 862eea6b266c..cfed3b27adf0 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -26,9 +26,6 @@ #include #include "ar-internal.h" -static DEFINE_HASHTABLE(rxrpc_peer_hash, 10); -static DEFINE_SPINLOCK(rxrpc_peer_hash_lock); - /* * Hash a peer key. 
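/* [Editorial aside; not part of the patch.  With .id and .size set on the
 * pernet_operations above, the core allocates sizeof(struct rxrpc_net) for
 * every network namespace and calls rxrpc_init_net() to populate it.  The
 * rxrpc_net() accessor used throughout these hunks is presumably a thin
 * net_generic() wrapper along the lines sketched here, and the registration
 * call normally lives in the module init path.  Illustrative only. */
static inline struct rxrpc_net *example_rxrpc_net(struct net *net)
{
	return net_generic(net, rxrpc_net_id);
}

static int __init example_af_rxrpc_init(void)
{
	/* Hook the per-namespace init/exit callbacks into the core. */
	return register_pernet_subsys(&rxrpc_net_ops);
}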
*/ @@ -124,8 +121,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( unsigned long hash_key) { struct rxrpc_peer *peer; + struct rxrpc_net *rxnet = local->rxnet; - hash_for_each_possible_rcu(rxrpc_peer_hash, peer, hash_link, hash_key) { + hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { if (atomic_read(&peer->usage) == 0) return NULL; @@ -301,13 +299,14 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *prealloc) { struct rxrpc_peer *peer; + struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); prealloc->local = local; rxrpc_init_peer(prealloc, hash_key); - spin_lock(&rxrpc_peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); @@ -315,10 +314,10 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, peer = NULL; if (!peer) { peer = prealloc; - hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key); + hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); } - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); return peer; } @@ -329,6 +328,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; + struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); _enter("{%pISp}", &srx->transport); @@ -350,17 +350,17 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock_bh(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) - hash_add_rcu(rxrpc_peer_hash, + hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); - spin_unlock_bh(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) kfree(candidate); @@ -379,11 +379,13 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, */ void __rxrpc_put_peer(struct rxrpc_peer *peer) { + struct rxrpc_net *rxnet = peer->local->rxnet; + ASSERT(hlist_empty(&peer->error_targets)); - spin_lock_bh(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); - spin_unlock_bh(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); kfree_rcu(peer, rcu); } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index b9bcfbfb095c..e92d8405b15a 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -30,19 +30,25 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) { + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + rcu_read_lock(); - read_lock(&rxrpc_call_lock); - return seq_list_start_head(&rxrpc_calls, *_pos); + read_lock(&rxnet->call_lock); + return seq_list_start_head(&rxnet->calls, *_pos); } static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - return seq_list_next(v, &rxrpc_calls, pos); + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + return seq_list_next(v, &rxnet->calls, pos); } static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&rxrpc_call_lock); + struct rxrpc_net *rxnet = 
rxrpc_net(seq_file_net(seq)); + + read_unlock(&rxnet->call_lock); rcu_read_unlock(); } @@ -52,10 +58,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_sock *rx; struct rxrpc_peer *peer; struct rxrpc_call *call; + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); rxrpc_seq_t tx_hard_ack, rx_hard_ack; char lbuff[50], rbuff[50]; - if (v == &rxrpc_calls) { + if (v == &rxnet->calls) { seq_puts(seq, "Proto Local " " Remote " @@ -113,7 +120,8 @@ static const struct seq_operations rxrpc_call_seq_ops = { static int rxrpc_call_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &rxrpc_call_seq_ops); + return seq_open_net(inode, file, &rxrpc_call_seq_ops, + sizeof(struct seq_net_private)); } const struct file_operations rxrpc_call_seq_fops = { @@ -129,27 +137,34 @@ const struct file_operations rxrpc_call_seq_fops = { */ static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) { - read_lock(&rxrpc_connection_lock); - return seq_list_start_head(&rxrpc_connection_proc_list, *_pos); + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_lock(&rxnet->conn_lock); + return seq_list_start_head(&rxnet->conn_proc_list, *_pos); } static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - return seq_list_next(v, &rxrpc_connection_proc_list, pos); + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + return seq_list_next(v, &rxnet->conn_proc_list, pos); } static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&rxrpc_connection_lock); + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_unlock(&rxnet->conn_lock); } static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); char lbuff[50], rbuff[50]; - if (v == &rxrpc_connection_proc_list) { + if (v == &rxnet->conn_proc_list) { seq_puts(seq, "Proto Local " " Remote " @@ -197,7 +212,8 @@ static const struct seq_operations rxrpc_connection_seq_ops = { static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &rxrpc_connection_seq_ops); + return seq_open_net(inode, file, &rxrpc_connection_seq_ops, + sizeof(struct seq_net_private)); } const struct file_operations rxrpc_connection_seq_fops = { -- cgit v1.2.3 From ba615f675281d76fd19aa03558777f81fb6b6084 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 24 May 2017 09:59:31 -0700 Subject: tcp: avoid fastopen API to be used on AF_UNSPEC Fastopen API should be used to perform fastopen operations on the TCP socket. It does not make sense to use fastopen API to perform disconnect by calling it with AF_UNSPEC. The fastopen data path is also prone to race conditions and bugs when using with AF_UNSPEC. One issue reported and analyzed by Vegard Nossum is as follows: +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Thread A: Thread B: ------------------------------------------------------------------------ sendto() - tcp_sendmsg() - sk_stream_memory_free() = 0 - goto wait_for_sndbuf - sk_stream_wait_memory() - sk_wait_event() // sleep | sendto(flags=MSG_FASTOPEN, dest_addr=AF_UNSPEC) | - tcp_sendmsg() | - tcp_sendmsg_fastopen() | - __inet_stream_connect() | - tcp_disconnect() //because of AF_UNSPEC | - tcp_transmit_skb()// send RST | - return 0; // no reconnect! | - sk_stream_wait_connect() | - sock_error() | - xchg(&sk->sk_err, 0) | - return -ECONNRESET - ... 
// wake up, see sk->sk_err == 0 - skb_entail() on TCP_CLOSE socket If the connection is reopened then we will send a brand new SYN packet after thread A has already queued a buffer. At this point I think the socket internal state (sequence numbers etc.) becomes messed up. When the new connection is closed, the FIN-ACK is rejected because the sequence number is outside the window. The other side tries to retransmit, but __tcp_retransmit_skb() calls tcp_trim_head() on an empty skb which corrupts the skb data length and hits a BUG() in copy_and_csum_bits(). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Hence, this patch adds a check for AF_UNSPEC in the fastopen data path and return EOPNOTSUPP to user if such case happens. Fixes: cf60af03ca4e7 ("tcp: Fast Open client - sendmsg(MSG_FASTOPEN)") Reported-by: Vegard Nossum Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 842b575f8fdd..59792d283ff8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1084,9 +1084,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1108,7 +1111,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst -- cgit v1.2.3 From 5990baaa6d7b437dfcf58b7021ca56b1d6b35869 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 24 May 2017 15:19:35 -0700 Subject: arp: fixed -Wuninitialized compiler warning Commit 7d472a59c0e5ec117220a05de6b370447fb6cb66 ("arp: always override existing neigh entries with gratuitous ARP") introduced a compiler warning: net/ipv4/arp.c:880:35: warning: 'addr_type' may be used uninitialized in this function [-Wmaybe-uninitialized] While the code logic seems to be correct and doesn't allow the variable to be used uninitialized, and the warning is not consistently reproducible, it's still worth fixing it for other people not to waste time looking at the warning in case it pops up in the build environment. Yes, compiler is probably at fault, but we will need to accommodate. Fixes: 7d472a59c0e5 ("arp: always override existing neigh entries with gratuitous ARP") Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. 
Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ae96e6f3e0cb..e9f3386a528b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,8 +863,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + addr_type = -1; if (n || IN_DEV_ARP_ACCEPT(in_dev)) { - addr_type = -1; is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } -- cgit v1.2.3 From 41703a731066fde79c3e5ccf3391cf77a98aeda5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:07 +0200 Subject: bpf: add bpf_clone_redirect to bpf_helper_changes_pkt_data The bpf_clone_redirect() still needs to be listed in bpf_helper_changes_pkt_data() since we call into bpf_try_make_head_writable() from there, thus we need to invalidate prior pkt regs as well. Fixes: 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a253a6197e6b..a6bb95fa87b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || + func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head) -- cgit v1.2.3 From 3abd1ade6765e8edcccad6a9e1039cc709e65dde Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 May 2017 10:42:33 -0700 Subject: net: ipv4: refactor __ip_route_output_key_hash A later patch wants access to the fib result on an output route lookup with the rcu lock held. Refactor __ip_route_output_key_hash, pushing the logic between rcu_read_lock ... rcu_read_unlock into a new helper with the fib_result as an input arg. To keep the name length under control remove the leading underscores from the name and add _rcu to the name of the new helper indicating it is called with the rcu read lock held. Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/net/route.h | 9 ++++++--- net/ipv4/icmp.c | 2 +- net/ipv4/route.c | 53 +++++++++++++++++++++++++++++++---------------------- 3 files changed, 38 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 2cc0e14c6359..89e4028cd063 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -113,13 +113,16 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp, - const struct sk_buff *skb); +struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, + const struct sk_buff *skb); +struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, + struct fib_result *res, + const struct sk_buff *skb); static inline struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp) { - return __ip_route_output_key_hash(net, flp, NULL); + return ip_route_output_key_hash(net, flp, NULL); } struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 43318b5f5647..5610971bf859 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -489,7 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key_hash(net, fl4, skb_in); + rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..c9b55cb0e316 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2246,29 +2246,40 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, - const struct sk_buff *skb) +struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + const struct sk_buff *skb) { - struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); - unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int orig_oif; - int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; res.table = NULL; - orig_oif = fl4->flowi4_oif; - fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); rcu_read_lock(); + rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); + rcu_read_unlock(); + + return rth; +} +EXPORT_SYMBOL_GPL(ip_route_output_key_hash); + +struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, + struct fib_result *res, + const struct sk_buff *skb) +{ + struct net_device *dev_out = NULL; + int orig_oif = fl4->flowi4_oif; + unsigned int flags = 0; + struct rtable *rth; + int err = -ENETUNREACH; + if (fl4->saddr) { rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || @@ -2354,15 +2365,15 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; - res.type = RTN_LOCAL; + res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - err = fib_lookup(net, fl4, &res, 0); + err = fib_lookup(net, fl4, res, 0); if (err) { - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !netif_index_is_l3_master(net, fl4->flowi4_oif))) { @@ -2387,43 +2398,41 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); - res.type = RTN_UNICAST; + res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { if (!fl4->saddr) { - if (res.fi->fib_prefsrc) - fl4->saddr = res.fi->fib_prefsrc; + if (res->fi->fib_prefsrc) + fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ - dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? : + dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } - fib_select_path(net, &res, fl4, skb); + fib_select_path(net, res, fl4, skb); - dev_out = FIB_RES_DEV(res); + dev_out = FIB_RES_DEV(*res); fl4->flowi4_oif = dev_out->ifindex; make_route: - rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); + rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: - rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { -- cgit v1.2.3 From 5510cdf7be042a1943222e19912f13a396c0b914 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 May 2017 10:42:34 -0700 Subject: net: ipv4: refactor ip_route_input_noref A later patch wants access to the fib result on an input route lookup with the rcu lock held. Refactor ip_route_input_noref pushing the logic between rcu_read_lock ... rcu_read_unlock into a new helper that takes the fib_result as an input arg. Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/net/route.h | 3 +++ net/ipv4/route.c | 66 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 40 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 89e4028cd063..08e689f23365 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -178,6 +178,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin); +int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin, + struct fib_result *res); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c9b55cb0e316..1dc8fd1e60a9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1852,9 +1852,9 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, + struct fib_result *res) { - struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); struct ip_tunnel_info *tun_info; struct flowi4 fl4; @@ -1884,8 +1884,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1921,17 +1921,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); - err = fib_lookup(net, &fl4, &res, 0); + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } - if (res.type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) goto brd_input; - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) @@ -1943,10 +1943,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res.type != RTN_UNICAST) + if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -1960,14 +1960,14 @@ brd_input: goto martian_source; } flags |= RTCF_BROADCAST; - res.type = RTN_BROADCAST; + res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: do_cache = false; - if (res.fi) { + if (res->fi) { if (!itag) { - rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1978,7 +1978,7 @@ local_input: } rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? 
: net->loopback_dev, - flags | RTCF_LOCAL, res.type, + flags | RTCF_LOCAL, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1988,18 +1988,18 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res.table) - rth->rt_table_id = res.table->tb_id; + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_UNREACHABLE) { + if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh *nh = &FIB_RES_NH(*res); rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { @@ -2019,9 +2019,9 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); - res.type = RTN_UNREACHABLE; - res.fi = NULL; - res.table = NULL; + res->type = RTN_UNREACHABLE; + res->fi = NULL; + res->table = NULL; goto local_input; /* @@ -2051,11 +2051,22 @@ martian_source: int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { - int res; + struct fib_result res; + int err; tos &= IPTOS_RT_MASK; rcu_read_lock(); + err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(ip_route_input_noref); + +/* called with rcu_read_lock held */ +int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, struct fib_result *res) +{ /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2070,6 +2081,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; + int err = -EINVAL; if (in_dev) our = ip_check_mc_rcu(in_dev, daddr, saddr, @@ -2085,7 +2097,6 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); } - res = -EINVAL; if (our #ifdef CONFIG_IP_MROUTE || @@ -2093,17 +2104,14 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - res = ip_route_input_mc(skb, daddr, saddr, + err = ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } - rcu_read_unlock(); - return res; + return err; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev); - rcu_read_unlock(); - return res; + + return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); } -EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, -- cgit v1.2.3 From d3166e0c959311881eaf7ed2e5130822d4337905 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 May 2017 10:42:35 -0700 Subject: net: ipv4: Remove event arg to rt_fill_info rt_fill_info has 1 caller with the event set to RTM_NEWROUTE. Given that remove the arg and use RTM_NEWROUTE directly in rt_fill_info. Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1dc8fd1e60a9..d8fcecce3839 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2536,7 +2536,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, int event) + u32 seq) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2545,7 +2545,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) return -EMSGSIZE; @@ -2745,8 +2745,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, table_id = rt->rt_table_id; err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWROUTE); + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); if (err < 0) goto errout_free; -- cgit v1.2.3 From 3765d35ed8b9363cbf72ffe2282002d717a40843 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 May 2017 10:42:36 -0700 Subject: net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup Convert inet_rtm_getroute to use ip_route_input_rcu and ip_route_output_key_hash_rcu passing the fib_result arg to both. The rcu lock is held through the creation of the response, so the rtable/dst does not need to be attached to the skb and is passed to rt_fill_info directly. In converting from ip_route_output_key to ip_route_output_key_hash_rcu the xfrm_lookup_route in ip_route_output_flow is dropped since flowi4_proto is not set for a route get request. Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d8fcecce3839..1fa9127ec4db 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2534,11 +2534,11 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); +/* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq) + u32 seq, struct rtable *rt) { - struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; @@ -2653,6 +2653,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + struct fib_result res = {}; struct rtable *rt = NULL; struct flowi4 fl4; __be32 dst = 0; @@ -2709,10 +2710,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; + rcu_read_lock(); + if (iif) { struct net_device *dev; - dev = __dev_get_by_index(net, iif); + dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_free; @@ -2721,14 +2724,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, + dev, &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - rt = ip_route_output_key(net, &fl4); - + rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); @@ -2737,7 +2740,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err) goto errout_free; - skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -2745,15 +2747,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, table_id = rt->rt_table_id; err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, rt); if (err < 0) goto errout_free; + rcu_read_unlock(); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; errout_free: + rcu_read_unlock(); kfree_skb(skb); goto errout; } -- cgit v1.2.3 From 6ffd903415320d68a528865296e4740da350785e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 May 2017 10:42:37 -0700 Subject: net: ipv4: Save trie prefix to fib lookup result Prefix is needed for returning matching route spec on get route request. Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 1 + net/ipv4/fib_trie.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 42e8b8f55f7c..25f5c516afd1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -136,6 +136,7 @@ struct fib_rule; struct fib_table; struct fib_result { + __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6d0f6c79d9aa..6e9df7d9bcc2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1452,6 +1452,7 @@ found: if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); + res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->type = fa->fa_type; -- cgit v1.2.3 From b61798130f1be5bff08712308126c2d7ebe390ef Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 25 May 2017 10:42:39 -0700 Subject: net: ipv4: RTM_GETROUTE: return matched fib result when requested This patch adds support to return matched fib result when RTM_F_FIB_MATCH flag is specified in RTM_GETROUTE request. This is useful for user-space applications/controllers wanting to query a matching route. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv4/route.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1fa9127ec4db..3a7425694d8b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -114,6 +114,8 @@ #include #include +#include "fib_lookup.h" + #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -2746,8 +2748,15 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, rt); + if (rtm->rtm_flags & RTM_F_FIB_MATCH) + err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWROUTE, table_id, + rt->rt_type, res.prefix, res.prefixlen, + fl4.flowi4_tos, res.fi, 0); + else + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + rt); if (err < 0) goto errout_free; -- cgit v1.2.3 From 18c3a61c4264cd2f7d6f1bde4b1bf036d5b0a2bc Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 25 May 2017 10:42:40 -0700 Subject: net: ipv6: RTM_GETROUTE: return matched fib result when requested This patch adds support to return matched fib result when RTM_F_FIB_MATCH flag is specified in RTM_GETROUTE request. This is useful for user-space applications/controllers wanting to query a matching route. Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 80bda31ffbbe..2fe84bdc4e60 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3607,11 +3607,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; + int err, iif = 0, oif = 0; + struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6; - int err, iif = 0, oif = 0; + bool fibmatch; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); @@ -3622,6 +3624,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, memset(&fl6, 0, sizeof(fl6)); rtm = nlmsg_data(nlh); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); + fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) @@ -3667,12 +3670,23 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6, - flags); + if (!fibmatch) + dst = ip6_route_input_lookup(net, dev, &fl6, flags); } else { fl6.flowi6_oif = oif; - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); + if (!fibmatch) + dst = ip6_route_output(net, NULL, &fl6); + } + + if (fibmatch) + dst = ip6_route_lookup(net, &fl6, 0); + + rt = container_of(dst, struct rt6_info, dst); + if (rt->dst.error) { + err = rt->dst.error; + ip6_rt_put(rt); + goto errout; } if (rt == net->ipv6.ip6_null_entry) { @@ -3689,10 +3703,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } skb_dst_set(skb, &rt->dst); - - err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + if (fibmatch) + err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + else + err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); if (err < 0) { kfree_skb(skb); goto errout; -- cgit v1.2.3 From 0908cf4dfef35fc6ac12329007052ebe93ff1081 Mon Sep 17 00:00:00 2001 From: linzhang Date: Thu, 25 May 2017 14:07:18 +0800 Subject: net: llc: add lock_sock in llc_ui_bind to avoid a race condition There is a race condition in llc_ui_bind if two or more processes/threads try to bind a same socket. If more processes/threads bind a same socket success that will lead to two problems, one is this action is not what we expected, another is will lead to kernel in unstable status or oops(in my simple test case, cause llc2.ko can't unload). The current code is test SOCK_ZAPPED bit to avoid a process to bind a same socket twice but that is can't avoid more processes/threads try to bind a same socket at the same time. So, add lock_sock in llc_ui_bind like others, such as llc_ui_connect. Signed-off-by: Lin Zhang Signed-off-by: David S. 
Miller --- net/llc/af_llc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8364fe5b59e4..c38d16f22d2a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -311,6 +311,8 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) int rc = -EINVAL; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); + + lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; @@ -382,6 +384,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: + release_sock(sk); return rc; } -- cgit v1.2.3 From 804ec7ebe8ea003999ca8d1bfc499edc6a9e07df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 25 May 2017 19:14:56 +0200 Subject: sctp: fix ICMP processing if skb is non-linear sometimes ICMP replies to INIT chunks are ignored by the client, even if the encapsulated SCTP headers match an open socket. This happens when the ICMP packet is carried by a paged skb: use skb_header_pointer() to read packet contents beyond the SCTP header, so that chunk header and initiate tag are validated correctly. v2: - don't use skb_header_pointer() to read the transport header, since icmp_socket_deliver() already puts these 8 bytes in the linear area. - change commit message to make specific reference to INIT chunks. Signed-off-by: Davide Caratti Acked-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/input.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 0e06a278d2a9..ba9ad32fc447 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -473,15 +473,14 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association **app, struct sctp_transport **tpp) { + struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; - struct sctp_init_chunk *chunkhdr; __u32 vtag = ntohl(sctphdr->vtag); - int len = skb->len - ((void *)sctphdr - (void *)skb->data); *app = NULL; *tpp = NULL; @@ -516,13 +515,16 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * discard the packet. */ if (vtag == 0) { - chunkhdr = (void *)sctphdr + sizeof(struct sctphdr); - if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t) - + sizeof(__be32) || + /* chunk header + first 4 octects of init header */ + chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr), + sizeof(struct sctp_chunkhdr) + + sizeof(__be32), &_chunkhdr); + if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || - ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) { + ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; - } + } else if (vtag != asoc->c.peer_vtag) { goto out; } -- cgit v1.2.3 From 0e9a709560dbcfbace8bf4019dc5298619235891 Mon Sep 17 00:00:00 2001 From: Peter Dawson Date: Fri, 26 May 2017 06:35:18 +1000 Subject: ip6_tunnel, ip6_gre: fix setting of DSCP on encapsulated packets This fix addresses two problems in the way the DSCP field is formulated on the encapsulating header of IPv6 tunnels. 
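[Editorial aside; not part of the original commit message. As background for the two problems enumerated below, the small standalone sketch that follows shows the flowinfo bit layout the fix depends on: the flow label occupies only the low 20 bits of the 32-bit flowinfo word, while the traffic class (DSCP + ECN) sits in bits 20-27, so deriving the DSCP from a value whose upper bits have been masked away yields zero. The constants and helper below are restated locally for illustration and mirror, rather than quote, the kernel's definitions.]

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define EX_FLOWLABEL_MASK 0x000FFFFFu	/* low 20 bits: flow label */
#define EX_TCLASS_MASK    0x0FF00000u	/* bits 20-27: DSCP + ECN  */
#define EX_TCLASS_SHIFT   20

/* Extract the traffic class byte from a network-order flowinfo word. */
static uint8_t ex_ip6_tclass(uint32_t flowinfo_be)
{
	return (ntohl(flowinfo_be) & EX_TCLASS_MASK) >> EX_TCLASS_SHIFT;
}

int main(void)
{
	/* Traffic class 0xb8 (DSCP EF) plus an arbitrary flow label. */
	uint32_t flowinfo = htonl((0xb8u << EX_TCLASS_SHIFT) | 0x12345u);

	printf("tclass from full flowinfo:    0x%02x\n",
	       ex_ip6_tclass(flowinfo));
	printf("tclass after label-only mask: 0x%02x\n",
	       ex_ip6_tclass(flowinfo & htonl(EX_FLOWLABEL_MASK)));
	return 0;
}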
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195661 1) The IPv6 tunneling code was manipulating the DSCP field of the encapsulating packet using the 32b flowlabel. Since the flowlabel is only the lower 20b it was incorrect to assume that the upper 12b containing the DSCP and ECN fields would remain intact when formulating the encapsulating header. This fix handles the 'inherit' and 'fixed-value' DSCP cases explicitly using the extant dsfield u8 variable. 2) The use of INET_ECN_encapsulate(0, dsfield) in ip6_tnl_xmit was incorrect and resulted in the DSCP value always being set to 0. Commit 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") caused the regression by masking out the flowlabel which exposed the incorrect handling of the DSCP portion of the flowlabel in ip6_tunnel and ip6_gre. Fixes: 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") Signed-off-by: Peter Dawson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 13 +++++++------ net/ipv6/ip6_tunnel.c | 21 +++++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 8d128ba79b66..0c5b4caa1949 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -537,11 +537,10 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv4_get_dsfield(iph); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -598,9 +597,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6eb2ae507500..7ae6c503f1ca 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1196,7 +1196,7 @@ route_lookup: skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; @@ -1231,8 +1231,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - dsfield = ipv4_get_dsfield(iph); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1246,6 +1244,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; @@ -1254,8 +1253,9 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & 
IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -1267,6 +1267,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); + skb_set_inner_ipproto(skb, IPPROTO_IPIP); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, @@ -1300,8 +1302,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1315,6 +1315,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ @@ -1337,7 +1338,9 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) @@ -1351,6 +1354,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + skb_set_inner_ipproto(skb, IPPROTO_IPV6); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, -- cgit v1.2.3 From 3fb07daff8e99243366a081e5129560734de4ada Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 May 2017 14:27:35 -0700 Subject: ipv4: add reference counting to metrics Andrey Konovalov reported crashes in ipv4_mtu() I could reproduce the issue with KASAN kernels, between 10.246.7.151 and 10.246.7.152 : 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & 2) At the same time run following loop : while : do ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 done Cong Wang attempted to add back rt->fi in commit 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") but this proved to add some issues that were complex to solve. Instead, I suggested to add a refcount to the metrics themselves, being a standalone object (in particular, no reference to other objects) I tried to make this patch as small as possible to ease its backport, instead of being super clean. Note that we believe that only ipv4 dst need to take care of the metric refcount. But if this is wrong, this patch adds the basic infrastructure to extend this to other families. Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang for his efforts on this problem. Fixes: 2860583fe840 ("ipv4: Kill rt->fi") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Reviewed-by: Julian Anastasov Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/dst.h | 8 +++++++- include/net/ip_fib.h | 10 +++++----- net/core/dst.c | 23 ++++++++++++++--------- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/route.c | 10 +++++++++- 5 files changed, 45 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index 049af33da3b6..cfc043784166 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -107,10 +107,16 @@ struct dst_entry { }; }; +struct dst_metrics { + u32 metrics[RTAX_MAX]; + atomic_t refcnt; +}; +extern const struct dst_metrics dst_default_metrics; + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL +#define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c5758b33..f7f6aa789c61 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -114,11 +114,11 @@ struct fib_info { __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; - u32 *fib_metrics; -#define fib_mtu fib_metrics[RTAX_MTU-1] -#define fib_window fib_metrics[RTAX_WINDOW-1] -#define fib_rtt fib_metrics[RTAX_RTT-1] -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + struct dst_metrics *fib_metrics; +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a52..6192f11beec9 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. 
*/ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc1..ad9ad4aab5da 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if 
(fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..6883b3d4ba8f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif -- cgit v1.2.3 From 1f51445af35e8477027d87ca015a10257b13f5a2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 26 May 2017 08:37:23 +0200 Subject: bridge: Export VLAN filtering state It's useful for drivers supporting bridge offload to be able to query the bridge's VLAN filtering state. Currently, upon enslavement to a bridge master, the offloading driver will only learn about the bridge's VLAN filtering state after the bridge device was already linked with its slave. Being able to query the bridge's VLAN filtering state allows such drivers to forbid enslavement in case resource couldn't be allocated for a VLAN-aware bridge and also choose the correct initialization routine for the enslaved port, which is dependent on the bridge type. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 9 +++++++++ net/bridge/br_if.c | 2 +- net/bridge/br_mdb.c | 4 ++-- net/bridge/br_netlink.c | 2 +- net/bridge/br_private.h | 9 --------- net/bridge/br_vlan.c | 8 ++++++++ 6 files changed, 21 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 0c16866a7aac..d6cd103eb165 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -80,4 +80,13 @@ static inline bool br_multicast_has_querier_adjacent(struct net_device *dev, } #endif +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) +bool br_vlan_enabled(const struct net_device *dev); +#else +static inline bool br_vlan_enabled(const struct net_device *dev) +{ + return false; +} +#endif + #endif diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 7f8d05cf9065..f3aef22931ab 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -138,7 +138,7 @@ void br_manage_promisc(struct net_bridge *br) /* If vlan filtering is disabled or bridge interface is placed * into promiscuous mode, place all ports in promiscuous mode. 
*/ - if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br)) + if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev)) set_all = true; list_for_each_entry(p, &br->port_list, list) { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index b0845480a3ae..09dcdb9c0f3c 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -599,7 +599,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; vg = nbp_vlan_group(p); - if (br_vlan_enabled(br) && vg && entry->vid == 0) { + if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; err = __br_mdb_add(net, br, entry); @@ -694,7 +694,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; vg = nbp_vlan_group(p); - if (br_vlan_enabled(br) && vg && entry->vid == 0) { + if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; err = __br_mdb_del(br, entry); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 574f78824d8a..1e63ec466d7c 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1251,7 +1251,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; - u8 vlan_enabled = br_vlan_enabled(br); + u8 vlan_enabled = br_vlan_enabled(br->dev); u64 clockval; clockval = br_timer_value(&br->hello_timer); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0d177280aa84..20626927f433 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -854,10 +854,6 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return vg->pvid; } -static inline int br_vlan_enabled(struct net_bridge *br) -{ - return br->vlan_enabled; -} #else static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, @@ -945,11 +941,6 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return 0; } -static inline int br_vlan_enabled(struct net_bridge *br) -{ - return 0; -} - static inline int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index b838213c408e..26a1a56639b2 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -706,6 +706,14 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return __br_vlan_filter_toggle(br, val); } +bool br_vlan_enabled(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + return !!br->vlan_enabled; +} +EXPORT_SYMBOL_GPL(br_vlan_enabled); + int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; -- cgit v1.2.3 From 9341b988e606f951df57d15569a425c6c74b945e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 26 May 2017 08:37:24 +0200 Subject: bridge: Export multicast enabled state During enslavement to a bridge, after the CHANGEUPPER is sent, the multicast enabled state of the bridge isn't propagated down to the offloading driver unless it's changed. This patch allows such drivers to query the multicast enabled state from the bridge, so that they'll be able to correctly configure their flood tables during port enslavement. In case multicast is disabled, unregistered multicast packets can be treated as broadcast and be flooded through all the bridge ports. 
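For illustration only, a sketch of how an offloading (switchdev) driver might consult the two helpers exported by this and the previous patch when one of its ports is enslaved; the example_* function names and the flood-handling policy shown here are assumptions, not part of these patches:

#include <linux/if_bridge.h>
#include <linux/netdevice.h>

/* Hypothetical driver helpers built on the exported bridge state. */
static bool example_bridge_is_vlan_aware(const struct net_device *br_dev)
{
	/* Lets the driver pick the right port init path, or refuse
	 * enslavement when a VLAN-aware bridge cannot be offloaded.
	 */
	return br_vlan_enabled(br_dev);
}

static bool example_flood_unregistered_mcast(const struct net_device *br_dev)
{
	/* With multicast snooping disabled, unregistered multicast is
	 * treated like broadcast and flooded through all bridge ports.
	 */
	return !br_multicast_enabled(br_dev);
}
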
Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 5 +++++ net/bridge/br_multicast.c | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d6cd103eb165..3cd18ac0697f 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -62,6 +62,7 @@ int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); +bool br_multicast_enabled(const struct net_device *dev); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -78,6 +79,10 @@ static inline bool br_multicast_has_querier_adjacent(struct net_device *dev, { return false; } +static inline bool br_multicast_enabled(const struct net_device *dev) +{ + return false; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index faa7261a992f..8dc5c8d69bcd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2176,6 +2176,14 @@ unlock: return err; } +bool br_multicast_enabled(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + return !br->multicast_disabled; +} +EXPORT_SYMBOL_GPL(br_multicast_enabled); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; -- cgit v1.2.3 From 3d3ea5af5c0b382bc9d9aed378fd814fb5d4a011 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 27 May 2017 10:14:34 -0400 Subject: rtnl: Add support for netdev event to link messages When netdev events happen, the rtnetlink_event() handler will send messages for every event in its white list. These messages contain current information about a particular device, but they do not include the information about which event just happened. So, it is impossible to tell what just happened for these events. This patch adds a new extension to the RTM_NEWLINK message, called IFLA_EVENT, that carries an encoding of the event that triggered this message. This would allow the message consumer to easily determine if it needs to perform certain actions. Signed-off-by: Vladislav Yasevich Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 11 ++++++++ net/core/dev.c | 2 +- net/core/rtnetlink.c | 65 ++++++++++++++++++++++++++++++++++++++------ 4 files changed, 70 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 57e54847b0b9..dea59c8eec54 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,7 +18,8 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, gfp_t flags); + unsigned change, u32 event, + gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 15ac20382aba..8ed679fe603f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,6 +157,7 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, + IFLA_EVENT, __IFLA_MAX }; @@ -911,4 +912,14 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 3d98fbf4cbb0..06e0a7492df8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7084,7 +7084,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64953af4a3b1..9da53e43750c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -941,6 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1282,9 +1283,40 @@ err_cancel: return err; } +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + u32 event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1333,6 +1365,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, 
IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) + goto nla_put_failure; + } + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1467,6 +1504,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1626,7 +1664,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); + ext_filter_mask, 0); if (err < 0) { if (likely(skb->len)) @@ -2736,7 +2774,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2808,7 +2846,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + u32 event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2819,7 +2858,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2840,18 +2879,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, IFLA_EVENT_NONE, flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4168,7 +4214,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_NOTIFY_PEERS: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL); break; default: break; -- cgit v1.2.3 From 7a7c0a6438b8e7636d5a22e572892cc234f68297 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 27 May 2017 00:27:25 +0200 Subject: mac80211: fix TX aggregation start/stop callback race When starting or stopping an aggregation session, one of the steps is that the driver calls back to mac80211 that the start/stop can proceed. This is handled by queueing up a fake SKB and processing it from the normal iface/sdata work. 
Since this isn't flushed when disassociating, the following race is possible: * associate * start aggregation session * driver callback * disassociate * associate again to the same AP * callback processing runs, leading to a WARN_ON() that the TID hadn't requested aggregation If the second association isn't to the same AP, there would only be a message printed ("Could not find station: "), but the same race could happen. Fix this by not going the whole detour with a fake SKB etc. but simply looking up the aggregation session in the driver callback, marking it with a START_CB/STOP_CB bit and then scheduling the regular aggregation work that will now process these bits as well. This also simplifies the code and gets rid of the whole problem with allocation failures of said skb, which could have left the session in limbo. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 128 ++++++++++++++++++--------------------------- net/mac80211/ht.c | 16 ++++-- net/mac80211/ieee80211_i.h | 14 ++--- net/mac80211/iface.c | 11 +--- net/mac80211/sta_info.h | 2 + 5 files changed, 71 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 60e2a62f7bef..cf2392b2ac71 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -741,46 +741,43 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, ieee80211_agg_start_txq(sta, tid, true); } -void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid) +void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - struct tid_ampdu_tx *tid_tx; - trace_api_start_tx_ba_cb(sdata, ra, tid); + if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) + return; + + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) + ieee80211_agg_tx_operational(local, sta, tid); +} + +static struct tid_ampdu_tx * +ieee80211_lookup_tid_tx(struct ieee80211_sub_if_data *sdata, + const u8 *ra, u16 tid, struct sta_info **sta) +{ + struct tid_ampdu_tx *tid_tx; if (tid >= IEEE80211_NUM_TIDS) { ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n", tid, IEEE80211_NUM_TIDS); - return; + return NULL; } - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, ra); - if (!sta) { - mutex_unlock(&local->sta_mtx); + *sta = sta_info_get_bss(sdata, ra); + if (!*sta) { ht_dbg(sdata, "Could not find station: %pM\n", ra); - return; + return NULL; } - mutex_lock(&sta->ampdu_mlme.mtx); - tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + tid_tx = rcu_dereference((*sta)->ampdu_mlme.tid_tx[tid]); - if (WARN_ON(!tid_tx)) { + if (WARN_ON(!tid_tx)) ht_dbg(sdata, "addBA was not requested!\n"); - goto unlock; - } - if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) - goto unlock; - - if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) - ieee80211_agg_tx_operational(local, sta, tid); - - unlock: - mutex_unlock(&sta->ampdu_mlme.mtx); - mutex_unlock(&local->sta_mtx); + 
return tid_tx; } void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, @@ -788,19 +785,20 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; - struct ieee80211_ra_tid *ra_tid; - struct sk_buff *skb = dev_alloc_skb(0); + struct sta_info *sta; + struct tid_ampdu_tx *tid_tx; - if (unlikely(!skb)) - return; + trace_api_start_tx_ba_cb(sdata, ra, tid); - ra_tid = (struct ieee80211_ra_tid *) &skb->cb; - memcpy(&ra_tid->ra, ra, ETH_ALEN); - ra_tid->tid = tid; + rcu_read_lock(); + tid_tx = ieee80211_lookup_tid_tx(sdata, ra, tid, &sta); + if (!tid_tx) + goto out; - skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START; - skb_queue_tail(&sdata->skb_queue, skb); - ieee80211_queue_work(&local->hw, &sdata->work); + set_bit(HT_AGG_STATE_START_CB, &tid_tx->state); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + out: + rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); @@ -860,37 +858,18 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid) } EXPORT_SYMBOL(ieee80211_stop_tx_ba_session); -void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) +void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - struct tid_ampdu_tx *tid_tx; + struct ieee80211_sub_if_data *sdata = sta->sdata; bool send_delba = false; - trace_api_stop_tx_ba_cb(sdata, ra, tid); - - if (tid >= IEEE80211_NUM_TIDS) { - ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n", - tid, IEEE80211_NUM_TIDS); - return; - } - - ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n", ra, tid); - - mutex_lock(&local->sta_mtx); - - sta = sta_info_get_bss(sdata, ra); - if (!sta) { - ht_dbg(sdata, "Could not find station: %pM\n", ra); - goto unlock; - } + ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n", + sta->sta.addr, tid); - mutex_lock(&sta->ampdu_mlme.mtx); spin_lock_bh(&sta->lock); - tid_tx = rcu_dereference_protected_tid_tx(sta, tid); - if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { + if (!test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { ht_dbg(sdata, "unexpected callback to A-MPDU stop for %pM tid %d\n", sta->sta.addr, tid); @@ -906,12 +885,8 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) spin_unlock_bh(&sta->lock); if (send_delba) - ieee80211_send_delba(sdata, ra, tid, + ieee80211_send_delba(sdata, sta->sta.addr, tid, WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); - - mutex_unlock(&sta->ampdu_mlme.mtx); - unlock: - mutex_unlock(&local->sta_mtx); } void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, @@ -919,19 +894,20 @@ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; - struct ieee80211_ra_tid *ra_tid; - struct sk_buff *skb = dev_alloc_skb(0); + struct sta_info *sta; + struct tid_ampdu_tx *tid_tx; - if (unlikely(!skb)) - return; + trace_api_stop_tx_ba_cb(sdata, ra, tid); - ra_tid = (struct ieee80211_ra_tid *) &skb->cb; - memcpy(&ra_tid->ra, ra, ETH_ALEN); - ra_tid->tid = tid; + rcu_read_lock(); + tid_tx = ieee80211_lookup_tid_tx(sdata, ra, tid, &sta); + if (!tid_tx) + goto out; - skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP; - skb_queue_tail(&sdata->skb_queue, skb); - ieee80211_queue_work(&local->hw, 
&sdata->work); + set_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + out: + rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index f4a528773563..6ca5442b1e03 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation + * Copyright 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -289,8 +290,6 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, { int i; - cancel_work_sync(&sta->ampdu_mlme.work); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { __ieee80211_stop_tx_ba_session(sta, i, reason); __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, @@ -298,6 +297,9 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); } + + /* stopping might queue the work again - so cancel only afterwards */ + cancel_work_sync(&sta->ampdu_mlme.work); } void ieee80211_ba_session_work(struct work_struct *work) @@ -352,10 +354,16 @@ void ieee80211_ba_session_work(struct work_struct *work) spin_unlock_bh(&sta->lock); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); - if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP, - &tid_tx->state)) + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) + ieee80211_start_tx_ba_cb(sta, tid, tid_tx); + if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) ___ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, tid, tid_tx); } mutex_unlock(&sta->ampdu_mlme.mtx); } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f8f6c148f554..665501ac358f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1036,8 +1036,6 @@ struct ieee80211_rx_agg { enum sdata_queue_type { IEEE80211_SDATA_QUEUE_TYPE_FRAME = 0, - IEEE80211_SDATA_QUEUE_AGG_START = 1, - IEEE80211_SDATA_QUEUE_AGG_STOP = 2, IEEE80211_SDATA_QUEUE_RX_AGG_START = 3, IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4, }; @@ -1427,12 +1425,6 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) return local->hw.wiphy->bands[band]; } -/* this struct represents 802.11n's RA/TID combination */ -struct ieee80211_ra_tid { - u8 ra[ETH_ALEN]; - u16 tid; -}; - /* this struct holds the value parsing from channel switch IE */ struct ieee80211_csa_ie { struct cfg80211_chan_def chandef; @@ -1794,8 +1786,10 @@ int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); -void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); -void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); +void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx); +void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx); void ieee80211_ba_session_work(struct work_struct *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); diff --git a/net/mac80211/iface.c 
b/net/mac80211/iface.c index 3bd5b81f5d81..8fae1a72e6a7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1237,7 +1237,6 @@ static void ieee80211_iface_work(struct work_struct *work) struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct sta_info *sta; - struct ieee80211_ra_tid *ra_tid; struct ieee80211_rx_agg *rx_agg; if (!ieee80211_sdata_running(sdata)) @@ -1253,15 +1252,7 @@ static void ieee80211_iface_work(struct work_struct *work) while ((skb = skb_dequeue(&sdata->skb_queue))) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) { - ra_tid = (void *)&skb->cb; - ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra, - ra_tid->tid); - } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) { - ra_tid = (void *)&skb->cb; - ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra, - ra_tid->tid); - } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) { + if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) { rx_agg = (void *)&skb->cb; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, rx_agg->addr); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5609cacb20d5..ea0747d6a6da 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -116,6 +116,8 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_STOPPING 3 #define HT_AGG_STATE_WANT_START 4 #define HT_AGG_STATE_WANT_STOP 5 +#define HT_AGG_STATE_START_CB 6 +#define HT_AGG_STATE_STOP_CB 7 enum ieee80211_agg_stop_reason { AGG_STOP_DECLINED, -- cgit v1.2.3 From ba277e8e05dbd4aa13f74f859e276d5d54467eab Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:25 -0600 Subject: net: ipv4: refactor key and length checks fib_table_insert and fib_table_delete have the same checks on the prefix and length. Refactor into a helper. Avoids duplicate extack messages in the next patch. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6e9df7d9bcc2..9bd46e1e1037 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1099,6 +1099,17 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } +static bool fib_valid_key_len(u32 key, u8 plen) +{ + if (plen > KEYLENGTH) + return false; + + if ((plen < KEYLENGTH) && (key << plen)) + return false; + + return true; +} + /* Caller must hold RTNL. 
*/ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1115,16 +1126,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, u32 key; int err; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen)) return -EINVAL; + pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); + fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); @@ -1518,12 +1526,9 @@ int fib_table_delete(struct net *net, struct fib_table *tb, u8 tos = cfg->fc_tos; u32 key; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen)) return -EINVAL; l = fib_find_node(t, &tp, key); -- cgit v1.2.3 From 78055998954b7a3e6c31eb24d1d26f0b63a7ec0d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:26 -0600 Subject: net: ipv4: Add extack message for invalid prefix or length Add extack error message for invalid prefix length and invalid prefix. Example of the latter is a route spec containing 172.16.100.1/24, where the /24 mask means the lower 8-bits should be 0. Amazing how easy that one is to overlook when an EINVAL is returned. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 7 ++++--- net/ipv4/fib_trie.c | 17 +++++++++++------ 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index dcbfd5dfd25e..3dbfd5e6a347 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -266,7 +266,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); -int fib_table_delete(struct net *, struct fib_table *, struct fib_config *); +int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, + struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); int fib_table_flush(struct net *net, struct fib_table *table); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 14d2f7bd7c76..715b7967d8ea 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -588,7 +588,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, + NULL); else err = -ESRCH; } else { @@ -732,7 +733,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } @@ -851,7 +852,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else - fib_table_delete(net, tb, &cfg); + fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9bd46e1e1037..a624d380c81d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1099,13 +1099,18 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } 
-static bool fib_valid_key_len(u32 key, u8 plen) +static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) { - if (plen > KEYLENGTH) + if (plen > KEYLENGTH) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); return false; + } - if ((plen < KEYLENGTH) && (key << plen)) + if ((plen < KEYLENGTH) && (key << plen)) { + NL_SET_ERR_MSG(extack, + "Invalid prefix for given prefix length"); return false; + } return true; } @@ -1128,7 +1133,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); @@ -1516,7 +1521,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; @@ -1528,7 +1533,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; l = fib_find_node(t, &tp, key); -- cgit v1.2.3 From c255bd681d1a93fff2a2c249d91449cce830ac64 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:27 -0600 Subject: net: lwtunnel: Add extack to encap attr validation Pass extack down to lwtunnel_valid_encap_type and lwtunnel_valid_encap_type_attr. Add messages for unknown or unsupported encap types. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 13 +++++++++---- net/core/lwtunnel.c | 18 +++++++++++++----- net/ipv4/fib_frontend.c | 6 ++++-- net/ipv6/route.c | 4 ++-- 4 files changed, 28 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index ebfe237aad7e..ca6f002774ef 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -107,8 +107,10 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); -int lwtunnel_valid_encap_type(u16 encap_type); -int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len); +int lwtunnel_valid_encap_type(u16 encap_type, + struct netlink_ext_ack *extack); +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, + struct netlink_ext_ack *extack); int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -172,11 +174,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, return -EOPNOTSUPP; } -static inline int lwtunnel_valid_encap_type(u16 encap_type) +static inline int lwtunnel_valid_encap_type(u16 encap_type, + struct netlink_ext_ack *extack) { + NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } -static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len) +static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, + struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. 
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index cfae3d5fe11f..ab840386a74d 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -126,14 +126,16 @@ int lwtunnel_build_state(u16 encap_type, } EXPORT_SYMBOL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || - encap_type > LWTUNNEL_ENCAP_MAX) + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; + } rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); @@ -153,11 +155,16 @@ int lwtunnel_valid_encap_type(u16 encap_type) } } #endif - return ops ? 0 : -EOPNOTSUPP; + ret = ops ? 0 : -EOPNOTSUPP; + if (ret < 0) + NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); + + return ret; } EXPORT_SYMBOL(lwtunnel_valid_encap_type); -int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; @@ -174,7 +181,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) if (nla_entype) { encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type) != 0) + if (lwtunnel_valid_encap_type(encap_type, + extack) != 0) return -EOPNOTSUPP; } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 715b7967d8ea..4e678fa892dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -685,7 +685,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), - nla_len(attr)); + nla_len(attr), + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -702,7 +703,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, + extack); if (err < 0) goto errout; break; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2fe84bdc4e60..524a76b5206e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3016,7 +3016,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len); + cfg->fc_mp_len, extack); if (err < 0) goto errout; } @@ -3035,7 +3035,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } -- cgit v1.2.3 From 9ae287274817c032a4428fde84d1ab26d6b96761 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:28 -0600 Subject: net: add extack arg to lwtunnel build state Pass extack arg down to lwtunnel_build_state and the build_state callbacks. Add messages for failures in lwtunnel_build_state, and add the extarg to nla_parse where possible in the build_state callbacks. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 10 ++++++++++ include/net/lwtunnel.h | 9 ++++++--- net/core/lwt_bpf.c | 5 +++-- net/core/lwtunnel.c | 20 +++++++++++++++++--- net/ipv4/fib_lookup.h | 3 ++- net/ipv4/fib_semantics.c | 20 +++++++++++--------- net/ipv4/fib_trie.c | 2 +- net/ipv4/ip_tunnel_core.c | 11 +++++++---- net/ipv6/ila/ila_lwt.c | 5 +++-- net/ipv6/route.c | 2 +- net/ipv6/seg6_iptunnel.c | 5 +++-- net/mpls/mpls_iptunnel.c | 5 +++-- 12 files changed, 67 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index a68aad484c69..8664fd26eb5d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -102,6 +102,16 @@ struct netlink_ext_ack { (extack)->bad_attr = (attr); \ } while (0) +#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ + static const char __msg[] = (msg); \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->_msg = __msg; \ + __extack->bad_attr = (attr); \ + } \ +} while (0) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index ca6f002774ef..7c26863b8cf4 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -35,7 +35,8 @@ struct lwtunnel_state { struct lwtunnel_encap_ops { int (*build_state)(struct nlattr *encap, unsigned int family, const void *cfg, - struct lwtunnel_state **ts); + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack); void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); @@ -114,7 +115,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, - struct lwtunnel_state **lws); + struct lwtunnel_state **lws, + struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); @@ -192,7 +194,8 @@ static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, static inline int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, - struct lwtunnel_state **lws) + struct lwtunnel_state **lws, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index b3bc0a31af9f..1307731ddfe4 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -240,7 +240,8 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { static int bpf_build_state(struct nlattr *nla, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_BPF_MAX + 1]; struct lwtunnel_state *newts; @@ -250,7 +251,7 @@ static int bpf_build_state(struct nlattr *nla, if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; - ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL); + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); if (ret < 0) return ret; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index ab840386a74d..d9cb3532f1dd 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -103,25 +103,39 @@ EXPORT_SYMBOL(lwtunnel_encap_del_ops); int lwtunnel_build_state(u16 
encap_type, struct nlattr *encap, unsigned int family, - const void *cfg, struct lwtunnel_state **lws) + const void *cfg, struct lwtunnel_state **lws, + struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; + bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || - encap_type > LWTUNNEL_ENCAP_MAX) + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "Unknown LWT encapsulation type"); return ret; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) { - ret = ops->build_state(encap, family, cfg, lws); + found = true; + ret = ops->build_state(encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } rcu_read_unlock(); + /* don't rely on -EOPNOTSUPP to detect match as build_state + * handlers could return it + */ + if (!found) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "LWT encapsulation type not supported"); + } + return ret; } EXPORT_SYMBOL(lwtunnel_build_state); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 2704e08545da..769ab87ebc4b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -30,7 +30,8 @@ static inline void fib_alias_accessed(struct fib_alias *fa) void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index de9484658232..2157dc08c407 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -532,7 +532,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, ret = lwtunnel_build_state(nla_get_u16( nla_entype), nla, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (ret) goto errout; nexthop_nh->nh_lwtstate = @@ -614,7 +614,8 @@ static inline void fib_add_weight(struct fib_info *fi, static int fib_encap_match(u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, - const struct fib_config *cfg) + const struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; @@ -622,8 +623,8 @@ static int fib_encap_match(u16 encap_type, if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; - ret = lwtunnel_build_state(encap_type, encap, - AF_INET, cfg, &lwtstate); + ret = lwtunnel_build_state(encap_type, encap, AF_INET, + cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); lwtstate_free(lwtstate); @@ -632,7 +633,8 @@ static int fib_encap_match(u16 encap_type, return result; } -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; @@ -644,9 +646,9 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_oif || cfg->fc_gw) { if (cfg->fc_encap) { - if (fib_encap_match(cfg->fc_encap_type, - cfg->fc_encap, fi->fib_nh, cfg)) - return 1; + if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, + fi->fib_nh, cfg, extack)) + return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == 
fi->fib_nh->nh_gw)) @@ -1148,7 +1150,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (err) goto failure; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a624d380c81d..d56659e97a6e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1562,7 +1562,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { + fib_nh_match(cfg, fi, extack) == 0) { fa_to_delete = fa; break; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index baf196eaf1d8..90e11479c725 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -228,14 +228,16 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, + extack); if (err < 0) return err; @@ -325,7 +327,8 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { static int ip6_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; @@ -333,7 +336,7 @@ static int ip6_tun_build_state(struct nlattr *attr, int err; err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, - NULL); + extack); if (err < 0) return err; diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index f4a413aba423..0c02a09bc351 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -117,7 +117,8 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { static int ila_build_state(struct nlattr *nla, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ila_lwt *ilwt; struct ila_params *p; @@ -146,7 +147,7 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, NULL); + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 524a76b5206e..9d9b5bbea153 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1939,7 +1939,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET6, cfg, - &lwtstate); + &lwtstate, extack); if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 6a495490d43e..264d772d3c7d 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -326,7 +326,8 @@ drop: static int seg6_build_state(struct nlattr *nla, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1]; struct seg6_iptunnel_encap *tuninfo; @@ -336,7 +337,7 @@ static 
int seg6_build_state(struct nlattr *nla, int err; err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, - seg6_iptunnel_policy, NULL); + seg6_iptunnel_policy, extack); if (err < 0) return err; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 369c7a23c86c..15e1aa708e50 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -159,7 +159,8 @@ drop: static int mpls_build_state(struct nlattr *nla, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct mpls_iptunnel_encap *tun_encap_info; struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; @@ -168,7 +169,7 @@ static int mpls_build_state(struct nlattr *nla, int ret; ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, - mpls_iptunnel_policy, NULL); + mpls_iptunnel_policy, extack); if (ret < 0) return ret; -- cgit v1.2.3 From a1f10abe12b6d70f8b02dedccb48c9d234a57b67 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:29 -0600 Subject: net: Fill in extack for mpls lwt encap Fill in extack for errors in build_state for mpls lwt encap including passing extack to nla_get_labels and adding error messages for failures in it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 49 ++++++++++++++++++++++++++++++++++++++---------- net/mpls/internal.h | 2 +- net/mpls/mpls_iptunnel.c | 12 +++++++----- 3 files changed, 47 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 257ec66009da..f3830951fb1c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -728,8 +728,8 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, goto errout; if (newdst) { - err = nla_get_labels(newdst, max_labels, - &nh->nh_labels, nh->nh_label); + err = nla_get_labels(newdst, max_labels, &nh->nh_labels, + nh->nh_label, NULL); if (err) goto errout; } @@ -782,7 +782,8 @@ static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, nla = nla_find(attrs, attrlen, RTA_NEWDST); if (nla && - nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL) != 0) + nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, + NULL, NULL) != 0) return 0; *max_labels = max_t(u8, *max_labels, n_labels); @@ -1541,8 +1542,8 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, } EXPORT_SYMBOL_GPL(nla_put_labels); -int nla_get_labels(const struct nlattr *nla, - u8 max_labels, u8 *labels, u32 label[]) +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, + u32 label[], struct netlink_ext_ack *extack) { unsigned len = nla_len(nla); struct mpls_shim_hdr *nla_label; @@ -1553,13 +1554,18 @@ int nla_get_labels(const struct nlattr *nla, /* len needs to be an even multiple of 4 (the label size). Number * of labels is a u8 so check for overflow. */ - if (len & 3 || len / 4 > 255) + if (len & 3 || len / 4 > 255) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid length for labels attribute"); return -EINVAL; + } /* Limit the number of new labels allowed */ nla_labels = len/4; - if (nla_labels > max_labels) + if (nla_labels > max_labels) { + NL_SET_ERR_MSG(extack, "Too many labels"); return -EINVAL; + } /* when label == NULL, caller wants number of labels */ if (!label) @@ -1574,8 +1580,29 @@ int nla_get_labels(const struct nlattr *nla, /* Ensure the bottom of stack flag is properly set * and ttl and tc are both clear. 
*/ - if ((dec.bos != bos) || dec.ttl || dec.tc) + if (dec.ttl) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "TTL in label must be 0"); + return -EINVAL; + } + + if (dec.tc) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Traffic class in label must be 0"); return -EINVAL; + } + + if (dec.bos != bos) { + NL_SET_BAD_ATTR(extack, nla); + if (bos) { + NL_SET_ERR_MSG(extack, + "BOS bit must be set in first label"); + } else { + NL_SET_ERR_MSG(extack, + "BOS bit can only be set in first label"); + } + return -EINVAL; + } switch (dec.label) { case MPLS_LABEL_IMPLNULL: @@ -1583,6 +1610,8 @@ int nla_get_labels(const struct nlattr *nla, * assign and distribute, but which never * actually appears in the encapsulation. */ + NL_SET_ERR_MSG_ATTR(extack, nla, + "Implicit NULL Label (3) can not be used in encapsulation"); return -EINVAL; } @@ -1696,14 +1725,14 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, case RTA_NEWDST: if (nla_get_labels(nla, MAX_NEW_LABELS, &cfg->rc_output_labels, - cfg->rc_output_label)) + cfg->rc_output_label, NULL)) goto errout; break; case RTA_DST: { u8 label_count; if (nla_get_labels(nla, 1, &label_count, - &cfg->rc_label)) + &cfg->rc_label, NULL)) goto errout; /* Reserved labels may not be set */ diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 4db6a5971322..e59f299ceb3f 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -203,7 +203,7 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, - u32 label[]); + u32 label[], struct netlink_ext_ack *extack); int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, u8 via[]); bool mpls_output_possible(const struct net_device *dev); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 15e1aa708e50..6e558a419f60 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -173,13 +173,14 @@ static int mpls_build_state(struct nlattr *nla, if (ret < 0) return ret; - if (!tb[MPLS_IPTUNNEL_DST]) + if (!tb[MPLS_IPTUNNEL_DST]) { + NL_SET_ERR_MSG(extack, "MPLS_IPTUNNEL_DST attribute is missing"); return -EINVAL; - + } /* determine number of labels */ - if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], - MAX_NEW_LABELS, &n_labels, NULL)) + if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &n_labels, NULL, extack)) return -EINVAL; newts = lwtunnel_state_alloc(sizeof(*tun_encap_info) + @@ -189,7 +190,8 @@ static int mpls_build_state(struct nlattr *nla, tun_encap_info = mpls_lwtunnel_encap(newts); ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels, - &tun_encap_info->labels, tun_encap_info->label); + &tun_encap_info->labels, tun_encap_info->label, + extack); if (ret) goto errout; -- cgit v1.2.3 From b7b386f42f079b25b942c756820e36c6bd09b2ca Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:30 -0600 Subject: net: mpls: Pull common label check into helper mpls_route_add and mpls_route_del have the same checks on the label. Move to a helper. Avoid duplicate extack messages in the next patch. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/mpls/af_mpls.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index f3830951fb1c..726eafecc793 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -856,6 +856,19 @@ errout: return err; } +static bool mpls_label_ok(struct net *net, unsigned int index) +{ + /* Reserved labels may not be set */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) + return false; + + /* The full 20 bit range may not be supported. */ + if (index >= net->mpls.platform_labels) + return false; + + return true; +} + static int mpls_route_add(struct mpls_route_config *cfg) { struct mpls_route __rcu **platform_label; @@ -875,12 +888,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) index = find_free_label(net); } - /* Reserved labels may not be set */ - if (index < MPLS_LABEL_FIRST_UNRESERVED) - goto errout; - - /* The full 20 bit range may not be supported. */ - if (index >= net->mpls.platform_labels) + if (!mpls_label_ok(net, index)) goto errout; /* Append makes no sense with mpls */ @@ -952,12 +960,7 @@ static int mpls_route_del(struct mpls_route_config *cfg) index = cfg->rc_label; - /* Reserved labels may not be removed */ - if (index < MPLS_LABEL_FIRST_UNRESERVED) - goto errout; - - /* The full 20 bit range may not be supported */ - if (index >= net->mpls.platform_labels) + if (!mpls_label_ok(net, index)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); @@ -1735,10 +1738,9 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, &cfg->rc_label, NULL)) goto errout; - /* Reserved labels may not be set */ - if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED) + if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, + cfg->rc_label)) goto errout; - break; } case RTA_VIA: -- cgit v1.2.3 From 074350e2ebcab9cf62528984a617cc0a05b24fff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:31 -0600 Subject: net: mpls: Add extack messages for route add and delete failures Add error messages for failures in adding and deleting mpls routes. This covers most of the annoying EINVAL errors. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/mpls/af_mpls.c | 125 ++++++++++++++++++++++++++++++++++++---------------- net/mpls/internal.h | 2 +- 2 files changed, 87 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 726eafecc793..0133d1ad9032 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -720,7 +720,8 @@ errout: static int mpls_nh_build(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif, struct nlattr *via, - struct nlattr *newdst, u8 max_labels) + struct nlattr *newdst, u8 max_labels, + struct netlink_ext_ack *extack) { int err = -ENOMEM; @@ -729,14 +730,14 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, if (newdst) { err = nla_get_labels(newdst, max_labels, &nh->nh_labels, - nh->nh_label, NULL); + nh->nh_label, extack); if (err) goto errout; } if (via) { err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, - __mpls_nh_via(rt, nh)); + __mpls_nh_via(rt, nh), extack); if (err) goto errout; } else { @@ -803,7 +804,8 @@ static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, } static int mpls_nh_build_multi(struct mpls_route_config *cfg, - struct mpls_route *rt, u8 max_labels) + struct mpls_route *rt, u8 max_labels, + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; @@ -837,7 +839,7 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, rtnh->rtnh_ifindex, nla_via, nla_newdst, - max_labels); + max_labels, extack); if (err) goto errout; @@ -856,20 +858,28 @@ errout: return err; } -static bool mpls_label_ok(struct net *net, unsigned int index) +static bool mpls_label_ok(struct net *net, unsigned int index, + struct netlink_ext_ack *extack) { /* Reserved labels may not be set */ - if (index < MPLS_LABEL_FIRST_UNRESERVED) + if (index < MPLS_LABEL_FIRST_UNRESERVED) { + NL_SET_ERR_MSG(extack, + "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); return false; + } /* The full 20 bit range may not be supported. 
*/ - if (index >= net->mpls.platform_labels) + if (index >= net->mpls.platform_labels) { + NL_SET_ERR_MSG(extack, + "Label >= configured maximum in platform_labels"); return false; + } return true; } -static int mpls_route_add(struct mpls_route_config *cfg) +static int mpls_route_add(struct mpls_route_config *cfg, + struct netlink_ext_ack *extack) { struct mpls_route __rcu **platform_label; struct net *net = cfg->rc_nlinfo.nl_net; @@ -888,13 +898,15 @@ static int mpls_route_add(struct mpls_route_config *cfg) index = find_free_label(net); } - if (!mpls_label_ok(net, index)) + if (!mpls_label_ok(net, index, extack)) goto errout; /* Append makes no sense with mpls */ err = -EOPNOTSUPP; - if (cfg->rc_nlflags & NLM_F_APPEND) + if (cfg->rc_nlflags & NLM_F_APPEND) { + NL_SET_ERR_MSG(extack, "MPLS does not support route append"); goto errout; + } err = -EEXIST; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -921,8 +933,10 @@ static int mpls_route_add(struct mpls_route_config *cfg) nhs = 1; } - if (nhs == 0) + if (nhs == 0) { + NL_SET_ERR_MSG(extack, "Route does not contain a nexthop"); goto errout; + } err = -ENOMEM; rt = mpls_rt_alloc(nhs, max_via_alen, max_labels); @@ -936,7 +950,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) - err = mpls_nh_build_multi(cfg, rt, max_labels); + err = mpls_nh_build_multi(cfg, rt, max_labels, extack); else err = mpls_nh_build_from_cfg(cfg, rt); if (err) @@ -952,7 +966,8 @@ errout: return err; } -static int mpls_route_del(struct mpls_route_config *cfg) +static int mpls_route_del(struct mpls_route_config *cfg, + struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; unsigned index; @@ -960,7 +975,7 @@ static int mpls_route_del(struct mpls_route_config *cfg) index = cfg->rc_label; - if (!mpls_label_ok(net, index)) + if (!mpls_label_ok(net, index, extack)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); @@ -1626,19 +1641,25 @@ out: } EXPORT_SYMBOL_GPL(nla_get_labels); -int nla_get_via(const struct nlattr *nla, u8 *via_alen, - u8 *via_table, u8 via_addr[]) +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, + u8 via_addr[], struct netlink_ext_ack *extack) { struct rtvia *via = nla_data(nla); int err = -EINVAL; int alen; - if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid attribute length for RTA_VIA"); goto errout; + } alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); - if (alen > MAX_VIA_ALEN) + if (alen > MAX_VIA_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid address length for RTA_VIA"); goto errout; + } /* Validate the address family */ switch (via->rtvia_family) { @@ -1668,8 +1689,10 @@ errout: return err; } -static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, - struct mpls_route_config *cfg) +static int rtm_to_route_config(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct mpls_route_config *cfg, + struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; @@ -1677,35 +1700,54 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, - NULL); + extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); - if (rtm->rtm_family != AF_MPLS) + if (rtm->rtm_family != AF_MPLS) { + NL_SET_ERR_MSG(extack, "Invalid address family in rtmsg"); goto 
errout; - if (rtm->rtm_dst_len != 20) + } + if (rtm->rtm_dst_len != 20) { + NL_SET_ERR_MSG(extack, "rtm_dst_len must be 20 for MPLS"); goto errout; - if (rtm->rtm_src_len != 0) + } + if (rtm->rtm_src_len != 0) { + NL_SET_ERR_MSG(extack, "rtm_src_len must be 0 for MPLS"); goto errout; - if (rtm->rtm_tos != 0) + } + if (rtm->rtm_tos != 0) { + NL_SET_ERR_MSG(extack, "rtm_tos must be 0 for MPLS"); goto errout; - if (rtm->rtm_table != RT_TABLE_MAIN) + } + if (rtm->rtm_table != RT_TABLE_MAIN) { + NL_SET_ERR_MSG(extack, + "MPLS only supports the main route table"); goto errout; + } /* Any value is acceptable for rtm_protocol */ /* As mpls uses destination specific addresses * (or source specific address in the case of multicast) * all addresses have universal scope. */ - if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) + if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) { + NL_SET_ERR_MSG(extack, + "Invalid route scope - MPLS only supports UNIVERSE"); goto errout; - if (rtm->rtm_type != RTN_UNICAST) + } + if (rtm->rtm_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, + "Invalid route type - MPLS only supports UNICAST"); goto errout; - if (rtm->rtm_flags != 0) + } + if (rtm->rtm_flags != 0) { + NL_SET_ERR_MSG(extack, "rtm_flags must be 0 for MPLS"); goto errout; + } cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; @@ -1728,25 +1770,26 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, case RTA_NEWDST: if (nla_get_labels(nla, MAX_NEW_LABELS, &cfg->rc_output_labels, - cfg->rc_output_label, NULL)) + cfg->rc_output_label, extack)) goto errout; break; case RTA_DST: { u8 label_count; if (nla_get_labels(nla, 1, &label_count, - &cfg->rc_label, NULL)) + &cfg->rc_label, extack)) goto errout; if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, - cfg->rc_label)) + cfg->rc_label, extack)) goto errout; break; } case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, - &cfg->rc_via_table, cfg->rc_via)) + &cfg->rc_via_table, cfg->rc_via, + extack)) goto errout; break; } @@ -1760,14 +1803,18 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, { u8 ttl_propagate = nla_get_u8(nla); - if (ttl_propagate > 1) + if (ttl_propagate > 1) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "RTA_TTL_PROPAGATE can only be 0 or 1"); goto errout; + } cfg->rc_ttl_propagate = ttl_propagate ? 
MPLS_TTL_PROP_ENABLED : MPLS_TTL_PROP_DISABLED; break; } default: + NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute"); /* Unsupported attribute */ goto errout; } @@ -1788,11 +1835,11 @@ static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (!cfg) return -ENOMEM; - err = rtm_to_route_config(skb, nlh, cfg); + err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; - err = mpls_route_del(cfg); + err = mpls_route_del(cfg, extack); out: kfree(cfg); @@ -1810,11 +1857,11 @@ static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (!cfg) return -ENOMEM; - err = rtm_to_route_config(skb, nlh, cfg); + err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; - err = mpls_route_add(cfg); + err = mpls_route_add(cfg, extack); out: kfree(cfg); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index e59f299ceb3f..a015a6a1143b 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -205,7 +205,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack); int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, - u8 via[]); + u8 via[], struct netlink_ext_ack *extack); bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); -- cgit v1.2.3 From d4e7256007b0763614a41c9c0a5a943dc1095b22 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:32 -0600 Subject: net: mpls: Make nla_get_via in af_mpls.c nla_get_via is only used in af_mpls.c. Remove declaration from internal.h and move up in af_mpls.c before first use. Code move only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/mpls/af_mpls.c | 96 ++++++++++++++++++++++++++--------------------------- net/mpls/internal.h | 2 -- 2 files changed, 48 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0133d1ad9032..a953fcf169ba 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -684,6 +684,54 @@ errout: return err; } +static int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, + u8 via_addr[], struct netlink_ext_ack *extack) +{ + struct rtvia *via = nla_data(nla); + int err = -EINVAL; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid attribute length for RTA_VIA"); + goto errout; + } + alen = nla_len(nla) - + offsetof(struct rtvia, rtvia_addr); + if (alen > MAX_VIA_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid address length for RTA_VIA"); + goto errout; + } + + /* Validate the address family */ + switch (via->rtvia_family) { + case AF_PACKET: + *via_table = NEIGH_LINK_TABLE; + break; + case AF_INET: + *via_table = NEIGH_ARP_TABLE; + if (alen != 4) + goto errout; + break; + case AF_INET6: + *via_table = NEIGH_ND_TABLE; + if (alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(via_addr, via->rtvia_addr, alen); + *via_alen = alen; + err = 0; + +errout: + return err; +} + static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, struct mpls_route *rt) { @@ -1641,54 +1689,6 @@ out: } EXPORT_SYMBOL_GPL(nla_get_labels); -int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, - u8 via_addr[], struct netlink_ext_ack *extack) -{ - struct rtvia *via = nla_data(nla); - int err = -EINVAL; - int alen; - - if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "Invalid attribute length for RTA_VIA"); - goto errout; - } - alen = nla_len(nla) - - offsetof(struct rtvia, rtvia_addr); - if (alen > MAX_VIA_ALEN) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "Invalid address length for RTA_VIA"); - goto errout; - } - - /* Validate the address family */ - switch (via->rtvia_family) { - case AF_PACKET: - *via_table = NEIGH_LINK_TABLE; - break; - case AF_INET: - *via_table = NEIGH_ARP_TABLE; - if (alen != 4) - goto errout; - break; - case AF_INET6: - *via_table = NEIGH_ND_TABLE; - if (alen != 16) - goto errout; - break; - default: - /* Unsupported address family */ - goto errout; - } - - memcpy(via_addr, via->rtvia_addr, alen); - *via_alen = alen; - err = 0; - -errout: - return err; -} - static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg, diff --git a/net/mpls/internal.h b/net/mpls/internal.h index a015a6a1143b..cf65aec2e551 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -204,8 +204,6 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack); -int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, - u8 via[], struct netlink_ext_ack *extack); bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); -- cgit v1.2.3 From e1af005b1ce6d33c0467808ddceb09a8a1a424ba Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 27 May 2017 16:19:33 -0600 Subject: net: mpls: remove unnecessary initialization of err err is 
initialized to EINVAL and not used before it is set again. Remove the unnecessary initialization. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a953fcf169ba..94b3317232a6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -743,8 +743,6 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, if (!nh) return -ENOMEM; - err = -EINVAL; - nh->nh_labels = cfg->rc_output_labels; for (i = 0; i < nh->nh_labels; i++) nh->nh_label[i] = cfg->rc_output_label[i]; -- cgit v1.2.3 From f3d736c478adbe46234c35b2ef9b55a11dd01d7c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 26 May 2017 18:12:42 -0400 Subject: net: dsa: remove dsa_port_is_bridged The helper is only used once and makes the code more complicated that it should. Remove it and reorganize the variables so that it fits on 80 columns. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 887e26695519..0442b6bf52fa 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -69,18 +69,13 @@ static int dsa_slave_get_iflink(const struct net_device *dev) return p->dp->ds->dst->master_netdev->ifindex; } -static inline bool dsa_port_is_bridged(struct dsa_port *dp) -{ - return !!dp->bridge_dev; -} - static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->dp->ds->dst->master_netdev; - struct dsa_switch *ds = p->dp->ds; - u8 stp_state = dsa_port_is_bridged(p->dp) ? - BR_STATE_BLOCKING : BR_STATE_FORWARDING; + struct dsa_port *dp = p->dp; + struct dsa_switch *ds = dp->ds; + struct net_device *master = ds->dst->master_netdev; + u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; if (!(master->flags & IFF_UP)) -- cgit v1.2.3 From 6ea44adce91526700535b3150f77f8639ae8c82d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 May 2017 17:00:32 +1000 Subject: SUNRPC: ensure correct error is reported by xs_tcp_setup_socket() If you attempt a TCP mount from an host that is unreachable in a way that triggers an immediate error from kernel_connect(), that error does not propagate up, instead EAGAIN is reported. This results in call_connect_status receiving the wrong error. A case that it easy to demonstrate is to attempt to mount from an address that results in ENETUNREACH, but first deleting any default route. Without this patch, the mount.nfs process is persistently runnable and is hard to kill. With this patch it exits as it should. The problem is caused by the fact that xs_tcp_force_close() eventually calls xprt_wake_pending_tasks(xprt, -EAGAIN); which causes an error return of -EAGAIN. so when xs_tcp_setup_sock() calls xprt_wake_pending_tasks(xprt, status); the status is ignored. 
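The ordering problem described above can be reduced to a small user-space toy model. This is purely illustrative: wake_pending_tasks() and force_close() below are stand-ins for xprt_wake_pending_tasks() and xs_tcp_force_close(), not the SUNRPC API, and the only point is that whichever wake-up happens first decides which error the waiting task sees.

	#include <errno.h>
	#include <stdio.h>

	static int task_status;
	static int task_woken;

	static void wake_pending_tasks(int status)
	{
		if (task_woken)			/* already woken: later status is ignored */
			return;
		task_status = status;
		task_woken = 1;
	}

	static void force_close(void)
	{
		wake_pending_tasks(-EAGAIN);	/* close path wakes with a generic error */
	}

	int main(void)
	{
		int status = -ENETUNREACH;	/* pretend kernel_connect() failed */

		force_close();			/* broken order: close first ...      */
		wake_pending_tasks(status);	/* ... so the real status is ignored  */
		printf("broken order: task sees %d\n", task_status);

		task_woken = 0;			/* reset the toy state */
		wake_pending_tasks(status);	/* fixed order: report real status first */
		force_close();
		printf("fixed order:  task sees %d\n", task_status);
		return 0;
	}

With the broken order the task only ever sees -EAGAIN; with the fixed order it sees the real -ENETUNREACH, which is the behaviour the patch below restores.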
Fixes: 4efdd92c9211 ("SUNRPC: Remove TCP client connection reset hack") Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 16aff8ddc16f..d5b54c020dec 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2432,7 +2432,12 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: - /* retry with existing socket, after a delay */ + /* + * xs_tcp_force_close() wakes tasks with -EIO. + * We need to wake them first to ensure the + * correct error code. + */ + xprt_wake_pending_tasks(xprt, status); xs_tcp_force_close(xprt); goto out; } -- cgit v1.2.3 From 15e5651525c2e580a523568ed207e4a3fb11cc33 Mon Sep 17 00:00:00 2001 From: Douglas Caetano dos Santos Date: Fri, 26 May 2017 14:28:00 -0300 Subject: tcp: reinitialize MTU probing when setting MSS in a TCP repair MTU probing initialization occurred only at connect() and at SYN or SYN-ACK reception, but the former sets MSS to either the default or the user set value (through TCP_MAXSEG sockopt) and the latter never happens with repaired sockets. The result was that, with MTU probing enabled and unless TCP_MAXSEG sockopt was used before connect(), probing would be stuck at tcp_base_mss value until tcp_probe_interval seconds have passed. Signed-off-by: Douglas Caetano dos Santos Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 59792d283ff8..b5ea036ca781 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2381,9 +2381,10 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct tcp_sock *tp, +static int tcp_repair_options_est(struct sock *sk, struct tcp_repair_opt __user *optbuf, unsigned int len) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; while (len >= sizeof(opt)) { @@ -2396,6 +2397,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp, switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; + tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { @@ -2555,7 +2557,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(tp, + err = tcp_repair_options_est(sk, (struct tcp_repair_opt __user *)optval, optlen); else -- cgit v1.2.3 From 23c9ee4934e7a79b49151d0f05c24117d69c73fe Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 26 May 2017 18:12:51 -0400 Subject: net: dsa: remove dev arg of dsa_register_switch The current dsa_register_switch function takes a useless struct device pointer argument, which always equals ds->dev. Drivers either call it with ds->dev, or with the same device pointer passed to dsa_switch_alloc, which ends up being assigned to ds->dev. This patch removes the second argument of the dsa_register_switch and _dsa_register_switch functions. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- drivers/net/dsa/dsa_loop.c | 2 +- drivers/net/dsa/lan9303-core.c | 2 +- drivers/net/dsa/mt7530.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/dsa/qca8k.c | 2 +- include/net/dsa.h | 2 +- net/dsa/dsa2.c | 10 +++++----- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 6a5648a9cb09..e68d368e20ac 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1989,7 +1989,7 @@ int b53_switch_register(struct b53_device *dev) pr_info("found switch: %s, rev %i\n", dev->name, dev->core_rev); - return dsa_register_switch(dev->ds, dev->ds->dev); + return dsa_register_switch(dev->ds); } EXPORT_SYMBOL(b53_switch_register); diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 5edf07beb9d2..79e62593ff4e 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -271,7 +271,7 @@ static int dsa_loop_drv_probe(struct mdio_device *mdiodev) dev_set_drvdata(&mdiodev->dev, ds); - return dsa_register_switch(ds, ds->dev); + return dsa_register_switch(ds); } static void dsa_loop_drv_remove(struct mdio_device *mdiodev) diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index c8b2423c8ef7..cd76e61f1fca 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -802,7 +802,7 @@ static int lan9303_register_switch(struct lan9303 *chip) chip->ds->ops = &lan9303_switch_ops; chip->ds->phys_mii_mask = chip->phy_addr_sel_strap ? 0xe : 0x7; - return dsa_register_switch(chip->ds, chip->dev); + return dsa_register_switch(chip->ds); } static void lan9303_probe_reset_gpio(struct lan9303 *chip, diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 4d2f45153ede..25e00d5e0eec 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1080,7 +1080,7 @@ mt7530_probe(struct mdio_device *mdiodev) mutex_init(&priv->reg_mutex); dev_set_drvdata(&mdiodev->dev, priv); - return dsa_register_switch(priv->ds, &mdiodev->dev); + return dsa_register_switch(priv->ds); } static void diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 070e82ac6132..7cf470c3e662 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3884,7 +3884,7 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip) dev_set_drvdata(dev, ds); - return dsa_register_switch(ds, dev); + return dsa_register_switch(ds); } static void mv88e6xxx_unregister_switch(struct mv88e6xxx_chip *chip) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 0f6a011d8ed1..b3bee7eab45f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -958,7 +958,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev) mutex_init(&priv->reg_mutex); dev_set_drvdata(&mdiodev->dev, priv); - return dsa_register_switch(priv->ds, &mdiodev->dev); + return dsa_register_switch(priv->ds); } static void diff --git a/include/net/dsa.h b/include/net/dsa.h index c0e567c0c824..d9bd6939229a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -480,7 +480,7 @@ static inline bool netdev_uses_dsa(struct net_device *dev) struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); -int dsa_register_switch(struct dsa_switch *ds, struct device *dev); +int dsa_register_switch(struct dsa_switch *ds); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch 
*ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4301f52e4f5a..c0a4576db4a2 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -686,10 +686,10 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds, return ports; } -static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev) +static int _dsa_register_switch(struct dsa_switch *ds) { - struct dsa_chip_data *pdata = dev->platform_data; - struct device_node *np = dev->of_node; + struct dsa_chip_data *pdata = ds->dev->platform_data; + struct device_node *np = ds->dev->of_node; struct dsa_switch_tree *dst; struct device_node *ports; u32 tree, index; @@ -803,12 +803,12 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) } EXPORT_SYMBOL_GPL(dsa_switch_alloc); -int dsa_register_switch(struct dsa_switch *ds, struct device *dev) +int dsa_register_switch(struct dsa_switch *ds) { int err; mutex_lock(&dsa2_mutex); - err = _dsa_register_switch(ds, dev); + err = _dsa_register_switch(ds); mutex_unlock(&dsa2_mutex); return err; -- cgit v1.2.3 From 8c6c918da16f55fd4a5df53aa072ba3238f12fec Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 30 May 2017 10:07:02 -0400 Subject: rtnetlink: use the new rtnl_get_event() interface Small clean-up to rtmsg_ifinfo() to use the rtnl_get_event() interface instead of using 'internal' values directly. Signed-off-by: Vladislav Yasevich Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9da53e43750c..7084f1db2446 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2896,7 +2896,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, IFLA_EVENT_NONE, flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); } EXPORT_SYMBOL(rtmsg_ifinfo); -- cgit v1.2.3 From c2e8471d98f072f0f18acdd7df5b401ed0d29a2c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 30 May 2017 23:50:36 -0700 Subject: mpls: fix clearing of dead nh_flags on link up recent fixes to use WRITE_ONCE for nh_flags on link up, accidently ended up leaving the deadflags on a nh. This patch fixes the WRITE_ONCE to use freshly evaluated nh_flags. Fixes: 39eb8cd17588 ("net: mpls: rt_nhn_alive and nh_flags should be accessed using READ_ONCE") Reported-by: Satish Ashok Signed-off-by: Roopa Prabhu Acked-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 257ec66009da..7b05fd1497ce 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1418,7 +1418,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int flags) continue; alive++; nh_flags &= ~flags; - WRITE_ONCE(nh->nh_flags, flags); + WRITE_ONCE(nh->nh_flags, nh_flags); } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); -- cgit v1.2.3 From 50bbfed96766acb8b9e1b299e3daae5187c3eb5a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:31 -0700 Subject: bpf: track stack depth of classic bpf programs To track stack depth of classic bpf programs we only need to analyze ST|STX instructions, since check_load_and_stores() verifies that programs can load from stack only after write. 
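A worked example may help here. The fragment below is illustrative only (it uses the uapi macros from <linux/filter.h> and is not taken from the patch): the two BPF_ST stores are the only instructions that need to be inspected, and with the slot-to-offset mapping introduced further down (stack_off = k * 4 + 4, i.e. M[0] -> FP - 4 and M[1] -> FP - 8) they pin the stack depth at 8 bytes; the later load needs no tracking because a store to the same slot is guaranteed to precede it.

	#include <linux/filter.h>

	/* Classic BPF scratch usage: only ST/STX matter for stack depth. */
	static struct sock_filter example_prog[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 0),	/* A = pkt[0..3]                */
		BPF_STMT(BPF_ST, 0),			/* M[0] = A  -> depth >= 4      */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 4),	/* A = pkt[4..7]                */
		BPF_STMT(BPF_ST, 1),			/* M[1] = A  -> depth >= 8      */
		BPF_STMT(BPF_LD | BPF_MEM, 1),		/* A = M[1], no tracking needed */
		BPF_STMT(BPF_RET | BPF_A, 0),		/* return A                     */
	};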
We also need to change the way cBPF stack slots map to eBPF stack, since typical classic programs are using slots 0 and 1, so they need to map to stack offsets -4 and -8 respectively in order to take advantage of small stack interpreter and JITs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a6bb95fa87b2..946f758d44f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -352,7 +352,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program - * @new_prog: buffer where converted program will be stored + * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' @@ -364,14 +364,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len) { - int new_flen = 0, pass = 0, target, i; - struct bpf_insn *new_insn; + int new_flen = 0, pass = 0, target, i, stack_off; + struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -383,6 +382,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { + first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) @@ -390,11 +390,11 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, } do_pass: - new_insn = new_prog; + new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ - if (new_insn) { + if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ @@ -415,7 +415,7 @@ do_pass: struct bpf_insn *insn = tmp_insns; if (addrs) - addrs[i] = new_insn - new_prog; + addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ @@ -561,17 +561,25 @@ do_pass: /* Store to stack. */ case BPF_ST: case BPF_STX: + stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); + /* check_load_and_stores() verifies that classic BPF can + * load from stack only after write, so tracking + * stack_depth for ST|STX insns is enough + */ + if (new_prog && new_prog->aux->stack_depth < stack_off) + new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: + stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); break; /* A = K or X = K */ @@ -619,13 +627,13 @@ do_pass: if (!new_prog) { /* Only calculating new length. 
*/ - *new_len = new_insn - new_prog; + *new_len = new_insn - first_insn; return 0; } pass++; - if (new_flen != new_insn - new_prog) { - new_flen = new_insn - new_prog; + if (new_flen != new_insn - first_insn) { + new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; @@ -1017,7 +1025,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, -- cgit v1.2.3 From 8b8010fb7876e816300ddd60fa089e9ceb209f3e Mon Sep 17 00:00:00 2001 From: Woojung Huh Date: Wed, 31 May 2017 20:19:06 +0000 Subject: dsa: add support for Microchip KSZ tail tagging Adding support for the Microchip KSZ switch family tail tagging. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Woojung Huh Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/Kconfig | 3 ++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 ++ net/dsa/dsa_priv.h | 3 ++ net/dsa/tag_ksz.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 112 insertions(+) create mode 100644 net/dsa/tag_ksz.c (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d9bd6939229a..7de1234ba136 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -31,6 +31,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_BRCM, DSA_TAG_PROTO_DSA, DSA_TAG_PROTO_EDSA, + DSA_TAG_PROTO_KSZ, DSA_TAG_PROTO_LAN9303, DSA_TAG_PROTO_MTK, DSA_TAG_PROTO_QCA, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 297389b2ab35..cc5f8f971689 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -25,6 +25,9 @@ config NET_DSA_TAG_DSA config NET_DSA_TAG_EDSA bool +config NET_DSA_TAG_KSZ + bool + config NET_DSA_TAG_LAN9303 bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 90e5aa6f7d0f..fcce25da937c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -6,6 +6,7 @@ dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 3288a80d4d6c..402459e73f33 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -49,6 +49,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_EDSA [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_KSZ + [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_LAN9303 [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, #endif diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c1d4180651af..7459d5735d8b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -167,6 +167,9 @@ extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ extern const struct dsa_device_ops edsa_netdev_ops; +/* tag_ksz.c */ +extern const struct dsa_device_ops ksz_netdev_ops; + /* tag_lan9303.c */ extern const struct dsa_device_ops lan9303_netdev_ops; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c new file mode 100644 index 000000000000..0b08a40b493a --- /dev/null +++ b/net/dsa/tag_ksz.c @@ -0,0 +1,101 @@ +/* 
+ * net/dsa/tag_ksz.c - Microchip KSZ Switch tag format handling + * Copyright (c) 2017 Microchip Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +/* For Ingress (Host -> KSZ), 2 bytes are added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : Prioritization (not used now) + * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) + * + * For Egress (KSZ -> Host), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + */ + +#define KSZ_INGRESS_TAG_LEN 2 +#define KSZ_EGRESS_TAG_LEN 1 + +static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct sk_buff *nskb; + int padlen; + u8 *tag; + + padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; + + if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + nskb = skb; + } else { + nskb = alloc_skb(NET_IP_ALIGN + skb->len + + padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC); + if (!nskb) { + kfree_skb(skb); + return NULL; + } + skb_reserve(nskb, NET_IP_ALIGN); + + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, + skb_network_header(skb) - skb->head); + skb_set_transport_header(nskb, + skb_transport_header(skb) - skb->head); + skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); + kfree_skb(skb); + } + + /* skb is freed when it fails */ + if (skb_put_padto(nskb, nskb->len + padlen)) + return NULL; + + tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); + tag[0] = 0; + tag[1] = 1 << p->dp->index; /* destination port */ + + return nskb; +} + +struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 *tag; + int source_port; + + ds = dst->cpu_dp->ds; + + tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + + source_port = tag[0] & 7; + if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + return NULL; + + pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); + + skb->dev = ds->ports[source_port].netdev; + + return skb; +} + +const struct dsa_device_ops ksz_netdev_ops = { + .xmit = ksz_xmit, + .rcv = ksz_rcv, +}; -- cgit v1.2.3 From ba52d61e0ffbb8538d5f07071d38a78afb920176 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 31 May 2017 22:53:25 -0700 Subject: ipv4: route: restore skb_dst_set in inet_rtm_getroute recent updates to inet_rtm_getroute dropped skb_dst_set in inet_rtm_getroute. This patch restores it because it is needed to release the dst correctly. Fixes: 3765d35ed8b9 ("net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup") Reported-by: John Fastabend Signed-off-by: David Ahern Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f1f2e5aaa2d6..9b38cf18144e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2547,8 +2547,9 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, struct rtable *rt) + u32 seq) { + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; @@ -2750,6 +2751,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err) goto errout_free; + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -2763,8 +2765,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_tos, res.fi, 0); else err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - rt); + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); if (err < 0) goto errout_free; -- cgit v1.2.3 From 7212462fa6fdae61f7f40a4ead048def45bb23cb Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 1 Jun 2017 10:00:07 +0200 Subject: netlink: don't send unknown nsid The NETLINK_F_LISTEN_ALL_NSID otion enables to listen all netns that have a nsid assigned into the netns where the netlink socket is opened. The nsid is sent as metadata to userland, but the existence of this nsid is checked only for netns that are different from the socket netns. Thus, if no nsid is assigned to the socket netns, NETNSA_NSID_NOT_ASSIGNED is reported to the userland. This value is confusing and useless. After this patch, only valid nsid are sent to userland. Reported-by: Flavio Leitner Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ee841f00a6ec..7586d446d7dc 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1415,7 +1416,8 @@ static void do_one_broadcast(struct sock *sk, goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); - NETLINK_CB(p->skb2).nsid_is_set = true; + if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) + NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); -- cgit v1.2.3 From 7b954ed7529b6bae8712345dd9d22afa57b96dca Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 1 Jun 2017 13:22:42 +0100 Subject: net: dsa: make function ksz_rcv static function ksz_rcv can be made static as it does not need to be in global scope. Reformat arguments to make it checkpatch warning free too. Cleans up sparse warning: "symbol 'ksz_rcv' was not declared. Should it be static?" Signed-off-by: Colin Ian King Reviewed-by: Woojung Huh Signed-off-by: David S. 
Miller --- net/dsa/tag_ksz.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0b08a40b493a..dfcd2fff5b13 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -72,8 +72,9 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) return nskb; } -struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; -- cgit v1.2.3 From aeb073241fe7a2b932e04e20c60e47718332877f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 1 Jun 2017 18:07:55 +0300 Subject: net: bridge: start hello timer only if device is up When the transition of NO_STP -> KERNEL_STP was fixed by always calling mod_timer in br_stp_start, it introduced a new regression which causes the timer to be armed even when the bridge is down, and since we stop the timers in its ndo_stop() function, they never get disabled if the device is destroyed before it's upped. To reproduce: $ while :; do ip l add br0 type bridge hello_time 100; brctl stp br0 on; ip l del br0; done; CC: Xin Long CC: Ivan Vecera CC: Sebastian Ott Reported-by: Sebastian Ott Fixes: 6d18c732b95c ("bridge: start hello_timer when enabling KERNEL_STP in br_stp_start") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 0db8102995a5..6f12a5271219 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -179,7 +179,8 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ - mod_timer(&br->hello_timer, jiffies + br->hello_time); + if (br->dev->flags & IFF_UP) + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } -- cgit v1.2.3 From e165bc02a02c70e40d5c811c705ba269aeca0497 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 1 Jun 2017 21:26:03 +0200 Subject: mac80211: fix dropped counter in multiqueue RX In the commit enabling per-CPU station statistics, I inadvertedly copy-pasted some code to update rx_packets and forgot to change it to update rx_dropped_misc. Fix that. This addresses https://bugzilla.kernel.org/show_bug.cgi?id=195953. Fixes: c9c5962b56c1 ("mac80211: enable collecting station statistics per-CPU") Reported-by: Petru-Florin Mihancea Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 7cdf7a835bb0..403e3cc58b57 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2155,7 +2155,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); - sinfo->rx_packets += cpurxs->dropped; + sinfo->rx_dropped_misc += cpurxs->dropped; } } -- cgit v1.2.3 From 73a7ece8f70c955464080e434b5324bcdfdcb1b1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 1 Jun 2017 16:07:11 -0400 Subject: net: dsa: comment hot path requirements The DSA layer uses inline helpers and copy of the tagging functions for faster access in hot path. 
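Concretely, the pattern the comments document looks roughly like this (a simplified sketch assuming the tag_ops structure referenced in the hunks that follow, not literal DSA code):

	/* Set-up time: cache the tagger callbacks once, so the hot path
	 * does not have to chase dp->ds->dst->tag_ops on every packet.
	 */
	dst->rcv = dst->tag_ops->rcv;		/* tree-level RX copy */
	p->xmit  = dp->ds->dst->tag_ops->xmit;	/* per-slave TX copy  */

	/* Hot path: a single indirect call through the cached pointer. */
	nskb = p->xmit(skb, dev);

The inline dsa_uses_tagged_protocol() helper works the same way: it only tests dst->rcv, so the fast path never dereferences tag_ops at all.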
Add comments to detail that. Reviewed-by: Florian Fainelli Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa_priv.h | 1 + 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 7de1234ba136..18ca0a935c96 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -127,6 +127,8 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; + + /* Copy of tag_ops->rcv for faster access in hot path */ struct sk_buff * (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, @@ -465,6 +467,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); struct net_device *dsa_dev_to_net_device(struct device *dev); +/* Keep inline for faster access in hot path */ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { return dst->rcv != NULL; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7459d5735d8b..db2a7b9edfb8 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -73,6 +73,7 @@ struct dsa_device_ops { }; struct dsa_slave_priv { + /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); -- cgit v1.2.3 From 02f840cbc9fa9ee147d882edb96b203999c3ac62 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 1 Jun 2017 16:07:12 -0400 Subject: net: dsa: do not cast dst dsa_ptr is not a void pointer anymore since Nov 2011, as of cf50dcc24f82 ("dsa: Change dsa_uses_{dsa, trailer}_tags() into inline functions"), but an explicit dsa_switch_tree pointer, thus remove the (void *) cast. Reviewed-by: Florian Fainelli Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- net/dsa/legacy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c0a4576db4a2..21b44a9828f6 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -454,7 +454,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. */ wmb(); - dst->master_netdev->dsa_ptr = (void *)dst; + dst->master_netdev->dsa_ptr = dst; dst->applied = true; return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ac4379b8d7ac..d70a1a788d17 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -651,7 +651,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. */ wmb(); - dev->dsa_ptr = (void *)dst; + dev->dsa_ptr = dst; return 0; } -- cgit v1.2.3 From 5470979585d81271d4338c2accfd5e71dafa4af6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 1 Jun 2017 16:07:14 -0400 Subject: net: dsa: remove out_drop label in taggers rcv Many rcv functions from net/dsa/tag_*.c have a useless out_drop goto label which simply returns NULL. Kill it in favor of the obvious. Reviewed-by: Florian Fainelli Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/tag_brcm.c | 11 ++++------- net/dsa/tag_dsa.c | 13 +++++-------- net/dsa/tag_edsa.c | 13 +++++-------- net/dsa/tag_mtk.c | 9 +++------ net/dsa/tag_qca.c | 11 ++++------- net/dsa/tag_trailer.c | 9 +++------ 6 files changed, 24 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9f204f18ada3..635ecb6781e4 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -104,27 +104,27 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, ds = dst->cpu_dp->ds; if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) - goto out_drop; + return NULL; /* skb->data points to the EtherType, the tag is right before it */ brcm_tag = skb->data - 2; /* The opcode should never be different than 0b000 */ if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) - goto out_drop; + return NULL; /* We should never see a reserved reason code without knowing how to * handle it */ if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD)) - goto out_drop; + return NULL; /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; /* Validate port against switch setup, either the port is totally */ if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) - goto out_drop; + return NULL; /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); @@ -137,9 +137,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = ds->ports[source_port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops brcm_netdev_ops = { diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 3b62a57956a3..089c99c8ed51 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -79,7 +79,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, int source_port; if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) - goto out_drop; + return NULL; /* * The ethertype field is part of the DSA header. @@ -90,7 +90,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that frame type is either TO_CPU or FORWARD. */ if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) - goto out_drop; + return NULL; /* * Determine source device and port. @@ -103,14 +103,14 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, * port is a registered DSA port. */ if (source_device >= DSA_MAX_SWITCHES) - goto out_drop; + return NULL; ds = dst->ds[source_device]; if (!ds) - goto out_drop; + return NULL; if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) - goto out_drop; + return NULL; /* * Convert the DSA header to an 802.1q header if the 'tagged' @@ -161,9 +161,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = ds->ports[source_port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops dsa_netdev_ops = { diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index f95cafd05702..a7eed1d43d80 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -92,7 +92,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, int source_port; if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) - goto out_drop; + return NULL; /* * Skip the two null bytes after the ethertype. @@ -103,7 +103,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that frame type is either TO_CPU or FORWARD. 
*/ if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0) - goto out_drop; + return NULL; /* * Determine source device and port. @@ -116,14 +116,14 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, * port is a registered DSA port. */ if (source_device >= DSA_MAX_SWITCHES) - goto out_drop; + return NULL; ds = dst->ds[source_device]; if (!ds) - goto out_drop; + return NULL; if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) - goto out_drop; + return NULL; /* * If the 'tagged' bit is set, convert the DSA tag to a 802.1q @@ -180,9 +180,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = ds->ports[source_port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops edsa_netdev_ops = { diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index d1258e84cd71..4b4aaf1574aa 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -57,7 +57,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, __be16 *phdr, hdr; if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) - goto out_drop; + return NULL; /* The MTK header is added by the switch between src addr * and ethertype at this point, skb->data points to 2 bytes @@ -79,19 +79,16 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, */ ds = dst->ds[0]; if (!ds) - goto out_drop; + return NULL; /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); if (!ds->ports[port].netdev) - goto out_drop; + return NULL; skb->dev = ds->ports[port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops mtk_netdev_ops = { diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 2451007699b7..44f545d2761a 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -77,7 +77,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, __be16 *phdr, hdr; if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) - goto out_drop; + return NULL; /* The QCA header is added by the switch between src addr and Ethertype * At this point, skb->data points to ethertype so header should be @@ -89,7 +89,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Make sure the version is correct */ ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S; if (unlikely(ver != QCA_HDR_VERSION)) - goto out_drop; + return NULL; /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); @@ -101,20 +101,17 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, */ ds = dst->cpu_dp->ds; if (!ds) - goto out_drop; + return NULL; /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); if (!ds->ports[port].netdev) - goto out_drop; + return NULL; /* Update skb & forward the frame accordingly */ skb->dev = ds->ports[port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops qca_netdev_ops = { diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 7488ab2932ab..ec729c0ef390 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -70,25 +70,22 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, ds = dst->cpu_dp->ds; if (skb_linearize(skb)) - goto out_drop; + return NULL; trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) - goto out_drop; + return NULL; source_port = trailer[1] & 7; if 
(source_port >= ds->num_ports || !ds->ports[source_port].netdev) - goto out_drop; + return NULL; pskb_trim_rcsum(skb, skb->len - 4); skb->dev = ds->ports[source_port].netdev; return skb; - -out_drop: - return NULL; } const struct dsa_device_ops trailer_netdev_ops = { -- cgit v1.2.3 From fe47d563065c91cd30088d9b118b79d44905be6f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 1 Jun 2017 16:07:15 -0400 Subject: net: dsa: factor skb freeing on xmit As of a86d8becc3f0 ("net: dsa: Factor bottom tag receive functions"), the rcv caller frees the original SKB in case or error. Be symmetric with that and make the xmit caller do the same. At the same time, fix the checkpatch NULL comparison check: CHECK: Comparison to NULL could be written "!nskb" #208: FILE: net/dsa/tag_trailer.c:35: + if (nskb == NULL) Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 8 ++++++-- net/dsa/tag_brcm.c | 6 +----- net/dsa/tag_dsa.c | 8 ++------ net/dsa/tag_edsa.c | 8 ++------ net/dsa/tag_ksz.c | 4 +--- net/dsa/tag_lan9303.c | 5 +---- net/dsa/tag_mtk.c | 6 +----- net/dsa/tag_qca.c | 6 +----- net/dsa/tag_trailer.c | 4 +--- 9 files changed, 16 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0442b6bf52fa..1cfdb31a2f44 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -357,10 +357,14 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; - /* Transmit function may have to reallocate the original SKB */ + /* Transmit function may have to reallocate the original SKB, + * in which case it must have freed it. Only free it here on error. + */ nskb = p->xmit(skb, dev); - if (!nskb) + if (!nskb) { + kfree_skb(skb); return NETDEV_TX_OK; + } /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 635ecb6781e4..c03860907f28 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -65,7 +65,7 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev u8 *brcm_tag; if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) - goto out_free; + return NULL; skb_push(skb, BRCM_TAG_LEN); @@ -86,10 +86,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; return skb; - -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 089c99c8ed51..12867a4b458f 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -28,7 +28,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) */ if (skb->protocol == htons(ETH_P_8021Q)) { if (skb_cow_head(skb, 0) < 0) - goto out_free; + return NULL; /* * Construct tagged FROM_CPU DSA tag from 802.1q tag. 
@@ -46,7 +46,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) } } else { if (skb_cow_head(skb, DSA_HLEN) < 0) - goto out_free; + return NULL; skb_push(skb, DSA_HLEN); memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); @@ -62,10 +62,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) } return skb; - -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index a7eed1d43d80..67a9d26f9075 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -30,7 +30,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) */ if (skb->protocol == htons(ETH_P_8021Q)) { if (skb_cow_head(skb, DSA_HLEN) < 0) - goto out_free; + return NULL; skb_push(skb, DSA_HLEN); memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); @@ -55,7 +55,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) } } else { if (skb_cow_head(skb, EDSA_HLEN) < 0) - goto out_free; + return NULL; skb_push(skb, EDSA_HLEN); memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN); @@ -75,10 +75,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) } return skb; - -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index dfcd2fff5b13..b94a334a1d02 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -46,10 +46,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC); - if (!nskb) { - kfree_skb(skb); + if (!nskb) return NULL; - } skb_reserve(nskb, NET_IP_ALIGN); skb_reset_mac_header(nskb); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index afd59330b5f1..247774d149f9 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -52,7 +52,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) { dev_dbg(&dev->dev, "Cannot make room for the special tag. 
Dropping packet\n"); - goto out_free; + return NULL; } /* provide 'LAN9303_TAG_LEN' bytes additional space */ @@ -66,9 +66,6 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) lan9303_tag[1] = htons(p->dp->index | BIT(4)); return skb; -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 4b4aaf1574aa..2f32b7ea3365 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -27,7 +27,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, u8 *mtk_tag; if (skb_cow_head(skb, MTK_HDR_LEN) < 0) - goto out_free; + return NULL; skb_push(skb, MTK_HDR_LEN); @@ -41,10 +41,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, mtk_tag[3] = 0; return skb; - -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 44f545d2761a..4f43cf0b4eff 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -45,7 +45,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_bytes += skb->len; if (skb_cow_head(skb, 0) < 0) - goto out_free; + return NULL; skb_push(skb, QCA_HDR_LEN); @@ -60,10 +60,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) *phdr = htons(hdr); return skb; - -out_free: - kfree_skb(skb); - return NULL; } static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index ec729c0ef390..b4f6db094409 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -32,10 +32,8 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) padlen = 60 - skb->len; nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); - if (nskb == NULL) { - kfree_skb(skb); + if (!nskb) return NULL; - } skb_reserve(nskb, NET_IP_ALIGN); skb_reset_mac_header(nskb); -- cgit v1.2.3 From ac2629a4797fdc839f7a4775cc141e6f53e064cf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 1 Jun 2017 19:53:04 -0700 Subject: net: dsa: Move dsa_switch_{suspend,resume} out of legacy.c dsa_switch_suspend() and dsa_switch_resume() are functions that belong in net/dsa/dsa.c and are not part of the legacy platform support code. Fixes: a6a71f19fe5e ("net: dsa: isolate legacy code") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- net/dsa/dsa.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/legacy.c | 47 ----------------------------------------------- 2 files changed, 47 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 26130ae438da..90038d45a547 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -223,6 +223,53 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } +#ifdef CONFIG_PM_SLEEP +int dsa_switch_suspend(struct dsa_switch *ds) +{ + int i, ret = 0; + + /* Suspend slave network devices */ + for (i = 0; i < ds->num_ports; i++) { + if (!dsa_is_port_initialized(ds, i)) + continue; + + ret = dsa_slave_suspend(ds->ports[i].netdev); + if (ret) + return ret; + } + + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_switch_suspend); + +int dsa_switch_resume(struct dsa_switch *ds) +{ + int i, ret = 0; + + if (ds->ops->resume) + ret = ds->ops->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + for (i = 0; i < ds->num_ports; i++) { + if (!dsa_is_port_initialized(ds, i)) + continue; + + ret = dsa_slave_resume(ds->ports[i].netdev); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_switch_resume); +#endif + static struct packet_type dsa_pack_type __read_mostly = { .type = cpu_to_be16(ETH_P_XDSA), .func = dsa_switch_rcv, diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ad345c8b0b06..7281098df04e 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -289,53 +289,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); } -#ifdef CONFIG_PM_SLEEP -int dsa_switch_suspend(struct dsa_switch *ds) -{ - int i, ret = 0; - - /* Suspend slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) - continue; - - ret = dsa_slave_suspend(ds->ports[i].netdev); - if (ret) - return ret; - } - - if (ds->ops->suspend) - ret = ds->ops->suspend(ds); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_switch_suspend); - -int dsa_switch_resume(struct dsa_switch *ds) -{ - int i, ret = 0; - - if (ds->ops->resume) - ret = ds->ops->resume(ds); - - if (ret) - return ret; - - /* Resume slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) - continue; - - ret = dsa_slave_resume(ds->ports[i].netdev); - if (ret) - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dsa_switch_resume); -#endif - /* platform driver init and cleanup *****************************************/ static int dev_is_class(struct device *dev, void *class) { -- cgit v1.2.3 From cee360ab4dd66fc1de33a5fa1cb418fa21c27ce3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 31 May 2017 16:36:31 +0800 Subject: sctp: define the member stream as an object instead of pointer in asoc As Marcelo's suggestion, stream is a fixed size member of asoc and would not grow with more streams. To avoid an allocation for it, this patch is to define it as an object instead of pointer and update the places using it, also create sctp_stream_update() called in sctp_assoc_update() to migrate the stream info from one stream to another. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 3 +- net/sctp/associola.c | 13 ++++----- net/sctp/chunk.c | 4 +-- net/sctp/outqueue.c | 10 +++---- net/sctp/proc.c | 4 +-- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_statefuns.c | 8 +++--- net/sctp/socket.c | 14 +++++----- net/sctp/stream.c | 68 ++++++++++++++++++++++------------------------ net/sctp/ulpqueue.c | 8 +++--- 10 files changed, 65 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a8b38e123f97..c8dbf410c4f5 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -381,6 +381,7 @@ int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp); int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); +void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ @@ -1750,7 +1751,7 @@ struct sctp_association { __u32 default_rcv_context; /* Stream arrays */ - struct sctp_stream *stream; + struct sctp_stream stream; /* All outbound chunks go through this structure. */ struct sctp_outq outqueue; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 95238284c422..6625b15ab81a 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -291,7 +291,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a return asoc; stream_free: - sctp_stream_free(asoc->stream); + sctp_stream_free(&asoc->stream); fail_init: sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); @@ -365,7 +365,7 @@ void sctp_association_free(struct sctp_association *asoc) sctp_tsnmap_free(&asoc->peer.tsn_map); /* Free stream information. */ - sctp_stream_free(asoc->stream); + sctp_stream_free(&asoc->stream); if (asoc->strreset_chunk) sctp_chunk_free(asoc->strreset_chunk); @@ -1151,7 +1151,7 @@ void sctp_assoc_update(struct sctp_association *asoc, /* Reinitialize SSN for both local streams * and peer's streams. */ - sctp_stream_clear(asoc->stream); + sctp_stream_clear(&asoc->stream); /* Flush the ULP reassembly and ordered queue. 
* Any data there will now be stale and will @@ -1177,11 +1177,8 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (sctp_state(asoc, COOKIE_WAIT)) { - sctp_stream_free(asoc->stream); - asoc->stream = new->stream; - new->stream = NULL; - } + if (sctp_state(asoc, COOKIE_WAIT)) + sctp_stream_update(&asoc->stream, &new->stream); if (!asoc->assoc_id) { /* get a new association id since we don't have one diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 697721a7a3f1..81466f6442e8 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -307,7 +307,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = - &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; @@ -320,7 +320,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = - &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index fe4c3d462f6e..20299df163b9 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -363,7 +363,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); - streamout = &asoc->stream->out[chk->sinfo.sinfo_stream]; + streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; @@ -400,9 +400,9 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->out_qlen -= chk->skb->len; asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; - if (chk->sinfo.sinfo_stream < asoc->stream->outcnt) { + if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { struct sctp_stream_out *streamout = - &asoc->stream->out[chk->sinfo.sinfo_stream]; + &asoc->stream.out[chk->sinfo.sinfo_stream]; streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } @@ -1036,7 +1036,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* RFC 2960 6.5 Every DATA chunk MUST carry a valid * stream identifier. */ - if (chunk->sinfo.sinfo_stream >= asoc->stream->outcnt) { + if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { /* Mark as failed send. 
*/ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); @@ -1054,7 +1054,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) continue; } - if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) { + if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(q, chunk); goto sctp_flush_out; } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index a0b29d43627f..5a27d0f03df5 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -361,8 +361,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sctp_seq_dump_remote_addrs(seq, assoc); seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " "%8d %8d %8d %8d", - assoc->hbinterval, assoc->stream->incnt, - assoc->stream->outcnt, assoc->max_retrans, + assoc->hbinterval, assoc->stream.incnt, + assoc->stream.outcnt, assoc->max_retrans, assoc->init_retries, assoc->shutdown_retries, assoc->rtx_data_chunks, atomic_read(&sk->sk_wmem_alloc), diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 92e332e17391..244181413bca 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1544,7 +1544,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); - stream = chunk->asoc->stream; + stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f863b5573e42..df73190da761 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3958,7 +3958,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, /* Silently discard the chunk if stream-id is not valid */ sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream->incnt) + if (ntohs(skip->stream) >= asoc->stream.incnt) goto discard_noforce; } @@ -4029,7 +4029,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( /* Silently discard the chunk if stream-id is not valid */ sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream->incnt) + if (ntohs(skip->stream) >= asoc->stream.incnt) goto gen_shutdown; } @@ -6365,7 +6365,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * and discard the DATA chunk. */ sid = ntohs(data_hdr->stream); - if (sid >= asoc->stream->incnt) { + if (sid >= asoc->stream.incnt) { /* Mark tsn as received even though we drop it */ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); @@ -6387,7 +6387,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * and is invalid. */ ssn = ntohs(data_hdr->ssn); - if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid))) + if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid))) return SCTP_IERROR_PROTO_VIOLATION; /* Send the data up to the user. Note: Schedule the diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f16c8d97b7f3..0822046e4f3f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1920,7 +1920,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) } /* Check for invalid stream. 
*/ - if (sinfo->sinfo_stream >= asoc->stream->outcnt) { + if (sinfo->sinfo_stream >= asoc->stream.outcnt) { err = -EINVAL; goto out_free; } @@ -4497,8 +4497,8 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, info->sctpi_rwnd = asoc->a_rwnd; info->sctpi_unackdata = asoc->unack_data; info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); - info->sctpi_instrms = asoc->stream->incnt; - info->sctpi_outstrms = asoc->stream->outcnt; + info->sctpi_instrms = asoc->stream.incnt; + info->sctpi_outstrms = asoc->stream.outcnt; list_for_each(pos, &asoc->base.inqueue.in_chunk_list) info->sctpi_inqueue++; list_for_each(pos, &asoc->outqueue.out_chunk_list) @@ -4727,8 +4727,8 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_unackdata = asoc->unack_data; status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); - status.sstat_instrms = asoc->stream->incnt; - status.sstat_outstrms = asoc->stream->outcnt; + status.sstat_instrms = asoc->stream.incnt; + status.sstat_outstrms = asoc->stream.outcnt; status.sstat_fragmentation_point = asoc->frag_point; status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, @@ -6600,10 +6600,10 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); - if (!asoc || params.sprstat_sid >= asoc->stream->outcnt) + if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream->out[params.sprstat_sid]; + streamout = &asoc->stream.out[params.sprstat_sid]; if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index dda53a293986..af6b49850344 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -37,30 +37,23 @@ int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp) { - struct sctp_stream *stream; + struct sctp_stream *stream = &asoc->stream; int i; - stream = kzalloc(sizeof(*stream), gfp); - if (!stream) - return -ENOMEM; - stream->outcnt = asoc->c.sinit_num_ostreams; stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); - if (!stream->out) { - kfree(stream); + if (!stream->out) return -ENOMEM; - } + for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; - asoc->stream = stream; - return 0; } int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp) { - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; int i; /* Initial stream->out size may be very big, so free it and alloc @@ -70,7 +63,7 @@ int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp) stream->outcnt = asoc->c.sinit_num_ostreams; stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); if (!stream->out) - goto nomem; + return -ENOMEM; for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; @@ -79,26 +72,17 @@ int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp) stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp); if (!stream->in) { kfree(stream->out); - goto nomem; + stream->out = NULL; + return -ENOMEM; } return 0; - -nomem: - asoc->stream = NULL; - kfree(stream); - - return -ENOMEM; } void sctp_stream_free(struct sctp_stream *stream) { - if (unlikely(!stream)) - return; - kfree(stream->out); kfree(stream->in); - kfree(stream); } void sctp_stream_clear(struct sctp_stream *stream) @@ -112,6 +96,19 @@ void 
sctp_stream_clear(struct sctp_stream *stream) stream->in[i].ssn = 0; } +void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) +{ + sctp_stream_free(stream); + + stream->out = new->out; + stream->in = new->in; + stream->outcnt = new->outcnt; + stream->incnt = new->incnt; + + new->out = NULL; + new->in = NULL; +} + static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { @@ -128,7 +125,7 @@ static int sctp_send_reconf(struct sctp_association *asoc, int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; @@ -214,6 +211,7 @@ out: int sctp_send_reset_assoc(struct sctp_association *asoc) { + struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; @@ -230,8 +228,8 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) return -ENOMEM; /* Block further xmit of data until this request is completed */ - for (i = 0; i < asoc->stream->outcnt; i++) - asoc->stream->out[i].state = SCTP_STREAM_CLOSED; + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); @@ -241,8 +239,8 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; - for (i = 0; i < asoc->stream->outcnt; i++) - asoc->stream->out[i].state = SCTP_STREAM_OPEN; + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; return retval; } @@ -255,7 +253,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval = -ENOMEM; __u32 outcnt, incnt; @@ -357,7 +355,7 @@ struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; __u16 i, nums, flags = 0, *str_p = NULL; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; @@ -449,7 +447,7 @@ struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u16 i, nums, *str_p; @@ -523,7 +521,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; @@ -612,7 +610,7 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; - struct sctp_stream *stream = asoc->stream; + struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_stream_in *streamin; __u32 request_seq, incnt; @@ -687,7 +685,7 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; - struct sctp_stream *stream = asoc->stream; + 
struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_stream_out *streamout; struct sctp_chunk *chunk = NULL; @@ -758,8 +756,8 @@ struct sctp_chunk *sctp_process_strreset_resp( union sctp_params param, struct sctp_ulpevent **evp) { + struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; - struct sctp_stream *stream = asoc->stream; struct sctp_transport *t; __u16 i, nums, flags = 0; sctp_paramhdr_t *req; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index aa3624d50278..25f7e4140566 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -764,7 +764,7 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, __u16 sid, csid, cssn; sid = event->stream; - stream = ulpq->asoc->stream; + stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; @@ -858,7 +858,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, /* Note: The stream ID must be verified before this routine. */ sid = event->stream; ssn = event->ssn; - stream = ulpq->asoc->stream; + stream = &ulpq->asoc->stream; /* Is this the expected SSN for this stream ID? */ if (ssn != sctp_ssn_peek(stream, in, sid)) { @@ -893,7 +893,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) struct sk_buff_head *lobby = &ulpq->lobby; __u16 csid, cssn; - stream = ulpq->asoc->stream; + stream = &ulpq->asoc->stream; /* We are holding the chunks by stream, by SSN. */ skb_queue_head_init(&temp); @@ -958,7 +958,7 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) struct sctp_stream *stream; /* Note: The stream ID must be verified before this routine. */ - stream = ulpq->asoc->stream; + stream = &ulpq->asoc->stream; /* Is this an old SSN? If so ignore. */ if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) -- cgit v1.2.3 From ff356414dc006170153c79434eb81d130c03beec Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 31 May 2017 16:36:32 +0800 Subject: sctp: merge sctp_stream_new and sctp_stream_init Since last patch, sctp doesn't need to alloc memory for asoc->stream any more. sctp_stream_new and sctp_stream_init both are used to alloc memory for stream.in or stream.out, and their names are also confusing. This patch is to merge them into sctp_stream_init, and only pass stream and streamcnt parameters into it, instead of the whole asoc. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/associola.c | 3 ++- net/sctp/sm_make_chunk.c | 3 ++- net/sctp/stream.c | 33 +++++++++++---------------------- 4 files changed, 17 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c8dbf410c4f5..5051317162df 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -377,8 +377,8 @@ typedef struct sctp_sender_hb_info { __u64 hb_nonce; } sctp_sender_hb_info_t; -int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp); -int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp); +int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, + gfp_t gfp); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 6625b15ab81a..288c5e0cda5d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -246,7 +246,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a if (!sctp_ulpq_init(&asoc->ulpq, asoc)) goto fail_init; - if (sctp_stream_new(asoc, gfp)) + if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, + 0, gfp)) goto fail_init; /* Assume that peer would support both address types unless we are diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 244181413bca..bd439edf2d8a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2454,7 +2454,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * stream sequence number shall be set to 0. */ - if (sctp_stream_init(asoc, gfp)) + if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, gfp)) goto clean_up; if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index af6b49850344..82e6d40052a8 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -35,47 +35,36 @@ #include #include -int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp) +int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, + gfp_t gfp) { - struct sctp_stream *stream = &asoc->stream; - int i; - - stream->outcnt = asoc->c.sinit_num_ostreams; - stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); - if (!stream->out) - return -ENOMEM; - - for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; - - return 0; -} - -int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp) -{ - struct sctp_stream *stream = &asoc->stream; int i; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory. 
*/ kfree(stream->out); - stream->outcnt = asoc->c.sinit_num_ostreams; - stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); + + stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); if (!stream->out) return -ENOMEM; + stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; - stream->incnt = asoc->c.sinit_max_instreams; - stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp); + if (!incnt) + return 0; + + stream->in = kcalloc(incnt, sizeof(*stream->in), gfp); if (!stream->in) { kfree(stream->out); stream->out = NULL; return -ENOMEM; } + stream->incnt = incnt; + return 0; } -- cgit v1.2.3 From 6e80ac5cc992ab6256c3dae87f7e57db15e1a58c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 31 May 2017 13:15:41 +0100 Subject: ipv6: xfrm: Handle errors reported by xfrm6_find_1stfragopt() xfrm6_find_1stfragopt() may now return an error code and we must not treat it as a length. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Signed-off-by: Ben Hutchings Acked-by: Craig Gallek Signed-off-by: David S. Miller --- net/ipv6/xfrm6_mode_ro.c | 2 ++ net/ipv6/xfrm6_mode_transport.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 0e015906f9ca..07d36573f50b 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -47,6 +47,8 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + if (hdr_len < 0) + return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 7a92c0f31912..9ad07a91708e 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -30,6 +30,8 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) skb_set_inner_transport_header(skb, skb_transport_offset(skb)); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + if (hdr_len < 0) + return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; -- cgit v1.2.3 From 44abafc4cc094214a99f860f778c48ecb23422fc Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 31 May 2017 11:21:27 -0700 Subject: tcp: disallow cwnd undo when switching congestion control When the sender switches its congestion control during loss recovery, if the recovery is spurious then it may incorrectly revert cwnd and ssthresh to the older values set by a previous congestion control. Consider a congestion control (like BBR) that does not use ssthresh and keeps it infinite: the connection may incorrectly revert cwnd to an infinite value when switching from BBR to another congestion control. This patch fixes it by disallowing such cwnd undo operation upon switching congestion control. Note that undo_marker is not reset s.t. the packets that were incorrectly marked lost would be corrected. We only avoid undoing the cwnd in tcp_undo_cwnd_reduction(). Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cong.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6e3c512054a6..324c9bcc5456 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -180,6 +180,7 @@ void tcp_init_congestion_control(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) -- cgit v1.2.3 From 775e68a93fe4d33ec93949c8022ed84b97a97096 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 31 May 2017 11:30:53 -0700 Subject: tcp: use TS opt on RTTs for congestion control Currently when a data packet is retransmitted, we do not compute an RTT sample for congestion control due to Kern's check. Therefore the congestion control that uses RTT signals may not receive any update during loss recovery which could last many round trips. For example, BBR and Vegas may not be able to update its min RTT estimation if the network path has shortened until it recovers from losses. This patch mitigates that by using TCP timestamp options for RTT measurement for congestion control. Note that we already use timestamps for RTT estimation. Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9f4380662196..4ea8ec5c7bb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2921,9 +2921,9 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) rtt_us ? : jiffies_to_usecs(1)); } -static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us, - long ca_rtt_us) +static bool tcp_ack_update_rtt(struct sock *sk, const int flag, + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us, struct rate_sample *rs) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2948,6 +2948,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, seq_rtt_us = ca_rtt_us = delta_us; } + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; @@ -2967,12 +2968,13 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. 
*/ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { + struct rate_sample rs; long rtt_us = -1L; if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); } @@ -3177,9 +3179,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } - sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, - ca_rtt_us); + ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); @@ -3215,7 +3216,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us, + .rtt_us = sack->rate->rtt_us, .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); -- cgit v1.2.3 From f4d01666616adeebe72f84ee6e9385e309805ba5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 1 Jun 2017 14:18:36 -0700 Subject: tcp: remove unnecessary skb_reset_tail_pointer() __pskb_trim_head() does not need to reset skb tail pointer. Also change the comments, __pskb_pull_head() does not exist. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 478f75baee31..e3aab1c1cf78 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1328,9 +1328,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return 0; } -/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c - * eventually). The difference is that pulled data not copied, but - * immediately discarded. +/* This is similar to __pskb_pull_tail(). The difference is that pulled + * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { @@ -1365,7 +1364,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) } shinfo->nr_frags = k; - skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; return len; -- cgit v1.2.3 From 518d8a2e9bad83c6040eccebc3d1f7388fc034e7 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 1 Jun 2017 21:37:37 +0300 Subject: net/flow_dissector: add support for dissection of misc ip header fields Add support for dissection of ip tos and ttl and ipv6 traffic-class and hoplimit. Both are dissected into the same struct. Uses similar call to ip dissection function as with tcp, arp and others. Signed-off-by: Or Gerlitz Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 11 +++++++++++ net/core/flow_dissector.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index efe34eec61dc..e2663e900b0a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -165,6 +165,16 @@ struct flow_dissector_key_tcp { __be16 flags; }; +/** + * struct flow_dissector_key_ip: + * @tos: tos + * @ttl: ttl + */ +struct flow_dissector_key_ip { + __u8 tos; + __u8 ttl; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ @@ -186,6 +196,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ + FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5a45943081f5..fc5fc4594c90 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -367,6 +367,40 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct iphdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = iph->tos; + key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct ipv6hdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = ipv6_get_dsfield(iph); + key_ip->ttl = iph->hop_limit; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -469,6 +503,9 @@ ip: } } + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) goto out_good; @@ -514,6 +551,9 @@ ipv6: goto out_good; } + __skb_flow_dissect_ipv6(skb, flow_dissector, + target_container, data, iph); + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) goto out_good; -- cgit v1.2.3 From 4d80cc0aaaab9efac14c9d3d702b69961800de20 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 1 Jun 2017 21:37:38 +0300 Subject: net/sched: cls_flower: add support for matching on ip tos and ttl Benefit from the support of ip header fields dissection and allow users to set rules matching on ipv4 tos and ttl or ipv6 traffic-class and hoplimit. Signed-off-by: Or Gerlitz Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6e8cf5e9c40..edf43ddf47b0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -454,6 +454,11 @@ enum { TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fb74a47830f4..33feaee197cf 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -50,6 +50,7 @@ struct fl_flow_key { struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ip ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -427,6 +428,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -528,6 +533,19 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } +static void fl_set_key_ip(struct nlattr **tb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, + &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, + sizeof(key->tos)); + + fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, + &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, + sizeof(key->ttl)); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -570,6 +588,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); + fl_set_key_ip(tb, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -772,6 +791,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1082,6 +1103,19 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } +static int fl_dump_key_ip(struct sk_buff *skb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, + TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, + TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + return -1; + + return 0; +} + static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) @@ -1195,9 +1229,10 @@ static int fl_dump(struct net *net, struct 
tcf_proto *tp, unsigned long fh, if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && - fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.ip_proto))) + sizeof(key->basic.ip_proto)) || + fl_dump_key_ip(skb, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && -- cgit v1.2.3 From 5f733ee68f9a4df94775299ac6a7ab260704f6ed Mon Sep 17 00:00:00 2001 From: Liam McBirnie Date: Thu, 1 Jun 2017 15:36:01 +1000 Subject: ip6_tunnel: fix traffic class routing for tunnels ip6_route_output() requires that the flowlabel contains the traffic class for policy routing. Commit 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on encapsulated packets") removed the code which previously added the traffic class to the flowlabel. The traffic class is added here because only route lookup needs the flowlabel to contain the traffic class. Fixes: 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on encapsulated packets") Signed-off-by: Liam McBirnie Acked-by: Peter Dawson Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7ae6c503f1ca..9b37f9747fc6 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1095,6 +1095,9 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, if (!dst) { route_lookup: + /* add dsfield to flowlabel for route lookup */ + fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); + dst = ip6_route_output(net, NULL, fl6); if (dst->error) -- cgit v1.2.3 From fbd0ac60420fa2f5d45865326ad1074b80f29060 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 2 Jun 2017 02:40:44 -0700 Subject: net-procfs: Use vsnprintf extension %phN Save a bit of code by using the kernel extension. $ size net/core/net-procfs.o* text data bss dec hex filename 3701 120 0 3821 eed net/core/net-procfs.o.new 3764 120 0 3884 f2c net/core/net-procfs.o.old Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/net-procfs.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 14d09345f00d..4847964931df 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -363,15 +363,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, ha->refcount, ha->global_use); - - for (i = 0; i < dev->addr_len; i++) - seq_printf(seq, "%02x", ha->addr[i]); - - seq_putc(seq, '\n'); + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + dev->ifindex, dev->name, + ha->refcount, ha->global_use, + (int)dev->addr_len, ha->addr); } netif_addr_unlock_bh(dev); return 0; -- cgit v1.2.3 From 8e2f6dd298beb93c30e1699a5c26492a10214447 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Jun 2017 08:02:26 -0700 Subject: dccp: consistently use dccp_write_space() DCCP uses dccp_write_space() for sk->sk_write_space method. Unfortunately a passive connection (as provided by accept()) is using the generic sk_stream_write_space() function. Lets simply inherit sk->sk_write_space from the parent instead of forcing the generic one. 
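A minimal sketch of the resulting invariant (illustrative only, not part of the patch; the helper name below is hypothetical): once inet_csk_clone_lock() stops forcing sk_stream_write_space, the child socket returned by accept() keeps whatever sk_write_space callback the listener installed — dccp_write_space for DCCP — because sk_clone_lock() copies the parent's sock fields wholesale.

#include <net/sock.h>

/* Hypothetical check, not in the kernel tree: after inet_csk_clone_lock()
 * the child should report the same write-space callback as its parent,
 * rather than the generic sk_stream_write_space().
 */
static inline bool child_inherits_write_space(const struct sock *parent,
					      const struct sock *child)
{
	return child->sk_write_space == parent->sk_write_space;
}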
Signed-off-by: Eric Dumazet Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 82dec8825d28..a3fa1a5b6d98 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -790,7 +790,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); - newsk->sk_write_space = sk_stream_write_space; /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); -- cgit v1.2.3 From 1820dd0633b9972028e377ee76e5d40873491d25 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Jun 2017 16:22:27 +0100 Subject: rxrpc: remove redundant proc_remove call The proc_remove call is dead code as it occurs after a return and hence can never be called. Remove it. Detected by CoverityScan, CID#1437743 ("Logically dead code") Signed-off-by: Colin Ian King Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/net_ns.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 26449a6bb076..7edceb8522f5 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -59,7 +59,6 @@ static __net_init int rxrpc_init_net(struct net *net) proc_create("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_fops); return 0; - proc_remove(rxnet->proc_net); err_proc: return ret; } -- cgit v1.2.3 From 38b257938ac6655d0d6333743303231b9c465ec1 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 2 Jun 2017 12:38:22 -0400 Subject: sock: reset sk_err when the error queue is empty Prior to f5f99309fa74 (sock: do not set sk_err in sock_dequeue_err_skb), sk_err was reset to the error of the skb on the head of the error queue. Applications, most notably ping, are relying on this behavior to reset sk_err for ICMP packets. Set sk_err to the ICMP error when there is an ICMP packet at the head of the error queue. Fixes: f5f99309fa74 (sock: do not set sk_err in sock_dequeue_err_skb) Reported-by: Cyril Hrubis Tested-by: Cyril Hrubis Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 346d3e85dfbc..b1be7c01efe2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3754,8 +3754,11 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); - if (skb && (skb_next = skb_peek(q))) + if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); + if (icmp_next) + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + } spin_unlock_irqrestore(&q->lock, flags); if (is_icmp_err_skb(skb) && !icmp_next) -- cgit v1.2.3 From 937c7df85ce7ce6b2319894f6ad3376f15dff186 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 2 Jun 2017 12:31:21 -0700 Subject: net: dsa: Pass dsa_port reference to ethtool setup/restore We do not need to have a reference to a dsa_switch, instead we should pass a reference to a CPU dsa_port, change that. This is a preliminary change to better support multiple CPU ports. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/dsa.c | 6 ++++-- net/dsa/dsa2.c | 4 ++-- net/dsa/dsa_priv.h | 4 ++-- net/dsa/legacy.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 402459e73f33..fdc448b30e56 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -112,8 +112,9 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds) +int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) { + struct dsa_switch *ds = cpu_dp->ds; struct net_device *master; struct ethtool_ops *cpu_ops; @@ -136,8 +137,9 @@ int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds) return 0; } -void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) +void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) { + struct dsa_switch *ds = cpu_dp->ds; struct net_device *master; master = ds->dst->master_netdev; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 21b44a9828f6..7f9bf1456a65 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -444,7 +444,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) } if (dst->cpu_dp) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp->ds); + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); if (err) return err; } @@ -485,7 +485,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) } if (dst->cpu_dp) - dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); + dsa_cpu_port_ethtool_restore(dst->cpu_dp); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index db2a7b9edfb8..66ee248796c8 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -103,8 +103,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct dsa_port *dport, int port); void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); -int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); -void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); +int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp); +void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp); /* legacy.c */ int dsa_legacy_register(void); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index d70a1a788d17..d534d8f4b9cf 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -205,7 +205,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", index); - ret = dsa_cpu_port_ethtool_setup(ds); + ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp); if (ret) return ret; @@ -733,7 +733,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); + dsa_cpu_port_ethtool_restore(dst->cpu_dp); dev_put(dst->master_netdev); } -- cgit v1.2.3 From e41c1b5030e279c6626d3d052c3ca5ce5e2508df Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 2 Jun 2017 12:31:22 -0700 Subject: net: dsa: Consistently use dsa_port for dsa_*_port_{apply, unapply} We have all the information we need in dsa_port, so use it instead of repeating the same arguments over and over again. Suggested-by: Vivien Didelot Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/dsa2.c | 90 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 7f9bf1456a65..067daec644c1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -214,66 +214,61 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst) return 0; } -static int dsa_dsa_port_apply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static int dsa_dsa_port_apply(struct dsa_port *port) { + struct dsa_switch *ds = port->ds; int err; - err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index); if (err) { dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", - index, err); + port->index, err); return err; } - memset(&ds->ports[index].devlink_port, 0, - sizeof(ds->ports[index].devlink_port)); + memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - return devlink_port_register(ds->devlink, - &ds->ports[index].devlink_port, - index); + return devlink_port_register(ds->devlink, &port->devlink_port, + port->index); } -static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static void dsa_dsa_port_unapply(struct dsa_port *port) { - devlink_port_unregister(&ds->ports[index].devlink_port); + devlink_port_unregister(&port->devlink_port); dsa_cpu_dsa_destroy(port); } -static int dsa_cpu_port_apply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static int dsa_cpu_port_apply(struct dsa_port *port) { + struct dsa_switch *ds = port->ds; int err; - err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index); if (err) { dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", - index, err); + port->index, err); return err; } - ds->cpu_port_mask |= BIT(index); + ds->cpu_port_mask |= BIT(port->index); - memset(&ds->ports[index].devlink_port, 0, - sizeof(ds->ports[index].devlink_port)); - err = devlink_port_register(ds->devlink, &ds->ports[index].devlink_port, - index); + memset(&port->devlink_port, 0, sizeof(port->devlink_port)); + err = devlink_port_register(ds->devlink, &port->devlink_port, + port->index); return err; } -static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static void dsa_cpu_port_unapply(struct dsa_port *port) { - devlink_port_unregister(&ds->ports[index].devlink_port); + devlink_port_unregister(&port->devlink_port); dsa_cpu_dsa_destroy(port); - ds->cpu_port_mask &= ~BIT(index); + port->ds->cpu_port_mask &= ~BIT(port->index); } -static int dsa_user_port_apply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static int dsa_user_port_apply(struct dsa_port *port) { + struct dsa_switch *ds = port->ds; const char *name = port->name; int err; @@ -282,35 +277,32 @@ static int dsa_user_port_apply(struct dsa_port *port, u32 index, if (!name) name = "eth%d"; - err = dsa_slave_create(ds, ds->dev, index, name); + err = dsa_slave_create(ds, ds->dev, port->index, name); if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", - index, err); - ds->ports[index].netdev = NULL; + port->index, err); + port->netdev = NULL; return err; } - memset(&ds->ports[index].devlink_port, 0, - sizeof(ds->ports[index].devlink_port)); - err = devlink_port_register(ds->devlink, &ds->ports[index].devlink_port, - index); + memset(&port->devlink_port, 0, sizeof(port->devlink_port)); + err = devlink_port_register(ds->devlink, &port->devlink_port, + port->index); if (err) return err; - 
devlink_port_type_eth_set(&ds->ports[index].devlink_port, - ds->ports[index].netdev); + devlink_port_type_eth_set(&port->devlink_port, port->netdev); return 0; } -static void dsa_user_port_unapply(struct dsa_port *port, u32 index, - struct dsa_switch *ds) +static void dsa_user_port_unapply(struct dsa_port *port) { - devlink_port_unregister(&ds->ports[index].devlink_port); - if (ds->ports[index].netdev) { - dsa_slave_destroy(ds->ports[index].netdev); - ds->ports[index].netdev = NULL; - ds->enabled_port_mask &= ~(1 << index); + devlink_port_unregister(&port->devlink_port); + if (port->netdev) { + dsa_slave_destroy(port->netdev); + port->netdev = NULL; + port->ds->enabled_port_mask &= ~(1 << port->index); } } @@ -370,20 +362,20 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) continue; if (dsa_port_is_dsa(port)) { - err = dsa_dsa_port_apply(port, index, ds); + err = dsa_dsa_port_apply(port); if (err) return err; continue; } if (dsa_port_is_cpu(port)) { - err = dsa_cpu_port_apply(port, index, ds); + err = dsa_cpu_port_apply(port); if (err) return err; continue; } - err = dsa_user_port_apply(port, index, ds); + err = dsa_user_port_apply(port); if (err) continue; } @@ -402,16 +394,16 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) continue; if (dsa_port_is_dsa(port)) { - dsa_dsa_port_unapply(port, index, ds); + dsa_dsa_port_unapply(port); continue; } if (dsa_port_is_cpu(port)) { - dsa_cpu_port_unapply(port, index, ds); + dsa_cpu_port_unapply(port); continue; } - dsa_user_port_unapply(port, index, ds); + dsa_user_port_unapply(port); } if (ds->slave_mii_bus && ds->ops->phy_read) -- cgit v1.2.3 From 14be36c2c96cd18cfa036f230b57ea78d82a303f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 2 Jun 2017 12:31:23 -0700 Subject: net: dsa: Initialize all CPU and enabled ports masks in dsa_ds_parse() There was no reason for duplicating the code that initializes ds->enabled_port_mask in both dsa_parse_ports_dn() and dsa_parse_ports(), instead move this to dsa_ds_parse() which is early enough before ops->setup() has run. While at it, we can now make dsa_is_cpu_port() check ds->cpu_port_mask which is a step towards being multi-CPU port capable. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- net/dsa/dsa2.c | 33 +++++++++++++++------------------ 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 448d8bc77707..2effb0af9d7c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -254,7 +254,7 @@ struct dsa_switch { static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { - return ds->dst->cpu_dp == &ds->ports[p]; + return !!(ds->cpu_port_mask & (1 << p)); } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 067daec644c1..cd13bb54a30c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -250,8 +250,6 @@ static int dsa_cpu_port_apply(struct dsa_port *port) return err; } - ds->cpu_port_mask |= BIT(port->index); - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); err = devlink_port_register(ds->devlink, &port->devlink_port, port->index); @@ -522,6 +520,12 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->rcv = dst->tag_ops->rcv; + /* Initialize cpu_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. 
+ */ + ds->cpu_port_mask |= BIT(index); + return 0; } @@ -533,14 +537,22 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) for (index = 0; index < ds->num_ports; index++) { port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) + if (!dsa_port_is_valid(port) || + dsa_port_is_dsa(port)) continue; if (dsa_port_is_cpu(port)) { err = dsa_cpu_parse(port, index, dst, ds); if (err) return err; + } else { + /* Initialize enabled_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. + */ + ds->enabled_port_mask |= BIT(index); } + } pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); @@ -589,13 +601,6 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) return -EINVAL; ds->ports[reg].dn = port; - - /* Initialize enabled_port_mask now for ops->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - if (!dsa_port_is_cpu(&ds->ports[reg])) - ds->enabled_port_mask |= 1 << reg; } return 0; @@ -611,14 +616,6 @@ static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds) continue; ds->ports[i].name = cd->port_names[i]; - - /* Initialize enabled_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - if (!dsa_port_is_cpu(&ds->ports[i])) - ds->enabled_port_mask |= 1 << i; - valid_name_found = true; } -- cgit v1.2.3 From 5071034e4af709d6783b7d105dc296a5cc84739b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 2 Jun 2017 09:01:49 -0700 Subject: neigh: Really delete an arp/neigh entry on "ip neigh delete" or "arp -d" The command # arp -s 62.2.0.1 a:b:c:d:e:f dev eth2 adds an entry like the following (listed by "arp -an") ? (62.2.0.1) at 0a:0b:0c:0d:0e:0f [ether] PERM on eth2 but the symmetric deletion command # arp -i eth2 -d 62.2.0.1 does not remove the PERM entry from the table, and instead leaves behind ? (62.2.0.1) at on eth2 The reason is that there is a refcnt of 1 for the arp_tbl itself (neigh_alloc starts off the entry with a refcnt of 1), thus the neigh_release() call from arp_invalidate() will (at best) just decrement the ref to 1, but will never actually free it from the table. To fix this, we need to do something like neigh_forced_gc: if the refcnt is 1 (i.e., on the table's ref), remove the entry from the table and free it. This patch refactors and shares common code between neigh_forced_gc and the newly added neigh_remove_one. A similar issue exists for IPv6 Neighbor Cache entries, and is fixed in a similar manner by this patch. Signed-off-by: Sowmini Varadhan Reviewed-by: Julian Anastasov Signed-off-by: David S. 
Miller --- include/net/neighbour.h | 1 + net/core/neighbour.c | 60 ++++++++++++++++++++++++++++++++++++++++--------- net/ipv4/arp.c | 4 ++++ 3 files changed, 54 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e4dd3a214034..639b67564a7d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -317,6 +317,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); +bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d274f81fcc2c..dadb5eef91c3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,50 @@ unsigned long neigh_rand_reach_time(unsigned long base) EXPORT_SYMBOL(neigh_rand_reach_time); +static bool neigh_del(struct neighbour *n, __u8 state, + struct neighbour __rcu **np, struct neigh_table *tbl) +{ + bool retval = false; + + write_lock(&n->lock); + if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & state)) { + struct neighbour *neigh; + + neigh = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + rcu_assign_pointer(*np, neigh); + n->dead = 1; + retval = true; + } + write_unlock(&n->lock); + if (retval) + neigh_cleanup_and_release(n); + return retval; +} + +bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) +{ + struct neigh_hash_table *nht; + void *pkey = ndel->primary_key; + u32 hash_val; + struct neighbour *n; + struct neighbour __rcu **np; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); + hash_val = hash_val >> (32 - nht->hash_shift); + + np = &nht->hash_buckets[hash_val]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock)))) { + if (n == ndel) + return neigh_del(n, 0, np, tbl); + np = &n->next; + } + return false; +} + static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; @@ -140,19 +184,10 @@ static int neigh_forced_gc(struct neigh_table *tbl) * - nobody refers to it. 
* - it is not permanent */ - write_lock(&n->lock); - if (atomic_read(&n->refcnt) == 1 && - !(n->nud_state & NUD_PERMANENT)) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - n->dead = 1; - shrunk = 1; - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + if (neigh_del(n, NUD_PERMANENT, np, tbl)) { + shrunk = 1; continue; } - write_unlock(&n->lock); np = &n->next; } } @@ -1649,7 +1684,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid); + write_lock_bh(&tbl->lock); neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); out: return err; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index e9f3386a528b..a651c53260ec 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1113,13 +1113,17 @@ static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; + struct neigh_table *tbl = &arp_tbl; if (neigh) { if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); + write_lock_bh(&tbl->lock); neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); } return err; -- cgit v1.2.3 From e3e86b5119f81e5e2499bea7ea1ebe8ac6aab789 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 4 Jun 2017 21:41:10 -0400 Subject: ipv6: Fix leak in ipv6_gso_segment(). If ip6_find_1stfragopt() fails and we return an error we have to free up 'segs' because nobody else is going to. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Reported-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv6/ip6_offload.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 280268f1dd7b..cdb3728faca7 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -116,8 +116,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); - if (err < 0) + if (err < 0) { + kfree_skb_list(segs); return ERR_PTR(err); + } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) -- cgit v1.2.3 From b07ac9894644202614ca87c69f3f45e424a82fef Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 2 Jun 2017 22:05:23 -0700 Subject: net: dsa: Fix stale cpu_switch reference after unbind then bind Commit 9520ed8fb841 ("net: dsa: use cpu_switch instead of ds[0]") replaced the use of dst->ds[0] with dst->cpu_switch since that is functionally equivalent, however, we can now run into an use after free scenario after unbinding then rebinding the switch driver. The use after free happens because we do correctly initialize dst->cpu_switch the first time we probe in dsa_cpu_parse(), then we unbind the driver: dsa_dst_unapply() is called, and we rebind again. dst->cpu_switch now points to a freed "ds" structure, and so when we finally dereference it in dsa_cpu_port_ethtool_setup(), we oops. To fix this, simply set dst->cpu_switch to NULL in dsa_dst_unapply() which guarantees that we always correctly re-assign dst->cpu_switch in dsa_cpu_parse(). Fixes: 9520ed8fb841 ("net: dsa: use cpu_switch instead of ds[0]") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/dsa2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 033b3bfb63dc..7796580e99ee 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -484,8 +484,10 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_switch) + if (dst->cpu_switch) { dsa_cpu_port_ethtool_restore(dst->cpu_switch); + dst->cpu_switch = NULL; + } pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; -- cgit v1.2.3 From 77d4b1d36926a9b8387c6b53eeba42bcaaffcea3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Jun 2017 09:29:25 -0700 Subject: net: ping: do not abuse udp_poll() Alexander reported various KASAN messages triggered in recent kernels The problem is that ping sockets should not use udp_poll() in the first place, and recent changes in UDP stack finally exposed this old bug. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Fixes: 6d0bfe226116 ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Eric Dumazet Reported-by: Sasha Levin Cc: Solar Designer Cc: Vasiliy Kulikov Cc: Lorenzo Colitti Acked-By: Lorenzo Colitti Tested-By: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv4/af_inet.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index dbf0abba33b8..3e505bbff8ca 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1007,6 +1007,7 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; +extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f3dad1661343..58925b6597de 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1043,7 +1043,7 @@ static struct inet_protosw inetsw_array[] = .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, - .ops = &inet_dgram_ops, + .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 9b522fa90e6d..ac826dd338ff 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -192,7 +192,7 @@ static struct inet_protosw pingv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, - .ops = &inet6_dgram_ops, + .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1f992d9e261d..60be012fe708 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1338,7 +1338,7 @@ void raw6_proc_exit(void) #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ -static const struct proto_ops inet6_sockraw_ops = { +const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, -- cgit v1.2.3 From 48a1df65334b74bd7531f932cca5928932abf769 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 4 Jun 2017 04:16:22 +0200 Subject: skbuff: return -EMSGSIZE in skb_to_sgvec to prevent overflow This is a defense-in-depth measure in response to bugs like 4d6fa57b4dab ("macsec: avoid heap overflow in skb_to_sgvec"). There's not only a potential overflow of sglist items, but also a stack overflow potential, so we fix this by limiting the amount of recursion this function is allowed to do. 
Not actually providing a bounded base case is a future disaster that we can easily avoid here. As a small matter of house keeping, we take this opportunity to move the documentation comment over the actual function the documentation is for. While this could be implemented by using an explicit stack of skbuffs, when implementing this, the function complexity increased considerably, and I don't think such complexity and bloat is actually worth it. So, instead I built this and tested it on x86, x86_64, ARM, ARM64, and MIPS, and measured the stack usage there. I also reverted the recent MIPS changes that give it a separate IRQ stack, so that I could experience some worst-case situations. I found that limiting it to 24 layers deep yielded a good stack usage with room for safety, as well as being much deeper than any driver actually ever creates. Signed-off-by: Jason A. Donenfeld Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: David Howells Cc: Sabrina Dubroca Cc: "Michael S. Tsirkin" Cc: Jason Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++---- net/core/skbuff.c | 65 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 46 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 45a59c1e0cc7..d460a4cbda1c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -953,10 +953,10 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); -int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, - int offset, int len); -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, - int len); +int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len); +int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int skb_pad(struct sk_buff *skb, int pad); #define dev_kfree_skb(a) consume_skb(a) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 780b7c1563d0..bba33cf4f7cd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3508,24 +3508,18 @@ void __init skb_init(void) NULL); } -/** - * skb_to_sgvec - Fill a scatter-gather list from a socket buffer - * @skb: Socket buffer containing the buffers to be mapped - * @sg: The scatter-gather list to map into - * @offset: The offset into the buffer's contents to start mapping - * @len: Length of buffer space to be mapped - * - * Fill the specified scatter-gather list with mappings/pointers into a - * region of the buffer space attached to a socket buffer. 
- */ static int -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, + unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; + if (unlikely(recursion_level >= 24)) + return -EMSGSIZE; + if (copy > 0) { if (copy > len) copy = len; @@ -3544,6 +3538,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; if (copy > len) copy = len; @@ -3558,16 +3554,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) } skb_walk_frags(skb, frag_iter) { - int end; + int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; + if (copy > len) copy = len; - elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, - copy); + ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy, recursion_level + 1); + if (unlikely(ret < 0)) + return ret; + elt += ret; if ((len -= copy) == 0) return elt; offset += copy; @@ -3578,6 +3580,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) return elt; } +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. Returns either + * the number of scatterlist items used, or -EMSGSIZE if the contents + * could not fit. + */ +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); + + if (nsg <= 0) + return nsg; + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given * sglist without mark the sg which contain last skb data as the end. * So the caller can mannipulate sg list as will when padding new data after @@ -3600,19 +3627,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { - return __skb_to_sgvec(skb, sg, offset, len); + return __skb_to_sgvec(skb, sg, offset, len, 0); } EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ - int nsg = __skb_to_sgvec(skb, sg, offset, len); - sg_mark_end(&sg[nsg - 1]); - - return nsg; -} -EXPORT_SYMBOL_GPL(skb_to_sgvec); /** * skb_cow_data - Check that a socket buffer's data buffers are writable -- cgit v1.2.3 From 3f29770723fe498a5c5f57c3a31a996ebdde03e1 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 4 Jun 2017 04:16:23 +0200 Subject: ipsec: check return value of skb_to_sgvec always Signed-off-by: Jason A. Donenfeld Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: David S. 
Miller --- net/ipv4/ah4.c | 8 ++++++-- net/ipv4/esp4.c | 20 +++++++++++++------- net/ipv6/ah6.c | 8 ++++++-- net/ipv6/esp6.c | 20 +++++++++++++------- 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 22377c8ff14b..e8f862358518 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -220,7 +220,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ @@ -393,7 +395,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 93322f895eab..d815d1755473 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -377,9 +377,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp->esph = esph; sg_init_table(sg, esp->nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; if (!esp->inplace) { int allocsize; @@ -403,9 +405,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_unlock_bh(&x->lock); sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -690,7 +694,9 @@ skip_cow: esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + err = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index dda6035e3b84..755f38271dd5 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -423,7 +423,9 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ @@ -606,7 +608,9 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) ip6h->hop_limit = 0; sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 1fe99ba8066c..2ede4e459c4e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -346,9 +346,11 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info esph = esp_output_set_esn(skb, x, 
ip_esp_hdr(skb), seqhi); sg_init_table(sg, esp->nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; if (!esp->inplace) { int allocsize; @@ -372,9 +374,11 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info spin_unlock_bh(&x->lock); sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -618,7 +622,9 @@ skip_cow: esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + ret = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(ret < 0)) + goto out; skb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3 From 89a5ea99662505d2d61f2a3030a6896c2cb3cdb0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 4 Jun 2017 04:16:24 +0200 Subject: rxrpc: check return value of skb_to_sgvec always Signed-off-by: Jason A. Donenfeld Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/rxkad.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 1bb9b2ccc267..29fe20ad04aa 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -227,7 +227,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, len &= ~(call->conn->size_align - 1); sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, len); + err = skb_to_sgvec(skb, sg, 0, len); + if (unlikely(err < 0)) + goto out; skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_encrypt(req); @@ -324,7 +326,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, bool aborted; u32 data_size, buf; u16 check; - int nsg; + int nsg, ret; _enter(""); @@ -342,7 +344,9 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, goto nomem; sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, offset, 8); + ret = skb_to_sgvec(skb, sg, offset, 8); + if (unlikely(ret < 0)) + return ret; /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -409,7 +413,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, bool aborted; u32 data_size, buf; u16 check; - int nsg; + int nsg, ret; _enter(",{%d}", skb->len); @@ -434,7 +438,12 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, offset, len); + ret = skb_to_sgvec(skb, sg, offset, len); + if (unlikely(ret < 0)) { + if (sg != _sg) + kfree(sg); + return ret; + } /* decrypt from the session key */ token = call->conn->params.key->payload.data[0]; -- cgit v1.2.3 From b699d0035836f6712917a41e7ae58d84359b8ff9 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sun, 4 Jun 2017 14:43:43 +0800 Subject: sit: reload iphdr in ipip6_rcv Since iptunnel_pull_header() can call pskb_may_pull(), we must reload any pointer that was related to skb->head. Fixes: a09a4c8dd1ec ("tunnels: Remove encapsulation offloads on decap") Signed-off-by: Haishuang Yan Signed-off-by: David S. 
Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 61e5902f0687..af832e7ce80f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -657,6 +657,7 @@ static int ipip6_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; + iph = ip_hdr(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { -- cgit v1.2.3 From 68d6d1ae5c0429bcc8911e1db5f80fe2cd1ca974 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Jun 2017 14:30:49 +0100 Subject: rxrpc: Separate the connection's protocol service ID from the lookup ID Keep the rxrpc_connection struct's idea of the service ID that is exposed in the protocol separate from the service ID that's used as a lookup key. This allows the protocol service ID on a client connection to get upgraded without making the connection unfindable for other client calls that also would like to use the upgraded connection. The connection's actual service ID is then returned through recvmsg() by way of msg_name. Whilst we're at it, we get rid of the last_service_id field from each channel. The service ID is per-connection, not per-call and an entire connection is upgraded in one go. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 5 ++--- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/conn_client.c | 4 ++++ net/rxrpc/conn_event.c | 4 ++-- net/rxrpc/conn_object.c | 1 - net/rxrpc/conn_service.c | 1 + net/rxrpc/proc.c | 2 +- net/rxrpc/recvmsg.c | 7 +++++-- net/rxrpc/rxkad.c | 2 +- net/rxrpc/security.c | 4 ++-- 10 files changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index cd34ffbff1d1..1e4ac889ec00 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -131,9 +131,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; - struct sock *sk = sock->sk; struct rxrpc_local *local; - struct rxrpc_sock *rx = rxrpc_sk(sk); + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); u16 service_id = srx->srx_service; int ret; @@ -152,7 +151,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) memcpy(&rx->srx, srx, sizeof(rx->srx)); - local = rxrpc_lookup_local(sock_net(sock->sk), &rx->srx); + local = rxrpc_lookup_local(sock_net(&rx->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 067dbb3121d0..de98a49adb35 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -386,7 +386,6 @@ struct rxrpc_connection { u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ u8 last_type; /* Type of last packet */ - u16 last_service_id; union { u32 last_seq; u32 last_abort; @@ -417,6 +416,7 @@ struct rxrpc_connection { atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 security_nonce; /* response re-use preventer */ + u16 service_id; /* Service ID, possibly upgraded */ u8 size_align; /* data size alignment (for security) */ u8 security_size; /* security header size */ u8 security_ix; /* security type */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index c86f3202f967..3f358bf424ad 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -188,6 +188,7 @@ 
rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) conn->params = *cp; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; + conn->service_id = cp->service_id; ret = rxrpc_get_client_connection_id(conn, gfp); if (ret < 0) @@ -343,6 +344,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, if (cp->exclusive) { call->conn = candidate; call->security_ix = candidate->security_ix; + call->service_id = candidate->service_id; _leave(" = 0 [exclusive %d]", candidate->debug_id); return 0; } @@ -392,6 +394,7 @@ candidate_published: set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); call->conn = candidate; call->security_ix = candidate->security_ix; + call->service_id = candidate->service_id; spin_unlock(&local->client_conns_lock); _leave(" = 0 [new %d]", candidate->debug_id); return 0; @@ -413,6 +416,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); call->conn = conn; call->security_ix = conn->security_ix; + call->service_id = conn->service_id; list_add(&call->chan_wait_link, &conn->waiting_calls); spin_unlock(&conn->channel_lock); _leave(" = 0 [extant %d]", conn->debug_id); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 46babcf82ce8..59a51a56e7c8 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -74,7 +74,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.whdr.userStatus = 0; pkt.whdr.securityIndex = conn->security_ix; pkt.whdr._rsvd = 0; - pkt.whdr.serviceId = htons(chan->last_service_id); + pkt.whdr.serviceId = htons(conn->service_id); len = sizeof(pkt.whdr); switch (chan->last_type) { @@ -208,7 +208,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, whdr.userStatus = 0; whdr.securityIndex = conn->security_ix; whdr._rsvd = 0; - whdr.serviceId = htons(conn->params.service_id); + whdr.serviceId = htons(conn->service_id); word = htonl(conn->local_abort); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ade4d3d0b2a7..5bb255107427 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -167,7 +167,6 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, * through the channel, whilst disposing of the actual call record. */ trace_rxrpc_disconnect_call(call); - chan->last_service_id = call->service_id; if (call->abort_code) { chan->last_abort = call->abort_code; chan->last_type = RXRPC_PACKET_TYPE_ABORT; diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index edfc633f7d5e..c7f8682a55b2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -160,6 +160,7 @@ void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; conn->params.service_id = sp->hdr.serviceId; + conn->service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; if (conn->security_ix) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index e92d8405b15a..7421656963a9 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -190,7 +190,7 @@ print: " %s %08x %08x %08x\n", lbuff, rbuff, - conn->params.service_id, + conn->service_id, conn->proto.cid, rxrpc_conn_is_service(conn) ? 
"Svc" : "Clt", atomic_read(&conn->usage), diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index f9caf3b77509..bdece21f313d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -522,8 +522,11 @@ try_again: } if (msg->msg_name) { - size_t len = sizeof(call->conn->params.peer->srx); - memcpy(msg->msg_name, &call->conn->params.peer->srx, len); + struct sockaddr_rxrpc *srx = msg->msg_name; + size_t len = sizeof(call->peer->srx); + + memcpy(msg->msg_name, &call->peer->srx, len); + srx->srx_service = call->service_id; msg->msg_namelen = len; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 29fe20ad04aa..46d1a1f0b55b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -649,7 +649,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) whdr.userStatus = 0; whdr.securityIndex = conn->security_ix; whdr._rsvd = 0; - whdr.serviceId = htons(conn->params.service_id); + whdr.serviceId = htons(conn->service_id); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 7d921e56e715..b9f5dbbe0b8b 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -121,7 +121,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) _enter(""); - sprintf(kdesc, "%u:%u", conn->params.service_id, conn->security_ix); + sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); sec = rxrpc_security_lookup(conn->security_ix); if (!sec) { @@ -133,7 +133,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) read_lock(&local->services_lock); rx = rcu_dereference_protected(local->service, lockdep_is_held(&local->services_lock)); - if (rx && rx->srx.srx_service == conn->params.service_id) + if (rx && rx->srx.srx_service == conn->service_id) goto found_service; /* the service appears to have died */ -- cgit v1.2.3 From 28036f44851e2515aa91b547b45cefddcac52ff6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Jun 2017 14:30:49 +0100 Subject: rxrpc: Permit multiple service binding Permit bind() to be called on an AF_RXRPC socket more than once (currently maximum twice) to bind multiple listening services to it. There are some restrictions: (1) All bind() calls involved must have a non-zero service ID. (2) The service IDs must all be different. (3) The rest of the address (notably the transport part) must be the same in all (a single UDP socket is shared). (4) This must be done before listen() or sendmsg() is called. This allows someone to connect to the service socket with different service IDs and lays the foundation for service upgrading. The service ID used by an incoming call can be extracted from the msg_name returned by recvmsg(). Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 4 +++ net/rxrpc/af_rxrpc.c | 62 ++++++++++++++++++++++++-------------- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_accept.c | 3 +- net/rxrpc/local_object.c | 1 + net/rxrpc/security.c | 3 +- 6 files changed, 51 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 1b63bbc6b94f..b7115ec55e04 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -600,6 +600,10 @@ A server would be set up to accept operations in the following manner: }; bind(server, &srx, sizeof(srx)); + More than one service ID may be bound to a socket, provided the transport + parameters are the same. The limit is currently two. To do this, bind() + should be called twice. 
+ (3) The server is then set to listen out for incoming calls: listen(server, 100); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1e4ac889ec00..3b982bca7d22 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -144,31 +144,48 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) lock_sock(&rx->sk); - if (rx->sk.sk_state != RXRPC_UNBOUND) { - ret = -EINVAL; - goto error_unlock; - } - - memcpy(&rx->srx, srx, sizeof(rx->srx)); + switch (rx->sk.sk_state) { + case RXRPC_UNBOUND: + rx->srx = *srx; + local = rxrpc_lookup_local(sock_net(&rx->sk), &rx->srx); + if (IS_ERR(local)) { + ret = PTR_ERR(local); + goto error_unlock; + } - local = rxrpc_lookup_local(sock_net(&rx->sk), &rx->srx); - if (IS_ERR(local)) { - ret = PTR_ERR(local); - goto error_unlock; - } + if (service_id) { + write_lock(&local->services_lock); + if (rcu_access_pointer(local->service)) + goto service_in_use; + rx->local = local; + rcu_assign_pointer(local->service, rx); + write_unlock(&local->services_lock); + + rx->sk.sk_state = RXRPC_SERVER_BOUND; + } else { + rx->local = local; + rx->sk.sk_state = RXRPC_CLIENT_BOUND; + } + break; - if (service_id) { - write_lock(&local->services_lock); - if (rcu_access_pointer(local->service)) - goto service_in_use; - rx->local = local; - rcu_assign_pointer(local->service, rx); - write_unlock(&local->services_lock); + case RXRPC_SERVER_BOUND: + ret = -EINVAL; + if (service_id == 0) + goto error_unlock; + ret = -EADDRINUSE; + if (service_id == rx->srx.srx_service) + goto error_unlock; + ret = -EINVAL; + srx->srx_service = rx->srx.srx_service; + if (memcmp(srx, &rx->srx, sizeof(*srx)) != 0) + goto error_unlock; + rx->second_service = service_id; + rx->sk.sk_state = RXRPC_SERVER_BOUND2; + break; - rx->sk.sk_state = RXRPC_SERVER_BOUND; - } else { - rx->local = local; - rx->sk.sk_state = RXRPC_CLIENT_BOUND; + default: + ret = -EINVAL; + goto error_unlock; } release_sock(&rx->sk); @@ -205,6 +222,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) ret = -EADDRNOTAVAIL; break; case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_BOUND2: ASSERT(rx->local != NULL); max = READ_ONCE(rxrpc_max_backlog); ret = -EINVAL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index de98a49adb35..781fbc253b5a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -61,6 +61,7 @@ enum { RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ + RXRPC_SERVER_BOUND2, /* second server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ @@ -142,6 +143,7 @@ struct rxrpc_sock { u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT bool exclusive; /* Exclusive connection for a client socket */ + u16 second_service; /* Additional service bound to the endpoint */ sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* local address */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index a8515b0d4717..544df53ccf79 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -341,7 +341,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, /* Get the socket providing the service */ rx = 
rcu_dereference(local->service); - if (rx && service_id == rx->srx.srx_service) + if (rx && (service_id == rx->srx.srx_service || + service_id == rx->second_service)) goto found_service; trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 17d79fd73ade..38b99db30e54 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -94,6 +94,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); + local->srx.srx_service = 0; } _leave(" = %p", local); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index b9f5dbbe0b8b..e9f428351293 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -133,7 +133,8 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) read_lock(&local->services_lock); rx = rcu_dereference_protected(local->service, lockdep_is_held(&local->services_lock)); - if (rx && rx->srx.srx_service == conn->service_id) + if (rx && (rx->srx.srx_service == conn->service_id || + rx->second_service == conn->service_id)) goto found_service; /* the service appears to have died */ -- cgit v1.2.3 From 4722974d90e06d0164ca1b73a6b34cec6bdb64ad Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Jun 2017 14:30:49 +0100 Subject: rxrpc: Implement service upgrade Implement AuriStor's service upgrade facility. There are three problems that this is meant to deal with: (1) Various of the standard AFS RPC calls have IPv4 addresses in their requests and/or replies - but there's no room for including IPv6 addresses. (2) Definition of IPv6-specific RPC operations in the standard operation sets has not yet been achieved. (3) One could envision the creation a new service on the same port that as the original service. The new service could implement improved operations - and the client could try this first, falling back to the original service if it's not there. Unfortunately, certain servers ignore packets addressed to a service they don't implement and don't respond in any way - not even with an ABORT. This means that the client must then wait for the call timeout to occur. What service upgrade does is to see if the connection is marked as being 'upgradeable' and if so, change the service ID in the server and thus the request and reply formats. Note that the upgrade isn't mandatory - a server that supports only the original call set will ignore the upgrade request. In the protocol, the procedure is then as follows: (1) To request an upgrade, the first DATA packet in a new connection must have the userStatus set to 1 (this is normally 0). The userStatus value is normally ignored by the server. (2) If the server doesn't support upgrading, the reply packets will contain the same service ID as for the first request packet. (3) If the server does support upgrading, all future reply packets on that connection will contain the new service ID and the new service ID will be applied to *all* further calls on that connection as well. (4) The RPC op used to probe the upgrade must take the same request data as the shadow call in the upgrade set (but may return a different reply). GetCapability RPC ops were added to all standard sets for just this purpose. Ops where the request formats differ cannot be used for probing. (5) The client must wait for completion of the probe before sending any further RPC ops to the same destination. 
It should then use the service ID that recvmsg() reported back in all future calls. (6) The shadow service must have call definitions for all the operation IDs defined by the original service. To support service upgrading, a server should: (1) Call bind() twice on its AF_RXRPC socket before calling listen(). Each bind() should supply a different service ID, but the transport addresses must be the same. This allows the server to receive requests with either service ID. (2) Enable automatic upgrading by calling setsockopt(), specifying RXRPC_UPGRADEABLE_SERVICE and passing in a two-member array of unsigned shorts as the argument: unsigned short optval[2]; This specifies a pair of service IDs. They must be different and must match the service IDs bound to the socket. Member 0 is the service ID to upgrade from and member 1 is the service ID to upgrade to. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 34 ++++++++++++++++++++++++++-------- include/linux/rxrpc.h | 1 + include/rxrpc/packet.h | 2 ++ net/rxrpc/af_rxrpc.c | 23 +++++++++++++++++++++++ net/rxrpc/ar-internal.h | 10 ++++++++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/conn_service.c | 11 ++++++++++- 7 files changed, 71 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index b7115ec55e04..2a1662760450 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -433,6 +433,13 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: Encrypted checksum plus entire packet padded and encrypted, including actual packet length. + (*) RXRPC_UPGRADEABLE_SERVICE + + This is used to indicate that a service socket with two bindings may + upgrade one bound service to the other if requested by the client. optval + must point to an array of two unsigned short ints. The first is the + service ID to upgrade from and the second the service ID to upgrade to. + ======== SECURITY @@ -588,7 +595,7 @@ A server would be set up to accept operations in the following manner: The keyring can be manipulated after it has been given to the socket. This permits the server to add more keys, replace keys, etc. whilst it is live. - (2) A local address must then be bound: + (3) A local address must then be bound: struct sockaddr_rxrpc srx = { .srx_family = AF_RXRPC, @@ -604,11 +611,22 @@ A server would be set up to accept operations in the following manner: parameters are the same. The limit is currently two. To do this, bind() should be called twice. - (3) The server is then set to listen out for incoming calls: + (4) If service upgrading is required, first two service IDs must have been + bound and then the following option must be set: + + unsigned short service_ids[2] = { from_ID, to_ID }; + setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE, + service_ids, sizeof(service_ids)); + + This will automatically upgrade connections on service from_ID to service + to_ID if they request it. This will be reflected in msg_name obtained + through recvmsg() when the request data is delivered to userspace. + + (5) The server is then set to listen out for incoming calls: listen(server, 100); - (4) The kernel notifies the server of pending incoming connections by sending + (6) The kernel notifies the server of pending incoming connections by sending it a message for each. This is received with recvmsg() on the server socket. 
It has no data, and has a single dataless control message attached: @@ -620,13 +638,13 @@ A server would be set up to accept operations in the following manner: the time it is accepted - in which case the first call still on the queue will be accepted. - (5) The server then accepts the new call by issuing a sendmsg() with two + (7) The server then accepts the new call by issuing a sendmsg() with two pieces of control data and no actual data: RXRPC_ACCEPT - indicate connection acceptance RXRPC_USER_CALL_ID - specify user ID for this call - (6) The first request data packet will then be posted to the server socket for + (8) The first request data packet will then be posted to the server socket for recvmsg() to pick up. At that point, the RxRPC address for the call can be read from the address fields in the msghdr struct. @@ -638,7 +656,7 @@ A server would be set up to accept operations in the following manner: RXRPC_USER_CALL_ID - specifies the user ID for this call - (8) The reply data should then be posted to the server socket using a series + (9) The reply data should then be posted to the server socket using a series of sendmsg() calls, each with the following control messages attached: RXRPC_USER_CALL_ID - specifies the user ID for this call @@ -646,7 +664,7 @@ A server would be set up to accept operations in the following manner: MSG_MORE should be set in msghdr::msg_flags on all but the last message for a particular call. - (9) The final ACK from the client will be posted for retrieval by recvmsg() +(10) The final ACK from the client will be posted for retrieval by recvmsg() when it is received. It will take the form of a dataless message with two control messages attached: @@ -656,7 +674,7 @@ A server would be set up to accept operations in the following manner: MSG_EOR will be flagged to indicate that this is the final message for this call. 
-(10) Up to the point the final packet of reply data is sent, the call can be +(11) Up to the point the final packet of reply data is sent, the call can be aborted by calling sendmsg() with a dataless message with the following control messages attached: diff --git a/include/linux/rxrpc.h b/include/linux/rxrpc.h index c68307bc306f..634116561a6a 100644 --- a/include/linux/rxrpc.h +++ b/include/linux/rxrpc.h @@ -37,6 +37,7 @@ struct sockaddr_rxrpc { #define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ #define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ #define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ +#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ /* * RxRPC control messages diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index 703a64b4681a..a2dcfb850b9f 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -58,6 +58,8 @@ struct rxrpc_wire_header { #define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ uint8_t userStatus; /* app-layer defined status */ +#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ + uint8_t securityIndex; /* security protocol ID */ union { __be16 _rsvd; /* reserved */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3b982bca7d22..0c4dc4a7832c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -490,6 +490,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); unsigned int min_sec_level; + u16 service_upgrade[2]; int ret; _enter(",%d,%d,,%d", level, optname, optlen); @@ -546,6 +547,28 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->min_sec_level = min_sec_level; goto success; + case RXRPC_UPGRADEABLE_SERVICE: + ret = -EINVAL; + if (optlen != sizeof(service_upgrade) || + rx->service_upgrade.from != 0) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) + goto error; + ret = -EFAULT; + if (copy_from_user(service_upgrade, optval, + sizeof(service_upgrade)) != 0) + goto error; + ret = -EINVAL; + if ((service_upgrade[0] != rx->srx.srx_service || + service_upgrade[1] != rx->second_service) && + (service_upgrade[0] != rx->second_service || + service_upgrade[1] != rx->srx.srx_service)) + goto error; + rx->service_upgrade.from = service_upgrade[0]; + rx->service_upgrade.to = service_upgrade[1]; + goto success; + default: break; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 781fbc253b5a..c1ebd886a53f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -144,8 +144,13 @@ struct rxrpc_sock { #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT bool exclusive; /* Exclusive connection for a client socket */ u16 second_service; /* Additional service bound to the endpoint */ + struct { + /* Service upgrade information */ + u16 from; /* Service ID to upgrade (if not 0) */ + u16 to; /* service ID to upgrade to */ + } service_upgrade; sa_family_t family; /* Protocol family created with */ - struct sockaddr_rxrpc srx; /* local address */ + struct sockaddr_rxrpc srx; /* Primary Service/local addresses */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ }; @@ -861,7 +866,8 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn) struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection 
*rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); -void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); +void rxrpc_new_incoming_connection(struct rxrpc_sock *, + struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 544df53ccf79..0d4d84e8c074 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -296,7 +296,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = local; conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(conn, skb); + rxrpc_new_incoming_connection(rx, conn, skb); } else { rxrpc_get_connection(conn); } diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index c7f8682a55b2..e60fcd2a4a02 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -150,7 +150,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn * Set up an incoming connection. This is called in BH context with the RCU * read lock held. */ -void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, +void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -168,6 +169,14 @@ void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, else conn->state = RXRPC_CONN_SERVICE; + /* See if we should upgrade the service. This can only happen on the + * first packet on a new connection. Once done, it applies to all + * subsequent calls on that connection. + */ + if (sp->hdr.userStatus == RXRPC_USERSTATUS_SERVICE_UPGRADE && + conn->service_id == rx->service_upgrade.from) + conn->service_id = rx->service_upgrade.to; + /* Make the connection a target for incoming packets. */ rxrpc_publish_service_conn(conn->params.peer, conn); -- cgit v1.2.3 From 4e255721d1575a766ada06dc7eb03acdcd34eaaf Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Jun 2017 14:30:49 +0100 Subject: rxrpc: Add service upgrade support for client connections Make it possible for a client to use AuriStor's service upgrade facility. The client does this by adding an RXRPC_UPGRADE_SERVICE control message to the first sendmsg() of a call. This takes no parameters. When recvmsg() starts returning data from the call, the service ID field in the returned msg_name will reflect the result of the upgrade attempt. If the upgrade was ignored, srx_service will match what was set in the sendmsg(); if the upgrade happened the srx_service will be altered to indicate the service the server upgraded to. Note that: (1) The choice of upgrade service is up to the server (2) Further client calls to the same server that would share a connection are blocked if an upgrade probe is in progress. (3) This should only be used to probe the service. Clients should then use the returned service ID in all subsequent communications with that server (and not set the upgrade). Note that the kernel will not retain this information should the connection expire from its cache. (4) If a server that supports upgrading is replaced by one that doesn't, whilst a connection is live, and if the replacement is running, say, OpenAFS 1.6.4 or older or an older IBM AFS, then the replacement server will not respond to packets sent to the upgraded connection. At this point, calls will time out and the server must be reprobed. 
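For illustration only (this sketch is not part of the patch itself), a client requests the upgrade by attaching a dataless RXRPC_UPGRADE_SERVICE control message, alongside the usual RXRPC_USER_CALL_ID, to the first sendmsg() of a call. The function name send_first_request and the parameters fd and call_id below are assumptions for the example; it presumes an AF_RXRPC client socket that has already been bound and connect()ed to the target service, and headers that provide RXRPC_UPGRADE_SERVICE (include/linux/rxrpc.h in this tree):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/rxrpc.h>	/* RXRPC_USER_CALL_ID, RXRPC_UPGRADE_SERVICE */

#ifndef SOL_RXRPC
#define SOL_RXRPC 272		/* value from include/linux/socket.h */
#endif

/* Send the first chunk of request data for a call and ask the server to
 * upgrade the service.  'fd' is an AF_RXRPC client socket that has already
 * been bound and connect()ed to the target service; 'call_id' is the
 * caller-chosen user call ID.  Error handling is omitted for brevity.
 */
static ssize_t send_first_request(int fd, const void *buf, size_t len,
				  unsigned long call_id, int more)
{
	char control[CMSG_SPACE(sizeof(call_id)) + CMSG_SPACE(0)];
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg;

	memset(control, 0, sizeof(control));

	/* RXRPC_USER_CALL_ID: tag the call so recvmsg() results can be
	 * matched back to it. */
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RXRPC;
	cmsg->cmsg_type = RXRPC_USER_CALL_ID;
	cmsg->cmsg_len = CMSG_LEN(sizeof(call_id));
	memcpy(CMSG_DATA(cmsg), &call_id, sizeof(call_id));

	/* RXRPC_UPGRADE_SERVICE: dataless; only meaningful on the sendmsg()
	 * that begins the call. */
	cmsg = CMSG_NXTHDR(&msg, cmsg);
	cmsg->cmsg_level = SOL_RXRPC;
	cmsg->cmsg_type = RXRPC_UPGRADE_SERVICE;
	cmsg->cmsg_len = CMSG_LEN(0);

	return sendmsg(fd, &msg, more ? MSG_MORE : 0);
}

When recvmsg() later returns data for this call, the srx_service field of the msg_name it fills in shows whether the server honoured the upgrade.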
Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 30 ++++++++++++++++++++++++++ include/linux/rxrpc.h | 1 + include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/conn_client.c | 43 +++++++++++++++++++++++++++++++------- net/rxrpc/input.c | 17 +++++++++++++++ net/rxrpc/output.c | 4 ++++ net/rxrpc/sendmsg.c | 19 +++++++++++++---- 8 files changed, 106 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 2a1662760450..18078e630a63 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -325,6 +325,8 @@ calls, to invoke certain actions and to report certain conditions. These are: RXRPC_LOCAL_ERROR -rt error num Local error encountered RXRPC_NEW_CALL -r- n/a New call received RXRPC_ACCEPT s-- n/a Accept new call + RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call + RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) @@ -387,6 +389,23 @@ calls, to invoke certain actions and to report certain conditions. These are: return error ENODATA. If the user ID is already in use by another call, then error EBADSLT will be returned. + (*) RXRPC_EXCLUSIVE_CALL + + This is used to indicate that a client call should be made on a one-off + connection. The connection is discarded once the call has terminated. + + (*) RXRPC_UPGRADE_SERVICE + + This is used to make a client call to probe if the specified service ID + may be upgraded by the server. The caller must check msg_name returned to + recvmsg() for the service ID actually in use. The operation probed must + be one that takes the same arguments in both services. + + Once this has been used to establish the upgrade capability (or lack + thereof) of the server, the service ID returned should be used for all + future communication to that server and RXRPC_UPGRADE_SERVICE should no + longer be set. + ============== SOCKET OPTIONS @@ -566,6 +585,17 @@ A client would issue an operation by: buffer instead, and MSG_EOR will be flagged to indicate the end of that call. +A client may ask for a service ID it knows and ask that this be upgraded to a +better service if one is available by supplying RXRPC_UPGRADE_SERVICE on the +first sendmsg() of a call. The client should then check srx_service in the +msg_name filled in by recvmsg() when collecting the result. srx_service will +hold the same value as given to sendmsg() if the upgrade request was ignored by +the service - otherwise it will be altered to indicate the service ID the +server upgraded to. Note that the upgraded service ID is chosen by the server. +The caller has to wait until it sees the service ID in the reply before sending +any more calls (further calls to the same destination will be blocked until the +probe is concluded). 
+ ==================== EXAMPLE SERVER USAGE diff --git a/include/linux/rxrpc.h b/include/linux/rxrpc.h index 634116561a6a..707910c6c6c5 100644 --- a/include/linux/rxrpc.h +++ b/include/linux/rxrpc.h @@ -54,6 +54,7 @@ struct sockaddr_rxrpc { #define RXRPC_NEW_CALL 8 /* -r: [Service] new incoming call notification */ #define RXRPC_ACCEPT 9 /* s-: [Service] accept request */ #define RXRPC_EXCLUSIVE_CALL 10 /* s-: Call should be on exclusive connection */ +#define RXRPC_UPGRADE_SERVICE 11 /* s-: Request service upgrade for client call */ /* * RxRPC security levels diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 29a3d53a4015..ebe96796027a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -233,6 +233,7 @@ enum rxrpc_congest_change { EM(RXRPC_CONN_CLIENT_INACTIVE, "Inac") \ EM(RXRPC_CONN_CLIENT_WAITING, "Wait") \ EM(RXRPC_CONN_CLIENT_ACTIVE, "Actv") \ + EM(RXRPC_CONN_CLIENT_UPGRADE, "Upgd") \ EM(RXRPC_CONN_CLIENT_CULLED, "Cull") \ E_(RXRPC_CONN_CLIENT_IDLE, "Idle") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c1ebd886a53f..e9b536cb0acf 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -320,6 +320,7 @@ struct rxrpc_conn_parameters { struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ + bool upgrade; /* T if service ID can be upgraded */ u16 service_id; /* Service ID for this connection */ u32 security_level; /* Security level selected */ }; @@ -334,6 +335,7 @@ enum rxrpc_conn_flag { RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ + RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ }; /* @@ -350,6 +352,7 @@ enum rxrpc_conn_cache_state { RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */ RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */ RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ + RXRPC_CONN_CLIENT_UPGRADE, /* Conn is on active list, probing for upgrade */ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ RXRPC_CONN__NR_CACHE_STATES diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3f358bf424ad..dd8bb919c15a 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -36,12 +36,15 @@ * * rxrpc_nr_active_client_conns is held incremented also. * - * (4) CULLED - The connection got summarily culled to try and free up + * (4) UPGRADE - As for ACTIVE, but only one call may be in progress and is + * being used to probe for service upgrade. + * + * (5) CULLED - The connection got summarily culled to try and free up * capacity. Calls currently in progress on the connection are allowed to * continue, but new calls will have to wait. There can be no waiters in * this state - the conn would have to go to the WAITING state instead. * - * (5) IDLE - The connection has no calls in progress upon it and must have + * (6) IDLE - The connection has no calls in progress upon it and must have * been exposed to the world (ie. the EXPOSED flag must be set). When it * expires, the EXPOSED flag is cleared and the connection transitions to * the INACTIVE state. 
@@ -184,6 +187,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) atomic_set(&conn->usage, 1); if (cp->exclusive) __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + if (cp->upgrade) + __set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); conn->params = *cp; conn->out_clientflag = RXRPC_CLIENT_INITIATED; @@ -300,7 +305,8 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, #define cmp(X) ((long)conn->params.X - (long)cp->X) diff = (cmp(peer) ?: cmp(key) ?: - cmp(security_level)); + cmp(security_level) ?: + cmp(upgrade)); #undef cmp if (diff < 0) { p = p->rb_left; @@ -365,7 +371,8 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, #define cmp(X) ((long)conn->params.X - (long)candidate->params.X) diff = (cmp(peer) ?: cmp(key) ?: - cmp(security_level)); + cmp(security_level) ?: + cmp(upgrade)); #undef cmp if (diff < 0) { pp = &(*pp)->rb_left; @@ -436,8 +443,13 @@ error: static void rxrpc_activate_conn(struct rxrpc_net *rxnet, struct rxrpc_connection *conn) { - trace_rxrpc_client(conn, -1, rxrpc_client_to_active); - conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; + if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_upgrade); + conn->cache_state = RXRPC_CONN_CLIENT_UPGRADE; + } else { + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); + conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; + } rxnet->nr_active_client_conns++; list_move_tail(&conn->cache_link, &rxnet->active_client_conns); } @@ -461,7 +473,8 @@ static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet, _enter("%d,%d", conn->debug_id, conn->cache_state); - if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE) + if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE || + conn->cache_state == RXRPC_CONN_CLIENT_UPGRADE) goto out; spin_lock(&rxnet->client_conn_cache_lock); @@ -474,6 +487,7 @@ static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet, switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: + case RXRPC_CONN_CLIENT_UPGRADE: case RXRPC_CONN_CLIENT_WAITING: break; @@ -577,6 +591,9 @@ static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn) case RXRPC_CONN_CLIENT_ACTIVE: mask = RXRPC_ACTIVE_CHANS_MASK; break; + case RXRPC_CONN_CLIENT_UPGRADE: + mask = 0x01; + break; default: return; } @@ -787,6 +804,15 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) spin_lock(&rxnet->client_conn_cache_lock); switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_UPGRADE: + /* Deal with termination of a service upgrade probe. 
*/ + if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); + trace_rxrpc_client(conn, channel, rxrpc_client_to_active); + conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; + rxrpc_activate_channels_locked(conn); + } + /* fall through */ case RXRPC_CONN_CLIENT_ACTIVE: if (list_empty(&conn->waiting_calls)) { rxrpc_deactivate_one_channel(conn, channel); @@ -941,7 +967,8 @@ static void rxrpc_cull_active_client_conns(struct rxrpc_net *rxnet) ASSERT(!list_empty(&rxnet->active_client_conns)); conn = list_entry(rxnet->active_client_conns.next, struct rxrpc_connection, cache_link); - ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); + ASSERTIFCMP(conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE, + conn->cache_state, ==, RXRPC_CONN_CLIENT_UPGRADE); if (list_empty(&conn->waiting_calls)) { trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 45dba732a3b4..e56e23ed2229 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1142,6 +1142,13 @@ void rxrpc_data_ready(struct sock *udp_sk) if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; + if (sp->hdr.serviceId != conn->service_id) { + if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) || + conn->service_id != conn->params.service_id) + goto reupgrade; + conn->service_id = sp->hdr.serviceId; + } + if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); @@ -1194,6 +1201,9 @@ void rxrpc_data_ready(struct sock *udp_sk) rxrpc_input_implicit_end_call(conn, call); call = NULL; } + + if (call && sp->hdr.serviceId != call->service_id) + call->service_id = sp->hdr.serviceId; } else { skew = 0; call = NULL; @@ -1237,11 +1247,18 @@ wrong_security: skb->priority = RXKADINCONSISTENCY; goto post_abort; +reupgrade: + rcu_read_unlock(); + trace_rxrpc_abort("UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); + goto protocol_error; + bad_message_unlock: rcu_read_unlock(); bad_message: trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); +protocol_error: skb->priority = RX_PROTOCOL_ERROR; post_abort: skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5dab1ff3a6c2..5bd2d0fa4a03 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -292,6 +292,10 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, whdr._rsvd = htons(sp->hdr._rsvd); whdr.serviceId = htons(call->service_id); + if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && + sp->hdr.seq == 1) + whdr.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = skb->head; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 96ffa5d5733b..5a4801e7f560 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -366,7 +366,8 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, unsigned long *user_call_ID, enum rxrpc_command *command, u32 *abort_code, - bool *_exclusive) + bool *_exclusive, + bool *_upgrade) { struct cmsghdr *cmsg; bool got_user_ID = false; @@ -429,6 +430,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, if (len != 0) return -EINVAL; break; + + case RXRPC_UPGRADE_SERVICE: + *_upgrade = true; + if (len != 0) + return -EINVAL; + break; + default: return -EINVAL; } @@ -447,7 +455,8 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, */ static struct rxrpc_call * 
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, - unsigned long user_call_ID, bool exclusive) + unsigned long user_call_ID, bool exclusive, + bool upgrade) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_conn_parameters cp; @@ -472,6 +481,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | exclusive; + cp.upgrade = upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); /* The socket is now unlocked */ @@ -493,13 +503,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) struct rxrpc_call *call; unsigned long user_call_ID = 0; bool exclusive = false; + bool upgrade = false; u32 abort_code = 0; int ret; _enter(""); ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, - &exclusive); + &exclusive, &upgrade); if (ret < 0) goto error_release_sock; @@ -521,7 +532,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (cmd != RXRPC_CMD_SEND_DATA) goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, - exclusive); + exclusive, upgrade); /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); -- cgit v1.2.3 From 8ec1507dc9d16ecd9ada2f67efa95f8d586335fb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 4 Jun 2017 18:49:28 +0200 Subject: net: sched: select cls when cls_act is enabled It really makes no sense to have cls_act enabled without cls. In that case, the cls_act code is dead. So select it. This also fixes an issue recently reported by kbuild robot: [linux-next:master 1326/4151] net/sched/act_api.c:37:18: error: implicit declaration of function 'tcf_chain_get' Reported-by: kbuild test robot Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9fb84f0de6af..e70ed26485a2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -649,6 +649,7 @@ config NET_EMATCH_IPSET config NET_CLS_ACT bool "Actions" + select NET_CLS ---help--- Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful -- cgit v1.2.3 From 6044bd4a7d580d4459b992bc6631c817486a1514 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Mon, 5 Jun 2017 08:57:21 +0800 Subject: devlink: fix potential memory leak We must free the allocated skb when genlmsg_put() fails. Fixes: 1555d204e743 ("devlink: Support for pipeline debug (dpipe)") Signed-off-by: Haishuang Yan Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/core/devlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index b0b87a292e7c..a0adfc31a3fe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1680,8 +1680,10 @@ start_again: hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; @@ -2098,8 +2100,10 @@ start_again: hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; -- cgit v1.2.3 From f4eb17e1efe538d4da7d574bedb00a8dafcc26b7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Jun 2017 11:34:06 -0400 Subject: Revert "sit: reload iphdr in ipip6_rcv" This reverts commit b699d0035836f6712917a41e7ae58d84359b8ff9. As per Eric Dumazet, the pskb_may_pull() is a NOP in this particular case, so the 'iph' reload is unnecessary. Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index af832e7ce80f..61e5902f0687 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -657,7 +657,6 @@ static int ipip6_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; - iph = ip_hdr(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { -- cgit v1.2.3 From e25ea21ffa66a029acfa89d2611c0e7ef23e7d8c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 6 Jun 2017 14:12:02 +0200 Subject: net: sched: introduce a TRAP control action There is need to instruct the HW offloaded path to push certain matched packets to cpu/kernel for further analysis. So this patch introduces a new TRAP control action to TC. For kernel datapath, this action does not make much sense. So with the same logic as in HW, new TRAP behaves similar to STOLEN. The skb is just dropped in the datapath (and virtually ejected to an upper level, which does not exist in case of kernel). Signed-off-by: Jiri Pirko Reviewed-by: Yotam Gigi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 7 +++++++ net/core/dev.c | 2 ++ net/sched/cls_bpf.c | 1 + net/sched/sch_atm.c | 1 + net/sched/sch_cbq.c | 1 + net/sched/sch_drr.c | 1 + net/sched/sch_dsmark.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hfsc.c | 1 + net/sched/sch_htb.c | 1 + net/sched/sch_multiq.c | 1 + net/sched/sch_prio.c | 1 + net/sched/sch_qfq.c | 1 + net/sched/sch_sfb.c | 1 + net/sched/sch_sfq.c | 1 + 15 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index edf43ddf47b0..2055783e6ee9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -37,6 +37,13 @@ enum { #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 #define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ /* There is a special kind of actions called "extended actions", * which need a value parameter. 
These have a local opcode located in diff --git a/net/core/dev.c b/net/core/dev.c index 06e0a7492df8..8f72f4a9c6ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3269,6 +3269,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; @@ -4038,6 +4039,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5ebeae996e63..a9c56ad4533a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -70,6 +70,7 @@ static int cls_bpf_exec_opcode(int code) case TC_ACT_OK: case TC_ACT_SHOT: case TC_ACT_STOLEN: + case TC_ACT_TRAP: case TC_ACT_REDIRECT: case TC_ACT_UNSPEC: return code; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f435546c3864..de162592eee0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -406,6 +406,7 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 8dd6d0aca678..481036f6b54e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -254,6 +254,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5db2a2843c66..a413dc1c2098 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -339,6 +339,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 7ccdd825d34e..6d94fcc3592a 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -243,6 +243,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index f201e73947fb..337f2d6d81e4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -103,6 +103,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index a324f84b1ccd..b52f74610dc7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1155,6 +1155,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 195bbca9eb0b..203286ab4427 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -238,6 +238,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) 
{ case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 604767482ad0..f143b7bbaa0d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -52,6 +52,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a2404688dd01..e3e364cc9a70 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -48,6 +48,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 076ad032befb..0e16dfda0bd7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -726,6 +726,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9756b1ccd345..11fb6ec878d6 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -266,6 +266,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return false; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 66dfd15b7946..f80ea2cc5f1f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; -- cgit v1.2.3 From e3ebdb20fddacded2740a333ff66781e0d28b05c Mon Sep 17 00:00:00 2001 From: Richard Haines Date: Mon, 5 Jun 2017 16:44:40 +0100 Subject: net/ipv6: Fix CALIPSO causing GPF with datagram support When using CALIPSO with IPPROTO_UDP it is possible to trigger a GPF as the IP header may have moved. Also update the payload length after adding the CALIPSO option. Signed-off-by: Richard Haines Acked-by: Paul Moore Signed-off-by: Huw Davies Signed-off-by: David S. 
Miller --- net/ipv6/calipso.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 37ac9de713c6..8d772fea1dde 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1319,7 +1319,7 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; - int len_delta, new_end, pad; + int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); @@ -1346,6 +1346,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, if (ret_val < 0) return ret_val; + ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ + if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); @@ -1355,6 +1357,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); + payload = ntohs(ip6_hdr->payload_len); + ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); -- cgit v1.2.3 From 1020ce3108cc26fbf09d70550ea2937cb1a211d2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Jun 2017 01:26:24 +0300 Subject: net: bridge: fix a null pointer dereference in br_afspec We might call br_afspec() with p == NULL which is a valid use case if the action is on the bridge device itself, but the bridge tunnel code dereferences the p pointer without checking, so check if p is null first. Reported-by: Gustavo A. R. Silva Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 574f78824d8a..32bd3ead9ba1 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -595,7 +595,7 @@ static int br_afspec(struct net_bridge *br, err = 0; switch (nla_type(attr)) { case IFLA_BRIDGE_VLAN_TUNNEL_INFO: - if (!(p->flags & BR_VLAN_TUNNEL)) + if (!p || !(p->flags & BR_VLAN_TUNNEL)) return -EINVAL; err = br_parse_vlan_tunnel_info(attr, &tinfo_curr); if (err) -- cgit v1.2.3
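To close, a hedged illustration of the TC_ACT_TRAP pattern added across net/sched in the commit above: in the software datapath the new verdict is handled exactly like TC_ACT_STOLEN and TC_ACT_QUEUED, i.e. the skb is consumed and reported as a successful, "stolen" transmit, rather than being mapped to a class. The sketch below is self-contained and not taken from any of the patches; example_class, classify_result and the NET_XMIT_* stand-in definitions are illustrative assumptions, and it needs uapi headers new enough to define TC_ACT_TRAP:

#include <stdio.h>
#include <linux/pkt_cls.h>	/* TC_ACT_OK, TC_ACT_QUEUED, TC_ACT_STOLEN,
				 * TC_ACT_SHOT, TC_ACT_TRAP */

/* Stand-ins for kernel-internal values; see include/linux/netdevice.h. */
#define NET_XMIT_SUCCESS	0x00
#define __NET_XMIT_STOLEN	0x00010000

struct example_class { int classid; };

/* Mirrors the classify pattern extended by the TRAP patch: QUEUED, STOLEN
 * and now TRAP consume the skb and report a successful "stolen" transmit,
 * SHOT drops it, and any other verdict maps to a class as usual.
 */
static struct example_class *classify_result(int result,
					     struct example_class *cl,
					     int *qerr)
{
	switch (result) {
	case TC_ACT_QUEUED:
	case TC_ACT_STOLEN:
	case TC_ACT_TRAP:
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		/* fall through */
	case TC_ACT_SHOT:
		return NULL;
	}
	return cl;
}

int main(void)
{
	struct example_class cl = { .classid = 1 };
	int qerr = 0;

	printf("TC_ACT_TRAP -> %p, qerr=%#x\n",
	       (void *)classify_result(TC_ACT_TRAP, &cl, &qerr), qerr);
	printf("TC_ACT_OK   -> %p, qerr=%#x\n",
	       (void *)classify_result(TC_ACT_OK, &cl, &qerr), qerr);
	return 0;
}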