From 6341e62b212a2541efb0160c470e90bd226d5496 Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Sat, 20 Dec 2014 15:41:11 -0500 Subject: kconfig: use bool instead of boolean for type definition attributes Support for keyword 'boolean' will be dropped later on. No functional change. Reference: http://lkml.kernel.org/r/cover.1418003065.git.cj@linux.com Signed-off-by: Christoph Jaeger Signed-off-by: Michal Marek --- net/Kconfig | 14 +++++++------- net/sched/Kconfig | 2 +- net/switchdev/Kconfig | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index ff9ffc17fa0e..44dd5786ee91 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -231,18 +231,18 @@ source "net/hsr/Kconfig" source "net/switchdev/Kconfig" config RPS - boolean + bool depends on SMP && SYSFS default y config RFS_ACCEL - boolean + bool depends on RPS select CPU_RMAP default y config XPS - boolean + bool depends on SMP default y @@ -254,18 +254,18 @@ config CGROUP_NET_PRIO a per-interface basis. config CGROUP_NET_CLASSID - boolean "Network classid cgroup" + bool "Network classid cgroup" depends on CGROUPS ---help--- Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. config NET_RX_BUSY_POLL - boolean + bool default y config BQL - boolean + bool depends on SYSFS select DQL default y @@ -282,7 +282,7 @@ config BPF_JIT this feature changing /proc/sys/net/core/bpf_jit_enable config NET_FLOW_LIMIT - boolean + bool depends on RPS default y ---help--- diff --git a/net/sched/Kconfig b/net/sched/Kconfig index c54c9d9d1ffb..706af73c969f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -348,7 +348,7 @@ config NET_SCH_PLUG comment "Classification" config NET_CLS - boolean + bool config NET_CLS_BASIC tristate "Elementary classification (BASIC)" diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig index 155754588fd6..86a47e17cfaf 100644 --- a/net/switchdev/Kconfig +++ b/net/switchdev/Kconfig @@ -3,7 +3,7 @@ # config NET_SWITCHDEV - boolean "Switch (and switch-ish) device support (EXPERIMENTAL)" + bool "Switch (and switch-ish) device support (EXPERIMENTAL)" depends on INET ---help--- This module provides glue between core networking code and device -- cgit v1.2.3 From 7754f53e944122fa95ef36039d3a5009910ce6fc Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 12 Jan 2015 16:23:37 +0200 Subject: virtio/9p: verify device has config space Some devices might not implement config space access (e.g. remoteproc used not to - before 3.9). virtio/9p needs config space access so make it fail gracefully if not there. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Rusty Russell --- net/9p/trans_virtio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index daa749c8b3fb..d8e376a5f0f1 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -524,6 +524,12 @@ static int p9_virtio_probe(struct virtio_device *vdev) int err; struct virtio_chan *chan; + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); if (!chan) { pr_err("Failed to allocate virtio 9P channel\n"); -- cgit v1.2.3 From dd3733b3e798daf778a1ec08557f388f00fdc2f6 Mon Sep 17 00:00:00 2001 From: Alexey Andriyanov Date: Fri, 6 Feb 2015 22:32:20 +0300 Subject: ipvs: fix inability to remove a mixed-family RS The current code prevents any operation with a mixed-family dest unless IP_VS_CONN_F_TUNNEL flag is set. The problem is that it's impossible for the client to follow this rule, because ip_vs_genl_parse_dest does not even read the destination conn_flags when cmd = IPVS_CMD_DEL_DEST (need_full_dest = 0). Also, not every client can pass this flag when removing a dest. ipvsadm, for example, does not support the "-i" command line option together with the "-d" option. This change disables any checks for mixed-family on IPVS_CMD_DEL_DEST command. Signed-off-by: Alexey Andriyanov Fixes: bc18d37f676f ("ipvs: Allow heterogeneous pools now that we support them") Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b8295a430a56..fdcda8be1f0f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3399,7 +3399,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; - if (udest.af != svc->af) { + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ -- cgit v1.2.3 From 813b00d63f6ca1ed40a2f4f9c034d59bc424025e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Feb 2015 13:08:25 -0500 Subject: SUNRPC: Always manipulate rpc_rqst::rq_bc_pa_list under xprt->bc_pa_lock Other code that accesses rq_bc_pa_list holds xprt->bc_pa_lock. xprt_complete_bc_request() should do the same. Fixes: 2ea24497a1b3 ("SUNRPC: RPC callbacks may be split . . 
.") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/backchannel_rqst.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 651f49ab601f..9dd0ea8db463 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -309,12 +309,15 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) struct rpc_xprt *xprt = req->rq_xprt; struct svc_serv *bc_serv = xprt->bc_serv; + spin_lock(&xprt->bc_pa_lock); + list_del(&req->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + req->rq_private_buf.len = copied; set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); dprintk("RPC: add callback request to list\n"); spin_lock(&bc_serv->sv_cb_lock); - list_del(&req->rq_bc_pa_list); list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); wake_up(&bc_serv->sv_cb_waitq); spin_unlock(&bc_serv->sv_cb_lock); -- cgit v1.2.3 From 520aa7414bb590f39d0d1591b06018e60cbc7cf4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 12 Feb 2015 22:15:31 +0100 Subject: netfilter: nft_compat: fix module refcount underflow Feb 12 18:20:42 nfdev kernel: ------------[ cut here ]------------ Feb 12 18:20:42 nfdev kernel: WARNING: CPU: 4 PID: 4359 at kernel/module.c:963 module_put+0x9b/0xba() Feb 12 18:20:42 nfdev kernel: CPU: 4 PID: 4359 Comm: ebtables-compat Tainted: G W 3.19.0-rc6+ #43 [...] Feb 12 18:20:42 nfdev kernel: Call Trace: Feb 12 18:20:42 nfdev kernel: [] dump_stack+0x4c/0x65 Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_common+0x9c/0xb6 Feb 12 18:20:42 nfdev kernel: [] ? module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_null+0x15/0x17 Feb 12 18:20:42 nfdev kernel: [] module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] nft_match_destroy+0x45/0x4c Feb 12 18:20:42 nfdev kernel: [] nf_tables_rule_destroy+0x28/0x70 Reported-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- net/netfilter/nft_compat.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 265e190f2218..b6364869c2e0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -578,8 +578,12 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct xt_match *match = nft_match->ops.data; if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) + match->revision == rev && match->family == family) { + if (!try_module_get(match->me)) + return ERR_PTR(-ENOENT); + return &nft_match->ops; + } } match = xt_request_find_match(family, mt_name, rev); @@ -648,8 +652,12 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct xt_target *target = nft_target->ops.data; if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) + target->revision == rev && target->family == family) { + if (!try_module_get(target->me)) + return ERR_PTR(-ENOENT); + return &nft_target->ops; + } } target = xt_request_find_target(family, tg_name, rev); -- cgit v1.2.3 From cef9ed86ed62eeffcd017882278bbece32001f86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Feb 2015 12:47:50 +0100 Subject: netfilter: xt_recent: don't reject rule if new hitcount exceeds table max given: -A INPUT -m recent --update --seconds 30 --hitcount 4 and iptables-save > foo then iptables-restore < foo will fail with: kernel: xt_recent: hitcount (4) is larger than packets to be remembered (4) 
for table DEFAULT Even when the check is fixed, the restore won't work if the hitcount is increased to e.g. 6, since by the time checkentry runs it will find the 'old' incarnation of the table. We can avoid this by increasing the maximum threshold silently; we only have to rm all the current entries of the table (these entries would not have enough room to handle the increased hitcount). This even makes (not-very-useful) -A INPUT -m recent --update --seconds 30 --hitcount 4 -A INPUT -m recent --update --seconds 30 --hitcount 42 work. Fixes: abc86d0f99242b7f142b (netfilter: xt_recent: relax ip_pkt_list_tot restrictions) Tracked-down-by: Chris Vine Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 30dbe34915ae..45e1b30e4fb2 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -378,12 +378,11 @@ static int recent_mt_check(const struct xt_mtchk_param *par, mutex_lock(&recent_mutex); t = recent_table_lookup(recent_net, info->name); if (t != NULL) { - if (info->hit_count > t->nstamps_max_mask) { - pr_info("hitcount (%u) is larger than packets to be remembered (%u) for table %s\n", - info->hit_count, t->nstamps_max_mask + 1, - info->name); - ret = -EINVAL; - goto out; + if (nstamp_mask > t->nstamps_max_mask) { + spin_lock_bh(&recent_lock); + recent_table_flush(t); + t->nstamps_max_mask = nstamp_mask; + spin_unlock_bh(&recent_lock); } t->refcnt++; -- cgit v1.2.3 From 78296c97ca1fd3b104f12e1f1fbc06c46635990b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Feb 2015 19:03:45 -0800 Subject: netfilter: xt_socket: fix a stack corruption bug As soon as extract_icmp6_fields() returns, its local storage (automatic variables) is deallocated and can be overwritten. Lets add an additional parameter to make sure storage is valid long enough. While we are at it, adds some const qualifiers. 
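For reference, the lifetime problem this patch fixes reduces to a plain C pattern: a helper parses a header into one of its own automatic variables and hands pointers into that variable back to the caller, so the pointers dangle the moment the helper returns. The sketch below is a simplified standalone model of that pattern and of the fix (caller-supplied scratch storage, mirroring the new ipv6_var argument); the names and types in it are invented for illustration and are not the kernel code itself.

#include <stdio.h>
#include <string.h>

struct hdr {
        unsigned char addr[16];
};

/* Buggy shape: the returned pointer refers to 'tmp', an automatic
 * variable that ceases to exist the moment parse_bad() returns. */
static const unsigned char *parse_bad(const unsigned char *pkt)
{
        struct hdr tmp;

        memcpy(&tmp, pkt, sizeof(tmp));
        return tmp.addr;                /* dangling as soon as we return */
}

/* Fixed shape, mirroring the extra scratch argument added by the patch:
 * the caller owns the storage, so pointers into it stay valid for as
 * long as the caller keeps the variable alive. */
static const unsigned char *parse_good(const unsigned char *pkt,
                                       struct hdr *scratch)
{
        memcpy(scratch, pkt, sizeof(*scratch));
        return scratch->addr;
}

int main(void)
{
        unsigned char pkt[64] = { 0xfe, 0x80 };
        struct hdr scratch;
        const unsigned char *a = parse_good(pkt, &scratch);

        printf("first byte: %02x\n", a[0]);
        (void)parse_bad;                /* shown for contrast only */
        return 0;
}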
Signed-off-by: Eric Dumazet Fixes: b64c9256a9b76 ("tproxy: added IPv6 support to the socket match") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1ba67931eb1b..13332dbf291d 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -243,12 +243,13 @@ static int extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen, int *protocol, - struct in6_addr **raddr, - struct in6_addr **laddr, + const struct in6_addr **raddr, + const struct in6_addr **laddr, __be16 *rport, - __be16 *lport) + __be16 *lport, + struct ipv6hdr *ipv6_var) { - struct ipv6hdr *inside_iph, _inside_iph; + const struct ipv6hdr *inside_iph; struct icmp6hdr *icmph, _icmph; __be16 *ports, _ports[2]; u8 inside_nexthdr; @@ -263,12 +264,14 @@ extract_icmp6_fields(const struct sk_buff *skb, if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) return 1; - inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), + sizeof(*ipv6_var), ipv6_var); if (inside_iph == NULL) return 1; inside_nexthdr = inside_iph->nexthdr; - inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + + sizeof(*ipv6_var), &inside_nexthdr, &inside_fragoff); if (inside_hdrlen < 0) return 1; /* hjm: Packet has no/incomplete transport layer headers. */ @@ -315,10 +318,10 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, static bool socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr ipv6_var, *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; struct sock *sk = skb->sk; - struct in6_addr *daddr = NULL, *saddr = NULL; + const struct in6_addr *daddr = NULL, *saddr = NULL; __be16 uninitialized_var(dport), uninitialized_var(sport); int thoff = 0, uninitialized_var(tproto); const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; @@ -342,7 +345,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) } else if (tproto == IPPROTO_ICMPV6) { if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, - &sport, &dport)) + &sport, &dport, &ipv6_var)) return false; } else { return false; -- cgit v1.2.3 From a1d1e9be5a1dafe0ddc2181a9201c2ae29c71eff Mon Sep 17 00:00:00 2001 From: David Ramos Date: Fri, 13 Feb 2015 13:11:51 -0800 Subject: svcrpc: fix memory leak in gssp_accept_sec_context_upcall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our UC-KLEE tool found a kernel memory leak of 512 bytes (on x86_64) for each call to gssp_accept_sec_context_upcall() (net/sunrpc/auth_gss/gss_rpc_upcall.c). Since it appears that this call can be triggered by remote connections (at least, from a cursory a glance at the call chain), it may be exploitable to cause kernel memory exhaustion. We found the bug in kernel 3.16.3, but it appears to date back to commit 9dfd87da1aeb0fd364167ad199f40fe96a6a87be (2013-08-20). The gssp_accept_sec_context_upcall() function performs a pair of calls to gssp_alloc_receive_pages() and gssp_free_receive_pages(). The first allocates memory for arg->pages. 
The second then frees the pages pointed to by the arg->pages array, but not the array itself. Reported-by: David A. Ramos Fixes: 9dfd87da1aeb ("rpc: fix huge kmalloc's in gss-proxy”) Signed-off-by: David A. Ramos Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index abbb7dcd1689..59eeed43eda2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -217,6 +217,8 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); + + kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) -- cgit v1.2.3 From 7a6fdeb2b1e93548854063c46c9724458564c76b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 22 Dec 2014 19:14:26 +0300 Subject: libceph: nuke pool op infrastructure On Mon, Dec 22, 2014 at 5:35 PM, Sage Weil wrote: > On Mon, 22 Dec 2014, Ilya Dryomov wrote: >> Actually, pool op stuff has been unused for over two years - looks like >> it was added for rbd create_snap and that got ripped out in 2012. It's >> unlikely we'd ever need to manage pools or snaps from the kernel client >> so I think it makes sense to nuke it. Sage? > > Yep! Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 36 ----------- include/linux/ceph/mon_client.h | 9 +-- net/ceph/ceph_strings.c | 14 ----- net/ceph/debugfs.c | 2 - net/ceph/mon_client.c | 135 +--------------------------------------- 5 files changed, 4 insertions(+), 192 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c0dadaac26e3..69e2c9e2305b 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -158,17 +158,6 @@ enum { }; -/* pool operations */ -enum { - POOL_OP_CREATE = 0x01, - POOL_OP_DELETE = 0x02, - POOL_OP_AUID_CHANGE = 0x03, - POOL_OP_CREATE_SNAP = 0x11, - POOL_OP_DELETE_SNAP = 0x12, - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, -}; - struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -191,31 +180,6 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); -const char *ceph_pool_op_name(int op); - -struct ceph_mon_poolop { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 pool; - __le32 op; - __le64 auid; - __le64 snapid; - __le32 name_len; -} __attribute__ ((packed)); - -struct ceph_mon_poolop_reply { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 reply_code; - __le32 epoch; - char has_data; - char data[0]; -} __attribute__ ((packed)); - -struct ceph_mon_unmanaged_snap { - __le64 snapid; -} __attribute__ ((packed)); - struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index deb47e45ac7c..81810dc21f06 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -40,7 +40,7 @@ struct ceph_mon_request { }; /* - * ceph_mon_generic_request is being used for the statfs, poolop and + * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ @@ -50,7 +50,6 @@ struct ceph_mon_generic_request { struct rb_node node; int result; 
void *buf; - int buf_len; struct completion completion; struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ @@ -117,10 +116,4 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); -extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid); - -extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid); - #endif diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 30560202f57b..139a9cb19b0c 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -42,17 +42,3 @@ const char *ceph_osd_state_name(int s) return "???"; } } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d2d525529f87..14d9995097cc 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -127,8 +127,6 @@ static int monc_show(struct seq_file *s, void *p) op = le16_to_cpu(req->request->hdr.type); if (op == CEPH_MSG_STATFS) seq_printf(s, "%llu statfs\n", req->tid); - else if (op == CEPH_MSG_POOLOP) - seq_printf(s, "%llu poolop\n", req->tid); else if (op == CEPH_MSG_MON_GET_VERSION) seq_printf(s, "%llu mon_get_version", req->tid); else diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f2148e22b148..02e105c6865f 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -410,7 +410,7 @@ out_unlocked: } /* - * generic requests (e.g., statfs, poolop) + * generic requests (currently statfs, mon_get_version) */ static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) @@ -569,7 +569,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt generic reply, tid %llu\n", tid); + pr_err("corrupt statfs reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -588,7 +588,6 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) kref_init(&req->kref); req->buf = buf; - req->buf_len = sizeof(*buf); init_completion(&req->completion); err = -ENOMEM; @@ -647,7 +646,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt mon_get_version reply\n"); + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -670,7 +669,6 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, kref_init(&req->kref); req->buf = newest; - req->buf_len = sizeof(*newest); init_completion(&req->completion); req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, @@ -706,128 +704,6 @@ out: } EXPORT_SYMBOL(ceph_monc_do_get_version); -/* - * pool ops - */ -static int get_poolop_reply_buf(const char *src, size_t src_len, - char *dst, size_t dst_len) -{ - u32 buf_len; - - if (src_len != sizeof(u32) + dst_len) - return -EINVAL; - - buf_len = le32_to_cpu(*(__le32 *)src); - if (buf_len != dst_len) - return -EINVAL; - - memcpy(dst, src + sizeof(u32), dst_len); - return 0; -} - -static void handle_poolop_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - struct ceph_mon_generic_request *req; - 
struct ceph_mon_poolop_reply *reply = msg->front.iov_base; - u64 tid = le64_to_cpu(msg->hdr.tid); - - if (msg->front.iov_len < sizeof(*reply)) - goto bad; - dout("handle_poolop_reply %p tid %llu\n", msg, tid); - - mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (req) { - if (req->buf_len && - get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), - msg->front.iov_len - sizeof(*reply), - req->buf, req->buf_len) < 0) { - mutex_unlock(&monc->mutex); - goto bad; - } - req->result = le32_to_cpu(reply->reply_code); - get_generic_request(req); - } - mutex_unlock(&monc->mutex); - if (req) { - complete(&req->completion); - put_generic_request(req); - } - return; - -bad: - pr_err("corrupt generic reply, tid %llu\n", tid); - ceph_msg_dump(msg); -} - -/* - * Do a synchronous pool op. - */ -static int do_poolop(struct ceph_mon_client *monc, u32 op, - u32 pool, u64 snapid, - char *buf, int len) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_poolop *h; - int err; - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = buf; - req->buf_len = len; - init_completion(&req->completion); - - err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, - true); - if (!req->request) - goto out; - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, - true); - if (!req->reply) - goto out; - - /* fill out request */ - req->request->hdr.version = cpu_to_le16(2); - h = req->request->front.iov_base; - h->monhdr.have_version = 0; - h->monhdr.session_mon = cpu_to_le16(-1); - h->monhdr.session_mon_tid = 0; - h->fsid = monc->monmap->fsid; - h->pool = cpu_to_le32(pool); - h->op = cpu_to_le32(op); - h->auid = 0; - h->snapid = cpu_to_le64(snapid); - h->name_len = 0; - - err = do_generic_request(monc, req); - -out: - kref_put(&req->kref, release_generic_request); - return err; -} - -int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, 0, (char *)snapid, sizeof(*snapid)); - -} -EXPORT_SYMBOL(ceph_monc_create_snapid); - -int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, snapid, NULL, 0); - -} - /* * Resend pending generic requests. 
*/ @@ -1112,10 +988,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_get_version_reply(monc, msg); break; - case CEPH_MSG_POOLOP_REPLY: - handle_poolop_reply(monc, msg); - break; - case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -1154,7 +1026,6 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msg_get(monc->m_subscribe_ack); break; - case CEPH_MSG_POOLOP_REPLY: case CEPH_MSG_STATFS_REPLY: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: -- cgit v1.2.3 From f646912d1044ace6c6c5b5785b2579177781873b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 22 Dec 2014 19:35:17 +0300 Subject: libceph: use mon_client.c/put_generic_request() more Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 02e105c6865f..2b3cf05e87b0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -610,7 +610,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) err = do_generic_request(monc, req); out: - kref_put(&req->kref, release_generic_request); + put_generic_request(req); return err; } EXPORT_SYMBOL(ceph_monc_do_statfs); @@ -699,7 +699,7 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, mutex_unlock(&monc->mutex); out: - kref_put(&req->kref, release_generic_request); + put_generic_request(req); return err; } EXPORT_SYMBOL(ceph_monc_do_get_version); -- cgit v1.2.3 From ba988f87f532cd2b8c4740aa8ec49056521ae833 Mon Sep 17 00:00:00 2001 From: Chaitanya Huilgol Date: Fri, 23 Jan 2015 16:41:25 +0530 Subject: libceph: tcp_nodelay support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCP_NODELAY socket option set on connection sockets, disables Nagle’s algorithm and improves latency characteristics. tcp_nodelay(default)/notcp_nodelay option flags provided to enable/disable setting the socket option. 
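TCP_NODELAY itself is the ordinary per-socket option; the patch only lets libceph enable it (the default) or disable it per client and apply it to each connection socket with kernel_setsockopt(). As a point of reference only, and not part of the patch, the same knob set from an ordinary userspace program looks like this:

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int one = 1;

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        /* Disable Nagle's algorithm so small writes are sent immediately
         * instead of being coalesced while waiting for outstanding ACKs. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
                perror("setsockopt(TCP_NODELAY)");

        close(fd);
        return 0;
}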
Signed-off-by: Chaitanya Huilgol [idryomov@redhat.com: NO_TCP_NODELAY -> TCP_NODELAY, minor adjustments] Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- include/linux/ceph/messenger.h | 4 +++- net/ceph/ceph_common.c | 16 +++++++++++++++- net/ceph/messenger.c | 14 +++++++++++++- 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8b11a79ca1cb..16fff9608848 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -30,8 +30,9 @@ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ +#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_DEFAULT (0) +#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) #define ceph_set_opt(client, opt) \ (client)->options->flags |= CEPH_OPT_##opt; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d9d396c16503..e15499422fdc 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -57,6 +57,7 @@ struct ceph_messenger { atomic_t stopping; bool nocrc; + bool tcp_nodelay; /* * the global_seq counts connections i (attempt to) initiate @@ -264,7 +265,8 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc); + bool nocrc, + bool tcp_nodelay); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 5d5ab67f516d..ec565508e904 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -239,6 +239,8 @@ enum { Opt_nocrc, Opt_cephx_require_signatures, Opt_nocephx_require_signatures, + Opt_tcp_nodelay, + Opt_notcp_nodelay, }; static match_table_t opt_tokens = { @@ -259,6 +261,8 @@ static match_table_t opt_tokens = { {Opt_nocrc, "nocrc"}, {Opt_cephx_require_signatures, "cephx_require_signatures"}, {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_tcp_nodelay, "tcp_nodelay"}, + {Opt_notcp_nodelay, "notcp_nodelay"}, {-1, NULL} }; @@ -457,6 +461,7 @@ ceph_parse_options(char *options, const char *dev_name, case Opt_nocrc: opt->flags |= CEPH_OPT_NOCRC; break; + case Opt_cephx_require_signatures: opt->flags &= ~CEPH_OPT_NOMSGAUTH; break; @@ -464,6 +469,13 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags |= CEPH_OPT_NOMSGAUTH; break; + case Opt_tcp_nodelay: + opt->flags |= CEPH_OPT_TCP_NODELAY; + break; + case Opt_notcp_nodelay: + opt->flags &= ~CEPH_OPT_TCP_NODELAY; + break; + default: BUG_ON(token); } @@ -518,10 +530,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; + ceph_messenger_init(&client->msgr, myaddr, client->supported_features, client->required_features, - ceph_test_opt(client, NOCRC)); + ceph_test_opt(client, NOCRC), + ceph_test_opt(client, TCP_NODELAY)); /* subsystems */ err = ceph_monc_init(&client->monc, client); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 33a2f201e460..6b3f54ed65ba 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -510,6 +510,16 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } + if (con->msgr->tcp_nodelay) { + int optval = 1; + + ret = 
kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&optval, sizeof(optval)); + if (ret) + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", + ret); + } + sk_set_memalloc(sock->sk); con->sock = sock; @@ -2922,7 +2932,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc) + bool nocrc, + bool tcp_nodelay) { msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2937,6 +2948,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); msgr->nocrc = nocrc; + msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); -- cgit v1.2.3 From 7eb71e0351fbb1b242ae70abb7bb17107fe2f792 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 17 Feb 2015 19:37:15 +0300 Subject: libceph: fix double __remove_osd() problem It turns out it's possible to get __remove_osd() called twice on the same OSD. That doesn't sit well with rb_erase() - depending on the shape of the tree we can get a NULL dereference, a soft lockup or a random crash at some point in the future as we end up touching freed memory. One scenario that I was able to reproduce is as follows: con_fault_finish() osd_reset() ceph_osdc_handle_map() kick_requests() reset_changed_osds() __reset_osd() __remove_osd() __kick_osd_requests() __reset_osd() __remove_osd() <-- !!! A case can be made that osd refcounting is imperfect and reworking it would be a proper resolution, but for now Sage and I decided to fix this by adding a safe guard around __remove_osd(). Fixes: http://tracker.ceph.com/issues/8087 Cc: Sage Weil Cc: stable@vger.kernel.org # 3.9+: 7c6e6fc53e73: libceph: assert both regular and lingering lists in __remove_osd() Cc: stable@vger.kernel.org # 3.9+: cc9f1f518cec: libceph: change from BUG to WARN for __remove_osd() asserts Cc: stable@vger.kernel.org # 3.9+ Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 53299c7b0ca4..f693a2f8ac86 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1048,14 +1048,24 @@ static void put_osd(struct ceph_osd *osd) */ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - dout("__remove_osd %p\n", osd); + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); WARN_ON(!list_empty(&osd->o_requests)); WARN_ON(!list_empty(&osd->o_linger_requests)); - rb_erase(&osd->o_node, &osdc->osds); list_del_init(&osd->o_osd_lru); - ceph_con_close(&osd->o_con); - put_osd(osd); + rb_erase(&osd->o_node, &osdc->osds); + RB_CLEAR_NODE(&osd->o_node); +} + +static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); + + if (!RB_EMPTY_NODE(&osd->o_node)) { + ceph_con_close(&osd->o_con); + __remove_osd(osdc, osd); + put_osd(osd); + } } static void remove_all_osds(struct ceph_osd_client *osdc) @@ -1065,7 +1075,7 @@ static void remove_all_osds(struct ceph_osd_client *osdc) while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); - __remove_osd(osdc, osd); + remove_osd(osdc, osd); } mutex_unlock(&osdc->request_mutex); } @@ -1106,7 +1116,7 @@ static void remove_old_osds(struct ceph_osd_client *osdc) 
list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; - __remove_osd(osdc, osd); + remove_osd(osdc, osd); } mutex_unlock(&osdc->request_mutex); } @@ -1121,8 +1131,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) dout("__reset_osd %p osd%d\n", osd, osd->o_osd); if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { - __remove_osd(osdc, osd); - + remove_osd(osdc, osd); return -ENODEV; } @@ -1926,6 +1935,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) { struct rb_node *p, *n; + dout("%s %p\n", __func__, osdc); for (p = rb_first(&osdc->osds); p; p = n) { struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); -- cgit v1.2.3 From b28ec2f37e6a2bbd0bdf74b39cb89c74e4ad17f3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Feb 2015 11:49:42 +0300 Subject: libceph: kfree() in put_osd() shouldn't depend on authorizer a255651d4cad ("ceph: ensure auth ops are defined before use") made kfree() in put_osd() conditional on the authorizer. A mechanical mistake most likely - fix it. Cc: Alex Elder Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f693a2f8ac86..41a4abc7e98e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1035,10 +1035,11 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { + if (atomic_dec_and_test(&osd->o_ref)) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + if (osd->o_auth.authorizer) + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } -- cgit v1.2.3 From 1c4cff0cf55011792125b6041bc4e9713e46240f Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Fri, 13 Feb 2015 14:47:05 -0800 Subject: gen_stats.c: Duplicate xstats buffer for later use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gnet_stats_copy_app() function gets called, more often than not, with its second argument a pointer to an automatic variable in the caller's stack. Therefore, to avoid copying garbage afterwards when calling gnet_stats_finish_copy(), this data is better copied to a dynamically allocated memory that gets freed after use. [xiyou.wangcong@gmail.com: remove a useless kfree()] Signed-off-by: Ignacy Gawędzki Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/core/gen_stats.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 0c08062d1796..1e2f46a69d50 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -32,6 +32,9 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) return 0; nla_put_failure: + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; spin_unlock_bh(d->lock); return -1; } @@ -305,7 +308,9 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) { if (d->compat_xstats) { - d->xstats = st; + d->xstats = kmemdup(st, len, GFP_ATOMIC); + if (!d->xstats) + goto err_out; d->xstats_len = len; } @@ -313,6 +318,11 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return gnet_stats_copy(d, TCA_STATS_APP, st, len); return 0; + +err_out: + d->xstats_len = 0; + spin_unlock_bh(d->lock); + return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -345,6 +355,9 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; spin_unlock_bh(d->lock); return 0; } -- cgit v1.2.3 From fba04a9e0c869498889b6445fd06cbe7da9bb834 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Tue, 17 Feb 2015 13:33:46 +0300 Subject: ipv4: ip_check_defrag should correctly check return value of skb_copy_bits skb_copy_bits() returns zero on success and negative value on error, so it is needed to invert the condition in ip_check_defrag(). Fixes: 1bf3751ec90c ("ipv4: ip_check_defrag must not modify skb before unsharing") Signed-off-by: Alexander Drozdov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e5b6d0ddcb58..2c8d98e728c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -664,7 +664,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (skb->protocol != htons(ETH_P_IP)) return skb; - if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) + if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) -- cgit v1.2.3 From 34eea79e2664b314cab6a30fc582fdfa7a1bb1df Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Tue, 17 Feb 2015 20:15:20 +0100 Subject: ematch: Fix auto-loading of ematch modules. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In tcf_em_validate(), after calling request_module() to load the kind-specific module, set em->ops to NULL before returning -EAGAIN, so that module_put() is not called again by tcf_em_tree_destroy(). Signed-off-by: Ignacy Gawędzki Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/ematch.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 6742200b1307..fbb7ebfc58c6 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -228,6 +228,7 @@ static int tcf_em_validate(struct tcf_proto *tp, * to replay the request. */ module_put(em->ops->owner); + em->ops = NULL; err = -EAGAIN; } #endif -- cgit v1.2.3 From 7b4577a9da3702049650f7095506e9afd9f68849 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 17 Feb 2015 11:23:10 -0800 Subject: openvswitch: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. 
But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [] lookup_vport+0x21/0xd0 [openvswitch] [] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [] genl_family_rcv_msg+0x1bc/0x3e0 [] genl_rcv_msg+0x79/0xc0 [] netlink_rcv_skb+0xb9/0xe0 [] genl_rcv+0x2c/0x40 [] netlink_unicast+0x12d/0x1c0 [] netlink_sendmsg+0x34a/0x6b0 [] sock_sendmsg+0xa0/0xe0 [] ___sys_sendmsg+0x408/0x420 [] __sys_sendmsg+0x51/0x90 [] SyS_sendmsg+0x12/0x20 [] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar Reviewed-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- net/openvswitch/vport.h | 2 ++ 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ae5e77cdc0ca..5bae7243c577 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2194,14 +2194,55 @@ static int __net_init ovs_init_net(struct net *net) return 0; } -static void __net_exit ovs_exit_net(struct net *net) +static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, + struct list_head *head) { - struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct datapath *dp; + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (dev_net(netdev_vport->dev) == dnet) + list_add(&vport->detach_list, head); + } + } + } +} + +static void __net_exit ovs_exit_net(struct net *dnet) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); + struct vport *vport, *vport_next; + struct net *net; + LIST_HEAD(head); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); + + rtnl_lock(); + for_each_net(net) + list_vports_from_net(net, dnet, &head); + rtnl_unlock(); + + /* Detach all vports from given namespace. */ + list_for_each_entry_safe(vport, vport_next, &head, detach_list) { + list_del(&vport->detach_list); + ovs_dp_detach_port(vport); + } + ovs_unlock(); cancel_work_sync(&ovs_net->dp_notify_work); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index f8ae295fb001..bc85331a6c60 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -103,6 +103,7 @@ struct vport_portids { * @ops: Class structure. * @percpu_stats: Points to per-CPU statistics used and maintained by vport * @err_stats: Points to error statistics used and maintained by vport + * @detach_list: list used for detaching vport in net-exit call. 
*/ struct vport { struct rcu_head rcu; @@ -117,6 +118,7 @@ struct vport { struct pcpu_sw_netstats __percpu *percpu_stats; struct vport_err_stats err_stats; + struct list_head detach_list; }; /** -- cgit v1.2.3 From 997d5c3f4427f38562cbe207ce05bb25fdcb993b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Feb 2015 05:47:55 -0800 Subject: sock: sock_dequeue_err_skb() needs hard irq safety Non NAPI drivers can call skb_tstamp_tx() and then sock_queue_err_skb() from hard IRQ context. Therefore, sock_dequeue_err_skb() needs to block hard irq or corruptions or hangs can happen. Signed-off-by: Eric Dumazet Fixes: 364a9e93243d1 ("sock: deduplicate errqueue dequeue") Fixes: cb820f8e4b7f7 ("net: Provide a generic socket error queue delivery method for Tx time stamps.") Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 88c613eab142..f80507823531 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3621,13 +3621,14 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; struct sk_buff *skb, *skb_next; + unsigned long flags; int err = 0; - spin_lock_bh(&q->lock); + spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) err = SKB_EXT_ERR(skb_next)->ee.ee_errno; - spin_unlock_bh(&q->lock); + spin_unlock_irqrestore(&q->lock, flags); sk->sk_err = err; if (err) -- cgit v1.2.3 From 65e9256c4ec569ed62bb89023daab7b72368b89f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Fri, 20 Feb 2015 21:34:58 +0200 Subject: ethtool: Add hw-switch-offload to netdev_features_strings. commit aafb3e98b279 (netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads) add a new feature without adding it to netdev_features_strings array; this patch fixes this. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 91f74f3eb204..aa378ecef186 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload", }; static const char -- cgit v1.2.3 From 278f7b4fffce9ad267406cf8800df271d14f4a16 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 19 Feb 2015 12:13:13 +0300 Subject: caif: fix a signedness bug in cfpkt_iterate() The cfpkt_iterate() function can return -EPROTO on error, but the function is a u16 so the negative value gets truncated to a positive unsigned short. This causes a static checker warning. The only caller which might care is cffrml_receive(), when it's checking the frame checksum. I modified cffrml_receive() so that it never says -EPROTO is a valid checksum. Also this isn't ever going to be inlined so I removed the "inline". Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- include/net/caif/cfpkt.h | 2 +- net/caif/cffrml.c | 2 +- net/caif/cfpkt_skbuff.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h index 1c1ad46250d5..fe328c52c46b 100644 --- a/include/net/caif/cfpkt.h +++ b/include/net/caif/cfpkt.h @@ -171,7 +171,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos); * @return Checksum of buffer. */ -u16 cfpkt_iterate(struct cfpkt *pkt, +int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16 chks, void *buf, u16 len), u16 data); diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index 8bc7caa28e64..434ba8557826 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -84,7 +84,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) u16 tmp; u16 len; u16 hdrchks; - u16 pktchks; + int pktchks; struct cffrml *this; this = container_obj(layr); diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 1be0b521ac49..f6c3b2137eea 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -255,9 +255,9 @@ inline u16 cfpkt_getlen(struct cfpkt *pkt) return skb->len; } -inline u16 cfpkt_iterate(struct cfpkt *pkt, - u16 (*iter_func)(u16, void *, u16), - u16 data) +int cfpkt_iterate(struct cfpkt *pkt, + u16 (*iter_func)(u16, void *, u16), + u16 data) { /* * Don't care about the performance hit of linearizing, -- cgit v1.2.3 From a4176a9391868bfa87705bcd2e3b49e9b9dd2996 Mon Sep 17 00:00:00 2001 From: Matthew Thode Date: Tue, 17 Feb 2015 18:31:57 -0600 Subject: net: reject creation of netdev names with colons colons are used as a separator in netdev device lookup in dev_ioctl.c Specific functions are SIOCGIFTXQLEN SIOCETHTOOL SIOCSIFNAME Signed-off-by: Matthew Thode Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8f9710c62e20..962ee9d71964 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -946,7 +946,7 @@ bool dev_valid_name(const char *name) return false; while (*name) { - if (*name == '/' || isspace(*name)) + if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } -- cgit v1.2.3 From 3f34b24a732bab9635c4b32823268c37c01b40f0 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Fri, 20 Feb 2015 08:24:27 +0300 Subject: af_packet: allow packets defragmentation not only for hash fanout type Packets defragmentation was introduced for PACKET_FANOUT_HASH only, see 7736d33f4262 ("packet: Add pre-defragmentation support for ipv4 fanouts") It may be useful to have defragmentation enabled regardless of fanout type. Without that, the AF_PACKET user may have to: 1. Collect fragments from different rings 2. Defragment by itself Signed-off-by: Alexander Drozdov Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9c28cec1a083..99fc628f7372 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1349,14 +1349,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, return 0; } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { + skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); + if (!skb) + return 0; + } switch (f->type) { case PACKET_FANOUT_HASH: default: - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { - skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); - if (!skb) - return 0; - } idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: -- cgit v1.2.3 From 52d6c8c6ca125872459054daa70f2f1c698c8e75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Feb 2015 17:03:41 -0800 Subject: net: pktgen: disable xmit_clone on virtual devices Trying to use burst capability (aka xmit_more) on a virtual device like bonding is not supported. For example, skb might be queued multiple times on a qdisc, with various list corruptions. Fixes: 38b2cf2982dc ("net: pktgen: packet bursting via skb->xmit_more") Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b4899f5b7388..508155b283dd 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1134,6 +1134,9 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; + if ((value > 1) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); return count; -- cgit v1.2.3 From 6514890f7aa8dd75fa46e282a1c7a8615e86f363 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 20 Feb 2015 13:33:16 -0500 Subject: tcp: fix tcp_should_expand_sndbuf() to use tcp_packets_in_flight() tcp_should_expand_sndbuf() does not expand the send buffer if we have filled the congestion window. However, it should use tcp_packets_in_flight() instead of tp->packets_out to make this check. Testing has established that the difference matters a lot if there are many SACKed packets, causing a needless performance shortfall. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8fdd27b17306..fb4cf8b8e121 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4770,7 +4770,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we filled the congestion window, do not expand. */ - if (tp->packets_out >= tp->snd_cwnd) + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return false; return true; -- cgit v1.2.3 From 46b9e4bb76ee26f1e024e048bb95af41b763f48f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 23 Feb 2015 12:02:51 +0100 Subject: decnet: Fix obvious o/0 typo Signed-off-by: Rasmus Villemoes Signed-off-by: David S. 
Miller --- net/decnet/dn_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1d7c1256e845..3b81092771f8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1062,7 +1062,7 @@ source_ok: if (decnet_debug_level & 16) printk(KERN_DEBUG "dn_route_output_slow: initial checks complete." - " dst=%o4x src=%04x oif=%d try_hard=%d\n", + " dst=%04x src=%04x oif=%d try_hard=%d\n", le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr), fld.flowidn_oif, try_hard); -- cgit v1.2.3 From a948f8ce771a1f07c17ed8bcb51f59f69129a51c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Feb 2015 18:38:24 +0100 Subject: irda: replace current->state by set_current_state() Use helper functions to access current->state. Direct assignments are prone to races and therefore buggy. current->state = TASK_RUNNING can be replaced by __set_current_state() Thanks to Peter Zijlstra for the exact definition of the problem. Suggested-By: Peter Zijlstra Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/irnet/irnet_ppp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 40695b9751c1..9940a41efca1 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -811,7 +811,7 @@ static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) break; } spin_unlock_irqrestore(&self->spinlock, flags); - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); } /* diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 3c83a1e5ab03..1215693fdd22 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -305,7 +305,7 @@ irnet_ctrl_read(irnet_socket * ap, /* Put ourselves on the wait queue to be woken up */ add_wait_queue(&irnet_events.rwait, &wait); - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); for(;;) { /* If there is unread events */ @@ -321,7 +321,7 @@ irnet_ctrl_read(irnet_socket * ap, /* Yield and wait to be woken up */ schedule(); } - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); remove_wait_queue(&irnet_events.rwait, &wait); /* Did we got it ? */ -- cgit v1.2.3 From d720d8cec563ce4e4fa44a613d4f2dcb1caf2998 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2015 18:12:56 +0000 Subject: net: compat: Ignore MSG_CMSG_COMPAT in compat_sys_{send, recv}msg With commit a7526eb5d06b (net: Unbreak compat_sys_{send,recv}msg), the MSG_CMSG_COMPAT flag is blocked at the compat syscall entry points, changing the kernel compat behaviour from the one before the commit it was trying to fix (1be374a0518a, net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg). On 32-bit kernels (!CONFIG_COMPAT), MSG_CMSG_COMPAT is 0 and the native 32-bit sys_sendmsg() allows flag 0x80000000 to be set (it is ignored by the kernel). However, on a 64-bit kernel, the compat ABI is different with commit a7526eb5d06b. This patch changes the compat_sys_{send,recv}msg behaviour to the one prior to commit 1be374a0518a. The problem was found running 32-bit LTP (sendmsg01) binary on an arm64 kernel. Arguably, LTP should not pass 0xffffffff as flags to sendmsg() but the general rule is not to break user ABI (even when the user behaviour is not entirely sane). Fixes: a7526eb5d06b (net: Unbreak compat_sys_{send,recv}msg) Cc: Andy Lutomirski Cc: David S. 
Miller Signed-off-by: Catalin Marinas Signed-off-by: David S. Miller --- net/compat.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 3236b4167a32..94d3d5e97883 100644 --- a/net/compat.c +++ b/net/compat.c @@ -711,24 +711,18 @@ static unsigned char nas[21] = { COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT); } COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags) { - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } @@ -751,9 +745,6 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, int datagrams; struct timespec ktspec; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); -- cgit v1.2.3 From 77751427a1ff25b27d47a4c36b12c3c8667855ac Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Mon, 23 Feb 2015 11:17:13 -0300 Subject: ipv6: addrconf: validate new MTU before applying it Currently we don't check if the new MTU is valid or not and this allows one to configure a smaller than minimum allowed by RFCs or even bigger than interface own MTU, which is a problem as it may lead to packet drops. If you have a daemon like NetworkManager running, this may be exploited by remote attackers by forging RA packets with an invalid MTU, possibly leading to a DoS. (NetworkManager currently only validates for values too small, but not for too big ones.) The fix is just to make sure the new value is valid. That is, between IPV6_MIN_MTU and interface's MTU. Note that similar check is already performed at ndisc_router_discovery(), for when kernel itself parses the RA. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98e4a63d72bb..b6030025f411 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4903,6 +4903,21 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } +static +int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct inet6_dev *idev = ctl->extra1; + int min_mtu = IPV6_MIN_MTU; + struct ctl_table lctl; + + lctl = *ctl; + lctl.extra1 = &min_mtu; + lctl.extra2 = idev ? 
&idev->dev->mtu : NULL; + + return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos); +} + static void dev_disable_change(struct inet6_dev *idev) { struct netdev_notifier_info info; @@ -5054,7 +5069,7 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.mtu6, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_mtu, }, { .procname = "accept_ra", -- cgit v1.2.3 From 104f5a6206f4b3133c675e3d41eca2ca4c41406b Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 8 Feb 2015 12:36:07 +0200 Subject: mac80211: clear sdata->radar_required If ieee80211_vif_use_channel() fails, we have to clear sdata->radar_required (which we might have just set). Failing to do it results in stale radar_required field which prevents starting new scan requests. Reported-by: Jouni Malinen Signed-off-by: Eliad Peller [use false instead of 0] Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index ff0d2db09df9..5bcd4e5589d3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1508,6 +1508,8 @@ static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); + sdata->radar_required = false; + /* Unreserving may ready an in-place reservation. */ if (use_reserved_switch) ieee80211_vif_use_reserved_switch(local); @@ -1566,6 +1568,9 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); out: + if (ret) + sdata->radar_required = false; + mutex_unlock(&local->chanctx_mtx); return ret; } -- cgit v1.2.3 From 5528fae88697268a7176dd6c8d7ab368c04368be Mon Sep 17 00:00:00 2001 From: Samuel Tan Date: Mon, 9 Feb 2015 21:29:15 +0200 Subject: nl80211: use loop index as type for net detect frequency results We currently add nested members of the NL80211_ATTR_SCAN_FREQUENCIES as NLA_U32 attributes of type NL80211_ATTR_WIPHY_FREQ in cfg80211_net_detect_results. However, since there can be an arbitrary number of frequency results, we should use the loop index of the loop used to add the frequency results to NL80211_ATTR_SCAN_FREQUENCIES as the type (i.e. nla_type) for each result attribute, rather than a fixed type. This change is in line with how nested members are added to NL80211_ATTR_SCAN_FREQUENCIES in the functions nl80211_send_wowlan_nd and nl80211_add_scan_req. Signed-off-by: Samuel Tan Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d78fd8b54515..3c7fb0459e58 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12528,9 +12528,7 @@ static int cfg80211_net_detect_results(struct sk_buff *msg, } for (j = 0; j < match->n_channels; j++) { - if (nla_put_u32(msg, - NL80211_ATTR_WIPHY_FREQ, - match->channels[j])) { + if (nla_put_u32(msg, j, match->channels[j])) { nla_nest_cancel(msg, nl_freqs); nla_nest_cancel(msg, nl_match); goto out; -- cgit v1.2.3 From a18c7192aabac73078f35e5ef4ce28ad5f8cfe8e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Feb 2015 10:56:42 +0100 Subject: nl80211: fix memory leak in monitor flags parsing If monitor flags parsing results in active monitor but that isn't supported, the already allocated message is leaked. 
Fix this by moving the allocation after this check. Reported-by: Christian Engelmayer Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3c7fb0459e58..be2501538011 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2654,10 +2654,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return err; } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL, &flags); @@ -2666,6 +2662,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + wdev = rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL80211_ATTR_IFNAME]), type, err ? NULL : &flags, ¶ms); -- cgit v1.2.3 From 28981e5eb45fe13e1d2c9e0e2e189dc5c8682006 Mon Sep 17 00:00:00 2001 From: Jason Abele Date: Tue, 17 Feb 2015 16:10:41 -0800 Subject: cfg80211: fix n_reg_rules to match world_regdom There are currently 8 rules in the world_regdom, but only the first 6 are applied due to an incorrect value for n_reg_rules. This causes channels 149-165 and 60GHz to be disabled. Signed-off-by: Jason Abele Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b586d0dcb09e..48dfc7b4e981 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -228,7 +228,7 @@ static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work); /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { - .n_reg_rules = 6, + .n_reg_rules = 8, .alpha2 = "00", .reg_rules = { /* IEEE 802.11b/g, channels 1..11 */ -- cgit v1.2.3 From 81daf735f9fe35ef6bc4073068748b221d64fb47 Mon Sep 17 00:00:00 2001 From: Junjie Mao Date: Wed, 28 Jan 2015 10:03:26 +0800 Subject: cfg80211: calls nl80211_exit on error nl80211_exit should be called in cfg80211_init if nl80211_init succeeds but regulatory_init or create_singlethread_workqueue fails. Signed-off-by: Junjie Mao Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 3af0ecf1cc16..2a0bbd22854b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1199,6 +1199,7 @@ out_fail_wq: regulatory_exit(); out_fail_reg: debugfs_remove(ieee80211_debugfs_dir); + nl80211_exit(); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: -- cgit v1.2.3 From 17dce15801d5602719936045a7e84ff7dc6b6da2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 19 Feb 2015 12:57:40 +0100 Subject: mac80211/minstrel: fix !x!=0 confusion Commit 06d961a8e210 ("mac80211/minstrel: use the new rate control API") inverted the condition 'if (msr->sample_limit != 0)' to 'if (!msr->sample_limit != 0)'. But it is confusing both to people and compilers (gcc5): net/mac80211/rc80211_minstrel.c: In function 'minstrel_get_rate': net/mac80211/rc80211_minstrel.c:376:26: warning: logical not is only applied to the left hand side of comparison if (!msr->sample_limit != 0) ^ Let there be only 'if (!msr->sample_limit)'. 
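A minimal standalone illustration of the precedence involved (not taken from the kernel tree, purely for reference):

	#include <stdio.h>

	int main(void)
	{
		int sample_limit = 5;

		/* '!' binds tighter than '!=', so the old test parses as
		 * (!sample_limit) != 0, which collapses back to !sample_limit.
		 */
		printf("%d\n", !sample_limit != 0);	/* prints 0 */
		printf("%d\n", !sample_limit);		/* prints 0 */
		return 0;
	}

With sample_limit == 0 both expressions evaluate to 1, so dropping the '!= 0' does not change behaviour; it only removes the confusing double negation.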
Fixes: 06d961a8e210 ("mac80211/minstrel: use the new rate control API") Signed-off-by: Jiri Slaby Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 7c86a002df95..ef6e8a6c4253 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -373,7 +373,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, rate++; mi->sample_deferred++; } else { - if (!msr->sample_limit != 0) + if (!msr->sample_limit) return; mi->sample_packets++; -- cgit v1.2.3 From 4e10fd5b4a7f4100007147558c304da3e73b25cf Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 24 Feb 2015 14:14:35 -0500 Subject: rtnetlink: avoid 0 sized arrays Arrays (when not in a struct) "shall have a value greater than zero". GCC complains when it's not the case here. Fixes: ba7d49b1f0 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab293a3066b3..1385de0fa080 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2012,8 +2012,8 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; - struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0]; + struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; + struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1]; struct nlattr **data = NULL; struct nlattr **slave_data = NULL; struct net *dest_net, *link_net = NULL; -- cgit v1.2.3 From 41a50d621a321b4c15273cc1b5ed41437f4acdfb Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Tue, 24 Feb 2015 08:18:28 +0300 Subject: af_packet: don't pass empty blocks for PACKET_V3 Before da413eec729d ("packet: Fixed TPACKET V3 to signal poll when block is closed rather than every packet") poll listening for an af_packet socket was not signaled if there was no packets to process. After the patch poll is signaled evety time when block retire timer expires. That happens because af_packet closes the current block on timeout even if the block is empty. Passing empty blocks to the user not only wastes CPU but also wastes ring buffer space increasing probability of packets dropping on small timeouts. Signed-off-by: Alexander Drozdov Cc: Dan Collins Cc: Willem de Bruijn Cc: Guy Harris Signed-off-by: David S. Miller --- net/packet/af_packet.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 99fc628f7372..5bf1e968a728 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -698,6 +698,10 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { + if (!BLOCK_NUM_PKTS(pbd)) { + /* An empty block. Just refresh the timer. */ + goto refresh_timer; + } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; @@ -798,7 +802,11 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { - /* Ok, we tmo'd - so get the current time */ + /* Ok, we tmo'd - so get the current time. + * + * It shouldn't really happen as we don't close empty + * blocks. 
See prb_retire_rx_blk_timer_expired(). + */ struct timespec ts; getnstimeofday(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; -- cgit v1.2.3 From 9c1c98a3bb7b7593b60264b9a07e001e68b46697 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 26 Feb 2015 15:50:50 +0200 Subject: mac80211: Send EAPOL frames at lowest rate The current minstrel_ht rate control behavior is somewhat optimistic in trying to find optimum TX rate. While this is usually fine for normal Data frames, there are cases where a more conservative set of retry parameters would be beneficial to make the connection more robust. EAPOL frames are critical to the authentication and especially the EAPOL-Key message 4/4 (the last message in the 4-way handshake) is important to get through to the AP. If that message is lost, the only recovery mechanism in many cases is to reassociate with the AP and start from scratch. This can often be avoided by trying to send the frame with more conservative rate and/or with more link layer retries. In most cases, minstrel_ht is currently using the initial EAPOL-Key frames for probing higher rates and this results in only five link layer transmission attempts (one at high(ish) MCS and four at MCS0). While this works with most APs, it looks like there are some deployed APs that may have issues with the EAPOL frames using HT MCS immediately after association. Similarly, there may be issues in cases where the signal strength or radio environment is not good enough to be able to get frames through even at couple of MCS 0 tries. The best approach for this would likely to be to reduce the TX rate for the last rate (3rd rate parameter in the set) to a low basic rate (say, 6 Mbps on 5 GHz and 2 or 5.5 Mbps on 2.4 GHz), but doing that cleanly requires some more effort. For now, we can start with a simple one-liner that forces the minimum rate to be used for EAPOL frames similarly how the TX rate is selected for the IEEE 802.11 Management frames. This does result in a small extra latency added to the cases where the AP would be able to receive the higher rate, but taken into account how small number of EAPOL frames are used, this is likely to be insignificant. A future optimization in the minstrel_ht design can also allow this patch to be reverted to get back to the more optimized initial TX rate. It should also be noted that many drivers that do not use minstrel as the rate control algorithm are already doing similar workarounds by forcing the lowest TX rate to be used for EAPOL frames. Cc: stable@vger.kernel.org Reported-by: Linus Torvalds Tested-by: Linus Torvalds Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 88a18ffe2975..07bd8db00af8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -566,6 +566,7 @@ ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) if (tx->sdata->control_port_no_encrypt) info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + info->flags |= IEEE80211_TX_CTL_USE_MINRATE; } return TX_CONTINUE; -- cgit v1.2.3 From 76cb4be993c03bf9ec65a58b13f12c679bb041e4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Feb 2015 18:34:01 +0300 Subject: sunrpc: integer underflow in rsc_parse() If we call groups_alloc() with invalid values then it's might lead to memory corruption. 
For example, with a negative value then we might not allocate enough for sizeof(struct group_info). (We're doing this in the caller for consistency with other callers of groups_alloc(). The other alternative might be to move the check out of all the callers into groups_alloc().) Signed-off-by: Dan Carpenter Reviewed-by: Simo Sorce Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 224a82f24d3c..1095be9c80ab 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -463,6 +463,8 @@ static int rsc_parse(struct cache_detail *cd, /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; + if (N < 0 || N > NGROUPS_MAX) + goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) -- cgit v1.2.3 From 4c4b52d9b2df45e8216d3e30b5452e4a364d2cac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Feb 2015 16:31:54 +0100 Subject: rhashtable: remove indirection for grow/shrink decision functions Currently, all real users of rhashtable default their grow and shrink decision functions to rht_grow_above_75() and rht_shrink_below_30(), so that there's currently no need to have this explicitly selectable. It can/should be generic and private inside rhashtable until a real use case pops up. Since we can make this private, we'll save us this additional indirection layer and can improve insertion/deletion time as well. Reference: http://patchwork.ozlabs.org/patch/443040/ Suggested-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 13 ----------- lib/rhashtable.c | 56 ++++++++++++++-------------------------------- lib/test_rhashtable.c | 1 + net/netfilter/nft_hash.c | 2 -- net/netlink/af_netlink.c | 2 -- net/tipc/socket.c | 2 -- 6 files changed, 18 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index cb2104be2135..d438eeb08bff 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,12 +79,6 @@ struct rhashtable; * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object - * @grow_decision: If defined, may return true if table should expand - * @shrink_decision: If defined, may return true if table should shrink - * - * Note: when implementing the grow and shrink decision function, min/max - * shift must be enforced, otherwise, resizing watermarks they set may be - * useless. 
*/ struct rhashtable_params { size_t nelem_hint; @@ -98,10 +92,6 @@ struct rhashtable_params { size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; - bool (*grow_decision)(const struct rhashtable *ht, - size_t new_size); - bool (*shrink_decision)(const struct rhashtable *ht, - size_t new_size); }; /** @@ -193,9 +183,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); - int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index bcf119bfdef4..090641db4c0d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -247,26 +247,24 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, * @ht: hash table * @new_size: new table size */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (new_size / 4 * 3) && (!ht->p.max_shift || atomic_read(&ht->shift) < ht->p.max_shift); } -EXPORT_SYMBOL_GPL(rht_grow_above_75); /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @new_size: new table size */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) +static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (new_size * 3 / 10) && (atomic_read(&ht->shift) > ht->p.min_shift); } -EXPORT_SYMBOL_GPL(rht_shrink_below_30); static void lock_buckets(struct bucket_table *new_tbl, struct bucket_table *old_tbl, unsigned int hash) @@ -528,40 +526,19 @@ static void rht_deferred_worker(struct work_struct *work) list_for_each_entry(walker, &ht->walkers, list) walker->resize = true; - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl->size)) rhashtable_expand(ht); - else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + else if (rht_shrink_below_30(ht, tbl->size)) rhashtable_shrink(ht); - unlock: mutex_unlock(&ht->mutex); } -static void rhashtable_probe_expand(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && ht->p.grow_decision && - ht->p.grow_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - -static void rhashtable_probe_shrink(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. 
*/ - if (tbl == new_tbl && ht->p.shrink_decision && - ht->p.shrink_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, u32 hash) + struct bucket_table *tbl, + const struct bucket_table *old_tbl, u32 hash) { + bool no_resize_running = tbl == old_tbl; struct rhash_head *head; hash = rht_bucket_index(tbl, hash); @@ -577,8 +554,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); - - rhashtable_probe_expand(ht); + if (no_resize_running && rht_grow_above_75(ht, tbl->size)) + schedule_work(&ht->run_work); } /** @@ -608,7 +585,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, hash); + __rhashtable_insert(ht, obj, tbl, old_tbl, hash); unlock_buckets(tbl, old_tbl, hash); rcu_read_unlock(); @@ -690,8 +667,11 @@ found: unlock_buckets(new_tbl, old_tbl, new_hash); if (ret) { + bool no_resize_running = new_tbl == old_tbl; + atomic_dec(&ht->nelems); - rhashtable_probe_shrink(ht); + if (no_resize_running && rht_shrink_below_30(ht, new_tbl->size)) + schedule_work(&ht->run_work); } rcu_read_unlock(); @@ -861,7 +841,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, goto exit; } - __rhashtable_insert(ht, obj, new_tbl, new_hash); + __rhashtable_insert(ht, obj, new_tbl, old_tbl, new_hash); exit: unlock_buckets(new_tbl, old_tbl, new_hash); @@ -1123,8 +1103,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (!ht->p.hash_rnd) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); - if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } @@ -1142,8 +1121,7 @@ void rhashtable_destroy(struct rhashtable *ht) { ht->being_destroyed = true; - if (ht->p.grow_decision || ht->p.shrink_decision) - cancel_work_sync(&ht->run_work); + cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); bucket_table_free(rht_dereference(ht->tbl, ht)); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index f9e9d734446a..67c7593d1dd6 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -201,6 +201,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, + .max_shift = 1, /* we expand/shrink manually here */ .nulls_base = (3U << RHT_BASE_SHIFT), }; int err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 61e6c407476a..c82df0a48fcd 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -192,8 +192,6 @@ static int nft_hash_init(const struct nft_set *set, .key_offset = offsetof(struct nft_hash_elem, key), .key_len = set->klen, .hashfn = jhash, - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(priv, ¶ms); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2702673f0f23..05919bf3f670 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3126,8 +3126,6 @@ static int __init netlink_proto_init(void) .key_len = sizeof(u32), /* portid */ .hashfn = jhash, .max_shift = 16, /* 64K */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; if (err != 0) diff --git a/net/tipc/socket.c 
b/net/tipc/socket.c index f73e975af80b..b4d4467d0bb0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2364,8 +2364,6 @@ int tipc_sk_rht_init(struct net *net) .hashfn = jhash, .max_shift = 20, /* 1M */ .min_shift = 8, /* 256 */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(&tn->sk_rht, &rht_params); -- cgit v1.2.3 From 505ce4154ac86c250aa4a84a536dd9fc56479bb5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Feb 2015 16:19:00 -0600 Subject: net: Verify permission to dest_net in newlink When applicable verify that the caller has permision to create a network device in another network namespace. This check is already present when moving a network device between network namespaces in setlink so all that is needed is to duplicate that check in newlink. This change almost backports cleanly, but there are context conflicts as the code that follows was added in v4.0-rc1 Fixes: b51642f6d77b net: Enable a userns root rtnl calls that are safe for unprivilged users Signed-off-by: "Eric W. Biederman" Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1385de0fa080..b237959c7497 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2122,6 +2122,10 @@ replay: if (IS_ERR(dest_net)) return PTR_ERR(dest_net); + err = -EPERM; + if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN)) + goto out; + if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); -- cgit v1.2.3 From 06615bed60c1fb7c37adddb75bdc80da873b5edb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Feb 2015 16:20:07 -0600 Subject: net: Verify permission to link_net in newlink When applicable verify that the caller has permisson to the underlying network namespace for a newly created network device. Similary checks exist for the network namespace a network device will be created in. Fixes: 317f4810e45e ("rtnl: allow to create device with IFLA_LINK_NETNSID set") Signed-off-by: "Eric W. Biederman" Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b237959c7497..2c49355d16c2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2134,6 +2134,9 @@ replay: err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; } dev = rtnl_create_link(link_net ? : dest_net, ifname, -- cgit v1.2.3 From cac5e65e8a7ea074f2626d2eaa53aa308452dec4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2015 09:42:50 -0800 Subject: net: do not use rcu in rtnl_dump_ifinfo() We did a failed attempt in the past to only use rcu in rtnl dump operations (commit e67f88dd12f6 "net: dont hold rtnl mutex during netlink dump callbacks") Now that dumps are holding RTNL anyway, there is no need to also use rcu locking, as it forbids any scheduling ability, like GFP_KERNEL allocations that controlling path should use instead of GFP_ATOMIC whenever possible. This should fix following splat Cong Wang reported : [ INFO: suspicious RCU usage. ] 3.19.0+ #805 Tainted: G W include/linux/rcupdate.h:538 Illegal context switch in RCU read-side critical section! 
other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ip/771: #0: (rtnl_mutex){+.+.+.}, at: [] netlink_dump+0x21/0x26c #1: (rcu_read_lock){......}, at: [] rcu_read_lock+0x0/0x6e stack backtrace: CPU: 3 PID: 771 Comm: ip Tainted: G W 3.19.0+ #805 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000001 ffff8800d51e7718 ffffffff81a27457 0000000029e729e6 ffff8800d6108000 ffff8800d51e7748 ffffffff810b539b ffffffff820013dd 00000000000001c8 0000000000000000 ffff8800d7448088 ffff8800d51e7758 Call Trace: [] dump_stack+0x4c/0x65 [] lockdep_rcu_suspicious+0x107/0x110 [] rcu_preempt_sleep_check+0x45/0x47 [] ___might_sleep+0x1d/0x1cb [] __might_sleep+0x78/0x80 [] idr_alloc+0x45/0xd1 [] ? rcu_read_lock_held+0x3b/0x3d [] ? idr_for_each+0x53/0x101 [] alloc_netid+0x61/0x69 [] __peernet2id+0x79/0x8d [] peernet2id+0x13/0x1f [] rtnl_fill_ifinfo+0xa8d/0xc20 [] ? __lock_is_held+0x39/0x52 [] rtnl_dump_ifinfo+0x149/0x213 [] netlink_dump+0xef/0x26c [] netlink_recvmsg+0x17b/0x2c5 [] __sock_recvmsg+0x4e/0x59 [] sock_recvmsg+0x3f/0x51 [] ___sys_recvmsg+0xf6/0x1d9 [] ? handle_pte_fault+0x6e1/0xd3d [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? rcu_read_lock_held+0x3b/0x3d [] ? __fcheck_files+0x4c/0x58 [] ? __fget_light+0x2d/0x52 [] __sys_recvmsg+0x42/0x60 [] SyS_recvmsg+0x12/0x1c Signed-off-by: Eric Dumazet Fixes: 0c7aecd4bde4b7302 ("netns: add rtnl cmd to add and get peer netns ids") Cc: Nicolas Dichtel Reported-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2c49355d16c2..25b4b5d23485 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1300,7 +1300,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_idx = cb->args[1]; - rcu_read_lock(); cb->seq = net->dev_base_seq; /* A hack to preserve kernel<->userspace interface. @@ -1322,7 +1321,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { + hlist_for_each_entry(dev, head, index_hlist) { if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -1344,7 +1343,6 @@ cont: } } out: - rcu_read_unlock(); cb->args[1] = idx; cb->args[0] = h; -- cgit v1.2.3 From 56b08fdcf637955d3023d769afd6cdabc526ba22 Mon Sep 17 00:00:00 2001 From: Arvid Brodin Date: Fri, 27 Feb 2015 21:26:03 +0100 Subject: net/hsr: Fix NULL pointer dereference and refcnt bugs when deleting a HSR interface. To repeat: $ sudo ip link del hsr0 BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] hsr_del_port+0x15/0xa0 etc... Bug description: As part of the hsr master device destruction, hsr_del_port() is called for each of the hsr ports. At each such call, the master device is updated regarding features and mtu. When the master device is freed before the slave interfaces, master will be NULL in hsr_del_port(), which led to a NULL pointer dereference. Additionally, dev_put() was called on the master device itself in hsr_del_port(), causing a refcnt error. A third bug in the same code path was that the rtnl lock was not taken before hsr_del_port() was called as part of hsr_dev_destroy(). 
The reporter (Nicolas Dichtel) also said: "hsr_netdev_notify() supposes that the port will always be available when the notification is for an hsr interface. It's wrong. For example, netdev_wait_allrefs() may resend NETDEV_UNREGISTER.". As a precaution against this, a check for port == NULL was added in hsr_dev_notify(). Reported-by: Nicolas Dichtel Fixes: 51f3c605318b056a ("net/hsr: Move slave init to hsr_slave.c.") Signed-off-by: Arvid Brodin Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 4 ++++ net/hsr/hsr_slave.c | 10 +++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index a138d75751df..44d27469ae55 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -359,8 +359,11 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) struct hsr_port *port; hsr = netdev_priv(hsr_dev); + + rtnl_lock(); hsr_for_each_port(hsr, port) hsr_del_port(port); + rtnl_unlock(); del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 779d28b65417..cd37d0011b42 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -36,6 +36,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; /* Not an HSR device */ hsr = netdev_priv(dev); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + if (port == NULL) { + /* Resend of notification concerning removed device? */ + return NOTIFY_DONE; + } } else { hsr = port->hsr; } diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index a348dcbcd683..7d37366cc695 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -181,8 +181,10 @@ void hsr_del_port(struct hsr_port *port) list_del_rcu(&port->port_list); if (port != master) { - netdev_update_features(master->dev); - dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + if (master != NULL) { + netdev_update_features(master->dev); + dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + } netdev_rx_handler_unregister(port->dev); dev_set_promiscuity(port->dev, -1); } @@ -192,5 +194,7 @@ void hsr_del_port(struct hsr_port *port) */ synchronize_rcu(); - dev_put(port->dev); + + if (port != master) + dev_put(port->dev); } -- cgit v1.2.3 From c03ae533a9c4de83a35105f9bfd7152d916b4680 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Feb 2015 11:51:36 +0100 Subject: rxrpc: terminate retrans loop when sending of skb fails Typo, 'stop' is never set to true. Seems intent is to not attempt to retransmit more packets after sendmsg returns an error. This change is based on code inspection only. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/rxrpc/ar-ack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index c6be17a959a6..40404183a5da 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -218,7 +218,8 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_header *hdr; struct sk_buff *txb; unsigned long *p_txb, resend_at; - int loop, stop; + bool stop; + int loop; u8 resend; _enter("{%d,%d,%d,%d},", @@ -226,7 +227,7 @@ static void rxrpc_resend(struct rxrpc_call *call) atomic_read(&call->sequence), CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - stop = 0; + stop = false; resend = 0; resend_at = 0; @@ -255,7 +256,7 @@ static void rxrpc_resend(struct rxrpc_call *call) _proto("Tx DATA %%%u { #%d }", ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); if (rxrpc_send_packet(call->conn->trans, txb) < 0) { - stop = 0; + stop = true; sp->resend_at = jiffies + 3; } else { sp->resend_at = -- cgit v1.2.3 From 765dd3bb44711b4ba36c1e06f9c4b7bfe73ffef7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Feb 2015 11:51:37 +0100 Subject: rxrpc: don't multiply with HZ twice rxrpc_resend_timeout has an initial value of 4 * HZ; use it as-is. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/rxrpc/ar-ack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index 40404183a5da..e0547f521f20 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -260,7 +260,7 @@ static void rxrpc_resend(struct rxrpc_call *call) sp->resend_at = jiffies + 3; } else { sp->resend_at = - jiffies + rxrpc_resend_timeout * HZ; + jiffies + rxrpc_resend_timeout; } } -- cgit v1.2.3 From acf8dd0a9d0b9e4cdb597c2f74802f79c699e802 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Mon, 2 Mar 2015 18:27:11 +0100 Subject: udp: only allow UFO for packets from SOCK_DGRAM sockets If an over-MTU UDP datagram is sent through a SOCK_RAW socket to a UFO-capable device, ip_ufo_append_data() sets skb->ip_summed to CHECKSUM_PARTIAL unconditionally as all GSO code assumes transport layer checksum is to be computed on segmentation. However, in this case, skb->csum_start and skb->csum_offset are never set as raw socket transmit path bypasses udp_send_skb() where they are usually set. As a result, driver may access invalid memory when trying to calculate the checksum and store the result (as observed in virtio_net driver). Moreover, the very idea of modifying the userspace provided UDP header is IMHO against raw socket semantics (I wasn't able to find a document clearly stating this or the opposite, though). And while allowing CHECKSUM_NONE in the UFO case would be more efficient, it would be a bit too intrusive change just to handle a corner case like this. Therefore disallowing UFO for packets from SOCK_DGRAM seems to be the best option. Signed-off-by: Michal Kubecek Signed-off-by: David S. 
Miller --- net/ipv4/ip_output.c | 3 ++- net/ipv6/ip6_output.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d68199d9b2b0..a7aea2048a0d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -888,7 +888,8 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && + (sk->sk_type == SOCK_DGRAM)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7deebf102cba..0a04a37305d5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1298,7 +1298,8 @@ emsgsize: if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO) && + (sk->sk_type == SOCK_DGRAM)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags, rt); -- cgit v1.2.3 From 71e168b151babf4334e2e26c92230a6bda3b1f24 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Mar 2015 13:53:31 +0100 Subject: net: bridge: add compile-time assert for cb struct size make build fail if structure no longer fits into ->cb storage. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/bridge/br.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index fb57ab6b24f9..02c24cf63c34 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -190,6 +190,8 @@ static int __init br_init(void) { int err; + BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); -- cgit v1.2.3 From f4f8e73850589008095b1da3c8c17cf68bd1c62a Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 2 Mar 2015 18:49:56 -0800 Subject: openvswitch: Fix serialization of non-masked set actions. Set actions consist of a regular OVS_KEY_ATTR_* attribute nested inside of a OVS_ACTION_ATTR_SET action attribute. When converting masked actions back to regular set actions, the inner attribute length was not changed, ie, double the length being serialized. This patch fixes the bug. Fixes: 83d2b9b ("net: openvswitch: Support masked set actions.") Signed-off-by: Joe Stringer Acked-by: Jarno Rajahalme Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 216f20b90aa5..22b18c145c92 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2253,14 +2253,20 @@ static int masked_set_action_to_set_action_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); + struct nlattr *nla; size_t key_len = nla_len(ovs_key) / 2; /* Revert the conversion we did from a non-masked set action to * masked set action. 
*/ - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a) - key_len, ovs_key)) + nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!nla) return -EMSGSIZE; + if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) + return -EMSGSIZE; + + nla_nest_end(skb, nla); return 0; } -- cgit v1.2.3