From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 3 Dec 2015 10:18:21 -0500
Subject: cgroup: fix handling of multi-destination migration from
 subtree_control enabling

Consider the following v2 hierarchy.

  P0 (+memory) --- P1 (-memory) --- A
                                 \- B

P0 has memory enabled in its subtree_control while P1 doesn't.  If
both A and B contain processes, they would belong to the memory css of
P1.  Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter.  IOW, enabling controllers
can cause atomic migrations into different csses.

The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses.  pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.

 WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
 Modules linked in:
 CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
 ...
  ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
  ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
  ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
 Call Trace:
  [<ffffffff81551ffc>] dump_stack+0x4e/0x82
  [<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
  [<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
  [<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
  [<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
  [<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
  [<ffffffff81189016>] cgroup_attach_task+0x176/0x200
  [<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
  [<ffffffff81189684>] cgroup_procs_write+0x14/0x20
  [<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
  [<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
  [<ffffffff81265f88>] __vfs_write+0x28/0xe0
  [<ffffffff812666fc>] vfs_write+0xac/0x1a0
  [<ffffffff81267019>] SyS_write+0x49/0xb0
  [<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76

This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated.  All controllers are
updated accordingly.

* Controllers which don't care whether there are one or multiple
  target csses can be converted trivially.  cpu, io, freezer, perf,
  netclassid and netprio fall in this category.

* cpuset's current implementation assumes that there's single source
  and destination and thus doesn't support v2 hierarchy already.  The
  only change made by this patchset is how that single destination css
  is obtained.

* memory migration path already doesn't do anything on v2.  How the
  single destination css is obtained is updated and the prep stage of
  mem_cgroup_can_attach() is reordered to accomodate the change.

* pids is the only controller which was affected by this bug.  It now
  correctly handles multi-destination migrations and no longer causes
  counter underflow from incorrect accounting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
---
 net/core/netclassid_cgroup.c | 11 ++++++-----
 net/core/netprio_cgroup.c    |  9 +++++----
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'net/core')
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 6441f47b1a8f..81cb3c72efe8 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n)
 	return 0;
 }
 
-static void cgrp_attach(struct cgroup_subsys_state *css,
-			struct cgroup_taskset *tset)
+static void cgrp_attach(struct cgroup_taskset *tset)
 {
-	struct cgroup_cls_state *cs = css_cls_state(css);
-	void *v = (void *)(unsigned long)cs->classid;
 	struct task_struct *p;
+	struct cgroup_subsys_state *css;
+
+	cgroup_taskset_for_each(p, css, tset) {
+		struct cgroup_cls_state *cs = css_cls_state(css);
+		void *v = (void *)(unsigned long)cs->classid;
 
-	cgroup_taskset_for_each(p, tset) {
 		task_lock(p);
 		iterate_fd(p->files, 0, update_classid, v);
 		task_unlock(p);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index cbd0a199bf52..40fd09fe06ae 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
 	return 0;
 }
 
-static void net_prio_attach(struct cgroup_subsys_state *css,
-			    struct cgroup_taskset *tset)
+static void net_prio_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *p;
-	void *v = (void *)(unsigned long)css->cgroup->id;
+	struct cgroup_subsys_state *css;
+
+	cgroup_taskset_for_each(p, css, tset) {
+		void *v = (void *)(unsigned long)css->cgroup->id;
 
-	cgroup_taskset_for_each(p, tset) {
 		task_lock(p);
 		iterate_fd(p->files, 0, update_netprio, v);
 		task_unlock(p);
-- 
cgit v1.2.3


From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 4 Dec 2015 15:14:04 -0200
Subject: sctp: update the netstamp_needed counter when copying sockets

Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy
related to disabling sock timestamp.

When SCTP accepts an association or peel one off, it copies sock flags
but forgot to call net_enable_timestamp() if a packet timestamping flag
was copied, leading to extra calls to net_disable_timestamp() whenever
such clones were closed.

The fix is to call net_enable_timestamp() whenever we copy a sock with
that flag on, like tcp does.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 ++
 net/core/sock.c    | 2 --
 net/sctp/socket.c  | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/include/net/sock.h b/include/net/sock.h
index 52d27ee924f4..b1d475b5db68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -740,6 +740,8 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 };
 
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
 {
 	nsk->sk_flags = osk->sk_flags;
diff --git a/net/core/sock.c b/net/core/sock.c
index e31dfcee1729..d01c8f42dbb2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk)
 	}
 }
 
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-
 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
 	if (sk->sk_flags & flags) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 03c8256063ec..4c9282bdd067 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7199,6 +7199,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_ttl = 1;
 	newinet->mc_index = 0;
 	newinet->mc_list = NULL;
+
+	if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+		net_enable_timestamp();
 }
 
 static inline void sctp_copy_descendant(struct sock *sk_to,
-- 
cgit v1.2.3


From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Dec 2015 07:22:02 -0800
Subject: xfrm: add rcu protection to sk->sk_policy[]

XFRM can deal with SYNACK messages, sent while listener socket
is not locked. We add proper rcu protection to __xfrm_sk_clone_policy()
and xfrm_sk_policy_lookup()

This might serve as the first step to remove xfrm.xfrm_policy_lock
use in fast path.

Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     |  2 +-
 include/net/xfrm.h     | 24 +++++++++++++++---------
 net/core/sock.c        |  2 +-
 net/xfrm/xfrm_policy.c | 37 +++++++++++++++++++++++++------------
 4 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'net/core')

diff --git a/include/net/sock.h b/include/net/sock.h
index b1d475b5db68..eaef41433d7a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -388,7 +388,7 @@ struct sock {
 		struct socket_wq	*sk_wq_raw;
 	};
 #ifdef CONFIG_XFRM
-	struct xfrm_policy	*sk_policy[2];
+	struct xfrm_policy __rcu *sk_policy[2];
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8bae1ef647cd..d6f6e5006ee9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1142,12 +1142,14 @@ static inline int xfrm6_route_forward(struct sk_buff *skb)
 	return xfrm_route_forward(skb, AF_INET6);
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk);
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);
 
-static inline int xfrm_sk_clone_policy(struct sock *sk)
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	if (unlikely(sk->sk_policy[0] || sk->sk_policy[1]))
-		return __xfrm_sk_clone_policy(sk);
+	sk->sk_policy[0] = NULL;
+	sk->sk_policy[1] = NULL;
+	if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
+		return __xfrm_sk_clone_policy(sk, osk);
 	return 0;
 }
 
@@ -1155,12 +1157,16 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir);
 
 static inline void xfrm_sk_free_policy(struct sock *sk)
 {
-	if (unlikely(sk->sk_policy[0] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX);
+	struct xfrm_policy *pol;
+
+	pol = rcu_dereference_protected(sk->sk_policy[0], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX);
 		sk->sk_policy[0] = NULL;
 	}
-	if (unlikely(sk->sk_policy[1] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1);
+	pol = rcu_dereference_protected(sk->sk_policy[1], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX+1);
 		sk->sk_policy[1] = NULL;
 	}
 }
@@ -1170,7 +1176,7 @@ void xfrm_garbage_collect(struct net *net);
 #else
 
 static inline void xfrm_sk_free_policy(struct sock *sk) {}
-static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; }
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
 static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }  
 static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } 
 static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
diff --git a/net/core/sock.c b/net/core/sock.c
index d01c8f42dbb2..765be835b06c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			 */
 			is_charged = sk_filter_charge(newsk, filter);
 
-		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
 			/* It is still raw copy of parent, so invalidate
 			 * destructor and make plain sk_free() */
 			newsk->sk_destruct = NULL;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f57a5712cedd..948fa5560de5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1221,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	struct xfrm_policy *pol;
 	struct net *net = sock_net(sk);
 
+	rcu_read_lock();
 	read_lock_bh(&net->xfrm.xfrm_policy_lock);
-	if ((pol = sk->sk_policy[dir]) != NULL) {
+	pol = rcu_dereference(sk->sk_policy[dir]);
+	if (pol != NULL) {
 		bool match = xfrm_selector_match(&pol->selector, fl,
 						 sk->sk_family);
 		int err = 0;
@@ -1246,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	}
 out:
 	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	rcu_read_unlock();
 	return pol;
 }
 
@@ -1314,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 #endif
 
 	write_lock_bh(&net->xfrm.xfrm_policy_lock);
-	old_pol = sk->sk_policy[dir];
-	sk->sk_policy[dir] = pol;
+	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
 		pol->curlft.add_time = get_seconds();
 		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
 		xfrm_sk_policy_link(pol, dir);
 	}
+	rcu_assign_pointer(sk->sk_policy[dir], pol);
 	if (old_pol) {
 		if (pol)
 			xfrm_policy_requeue(old_pol, pol);
@@ -1368,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 	return newp;
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	struct xfrm_policy *p0 = sk->sk_policy[0],
-			   *p1 = sk->sk_policy[1];
+	const struct xfrm_policy *p;
+	struct xfrm_policy *np;
+	int i, ret = 0;
 
-	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
-	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
-		return -ENOMEM;
-	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
-		return -ENOMEM;
-	return 0;
+	rcu_read_lock();
+	for (i = 0; i < 2; i++) {
+		p = rcu_dereference(osk->sk_policy[i]);
+		if (p) {
+			np = clone_policy(p, i);
+			if (unlikely(!np)) {
+				ret = -ENOMEM;
+				break;
+			}
+			rcu_assign_pointer(sk->sk_policy[i], np);
+		}
+	}
+	rcu_read_unlock();
+	return ret;
 }
 
 static int
-- 
cgit v1.2.3


From f654861569872d10dcb79d9d7ca219b316f94ff0 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Mon, 14 Dec 2015 17:44:10 -0500
Subject: skbuff: Fix offset error in skb_reorder_vlan_header

skb_reorder_vlan_header is called after the vlan header has
been pulled.  As a result the offset of the begining of
the mac header has been incrased by 4 bytes (VLAN_HLEN).
When moving the mac addresses, include this incrase in
the offset calcualation so that the mac addresses are
copied correctly.

Fixes: a6e18ff1117 (vlan: Fix untag operations of stacked vlans with REORDER_HEADER off)
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: Vladislav Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 152b9c70e252..5cc43d37a4a2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4268,7 +4268,7 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
 		return NULL;
 	}
 
-	memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len,
+	memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
 		2 * ETH_ALEN);
 	skb->mac_header += VLAN_HLEN;
 	return skb;
-- 
cgit v1.2.3


From ac5cc977991d2dce85fc734a6c71ddb33f6fe3c1 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Wed, 16 Dec 2015 23:39:04 -0800
Subject: net: check both type and procotol for tcp sockets

Dmitry reported the following out-of-bound access:

Call Trace:
 [<ffffffff816cec2e>] __asan_report_load4_noabort+0x3e/0x40
mm/kasan/report.c:294
 [<ffffffff84affb14>] sock_setsockopt+0x1284/0x13d0 net/core/sock.c:880
 [<     inline     >] SYSC_setsockopt net/socket.c:1746
 [<ffffffff84aed7ee>] SyS_setsockopt+0x1fe/0x240 net/socket.c:1729
 [<ffffffff85c18c76>] entry_SYSCALL_64_fastpath+0x16/0x7a
arch/x86/entry/entry_64.S:185

This is because we mistake a raw socket as a tcp socket.
We should check both sk->sk_type and sk->sk_protocol to ensure
it is a tcp socket.

Willem points out __skb_complete_tx_timestamp() needs to fix as well.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 ++-
 net/core/sock.c   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5cc43d37a4a2..b2df375ec9c2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3643,7 +3643,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
 	serr->ee.ee_info = tstype;
 	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
 		serr->ee.ee_data = skb_shinfo(skb)->tskey;
-		if (sk->sk_protocol == IPPROTO_TCP)
+		if (sk->sk_protocol == IPPROTO_TCP &&
+		    sk->sk_type == SOCK_STREAM)
 			serr->ee.ee_data -= sk->sk_tskey;
 	}
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 765be835b06c..0d91f7dca751 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -872,7 +872,8 @@ set_rcvbuf:
 
 		if (val & SOF_TIMESTAMPING_OPT_ID &&
 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
-			if (sk->sk_protocol == IPPROTO_TCP) {
+			if (sk->sk_protocol == IPPROTO_TCP &&
+			    sk->sk_type == SOCK_STREAM) {
 				if (sk->sk_state != TCP_ESTABLISHED) {
 					ret = -EINVAL;
 					break;
-- 
cgit v1.2.3