From b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 8 Apr 2010 23:03:29 +0000
Subject: net: sk_dst_cache RCUification

With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this
work.

sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock)

This rwlock is readlocked for a very small amount of time, and dst
entries are already freed after RCU grace period. This calls for RCU
again :)

This patch converts sk_dst_lock to a spinlock, and use RCU for readers.

__sk_dst_get() is supposed to be called with rcu_read_lock() or if
socket locked by user, so use appropriate rcu_dereference_check()
condition (rcu_read_lock_held() || sock_owned_by_user(sk))

This patch avoids two atomic ops per tx packet on UDP connected sockets,
for example, and permits sk_dst_lock to be much less dirtied.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bbc2cc9..7effa1e689df 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq);
 
 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 {
-	struct dst_entry *dst = sk->sk_dst_cache;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
-		sk->sk_dst_cache = NULL;
+		rcu_assign_pointer(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
 	}
@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		skb_queue_head_init(&newsk->sk_async_wait_queue);
 #endif
 
-		rwlock_init(&newsk->sk_dst_lock);
+		spin_lock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	} else
 		sk->sk_sleep	=	NULL;
 
-	rwlock_init(&sk->sk_dst_lock);
+	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
-- 
cgit v1.2.3


From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Apr 2010 13:03:51 +0000
Subject: net: sk_sleep() helper

Define a new function to return the waitqueue of a "struct sock".

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	return sk->sk_sleep;
}

Change all read occurrences of sk_sleep by a call to this function.

Needed for a future RCU conversion. sk_sleep wont be a field directly
available.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/atmtcp.c            |  6 ++--
 drivers/net/macvtap.c           |  4 +--
 drivers/net/tun.c               |  4 +--
 drivers/scsi/iscsi_tcp.c        |  4 +--
 include/net/sock.h              | 10 +++++--
 include/net/tcp.h               |  2 +-
 net/atm/common.c                | 12 ++++----
 net/atm/signaling.c             |  2 +-
 net/atm/svc.c                   | 62 ++++++++++++++++++++---------------------
 net/ax25/af_ax25.c              |  8 +++---
 net/bluetooth/af_bluetooth.c    |  6 ++--
 net/bluetooth/bnep/core.c       |  8 +++---
 net/bluetooth/bnep/netdev.c     |  6 ++--
 net/bluetooth/cmtp/cmtp.h       |  2 +-
 net/bluetooth/cmtp/core.c       |  4 +--
 net/bluetooth/hidp/core.c       | 10 +++----
 net/bluetooth/hidp/hidp.h       |  4 +--
 net/bluetooth/l2cap.c           |  4 +--
 net/bluetooth/rfcomm/sock.c     |  8 +++---
 net/bluetooth/sco.c             |  4 +--
 net/caif/caif_socket.c          |  2 +-
 net/core/datagram.c             |  6 ++--
 net/core/sock.c                 | 16 +++++------
 net/core/stream.c               | 16 +++++------
 net/dccp/output.c               |  6 ++--
 net/dccp/proto.c                |  2 +-
 net/decnet/af_decnet.c          | 26 ++++++++---------
 net/ipv4/af_inet.c              |  6 ++--
 net/ipv4/inet_connection_sock.c |  4 +--
 net/ipv4/tcp.c                  |  2 +-
 net/irda/af_irda.c              | 14 +++++-----
 net/iucv/af_iucv.c              | 12 ++++----
 net/llc/af_llc.c                | 12 ++++----
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 net/netrom/af_netrom.c          |  8 +++---
 net/rds/af_rds.c                |  2 +-
 net/rds/rds.h                   |  2 +-
 net/rds/recv.c                  |  2 +-
 net/rds/send.c                  |  2 +-
 net/rose/af_rose.c              |  8 +++---
 net/rxrpc/af_rxrpc.c            |  4 +--
 net/sctp/socket.c               | 20 ++++++-------
 net/sunrpc/svcsock.c            | 24 ++++++++--------
 net/tipc/socket.c               | 26 ++++++++---------
 net/unix/af_unix.c              | 10 +++----
 net/x25/af_x25.c                |  8 +++---
 46 files changed, 208 insertions(+), 204 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c
index b86712167eb8..b9101818b47b 100644
--- a/drivers/atm/atmtcp.c
+++ b/drivers/atm/atmtcp.c
@@ -68,7 +68,7 @@ static int atmtcp_send_control(struct atm_vcc *vcc,int type,
 	*(struct atm_vcc **) &new_msg->vcc = vcc;
 	old_test = test_bit(flag,&vcc->flags);
 	out_vcc->push(out_vcc,skb);
-	add_wait_queue(sk_atm(vcc)->sk_sleep, &wait);
+	add_wait_queue(sk_sleep(sk_atm(vcc)), &wait);
 	while (test_bit(flag,&vcc->flags) == old_test) {
 		mb();
 		out_vcc = PRIV(vcc->dev) ? PRIV(vcc->dev)->vcc : NULL;
@@ -80,7 +80,7 @@ static int atmtcp_send_control(struct atm_vcc *vcc,int type,
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk_atm(vcc)->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk_atm(vcc)), &wait);
 	return error;
 }
 
@@ -105,7 +105,7 @@ static int atmtcp_recv_control(const struct atmtcp_control *msg)
 		    msg->type);
 		return -EINVAL;
 	}
-	wake_up(sk_atm(vcc)->sk_sleep);
+	wake_up(sk_sleep(sk_atm(vcc)));
 	return 0;
 }
 
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index abba3cc81f12..85d6420f8404 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -246,8 +246,8 @@ static void macvtap_sock_write_space(struct sock *sk)
 	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | POLLWRNORM | POLLWRBAND);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
 static int macvtap_open(struct inode *inode, struct file *file)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 43265207d463..20a17938c62b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -868,8 +868,8 @@ static void tun_sock_write_space(struct sock *sk)
 	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 	tun = tun_sk(sk)->tun;
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 0ee725ced511..9eae04afa9a0 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -599,9 +599,9 @@ static void iscsi_sw_tcp_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
 	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
 	write_unlock_bh(&tcp_sw_conn->sock->sk->sk_callback_lock);
 
-	if (sock->sk->sk_sleep && waitqueue_active(sock->sk->sk_sleep)) {
+	if (sk_sleep(sock->sk) && waitqueue_active(sk_sleep(sock->sk))) {
 		sock->sk->sk_err = EIO;
-		wake_up_interruptible(sock->sk->sk_sleep);
+		wake_up_interruptible(sk_sleep(sock->sk));
 	}
 
 	iscsi_conn_stop(cls_conn, flag);
diff --git a/include/net/sock.h b/include/net/sock.h
index 56df440a950b..8ab05146a447 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1160,6 +1160,10 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 	sk->sk_socket = sock;
 }
 
+static inline wait_queue_head_t *sk_sleep(struct sock *sk)
+{
+	return sk->sk_sleep;
+}
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
  * Note that parent inode held reference count on this struct sock,
@@ -1346,8 +1350,8 @@ static inline int sk_has_allocations(const struct sock *sk)
  *   tp->rcv_nxt check   sock_def_readable
  *   ...                 {
  *   schedule               ...
- *                          if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- *                              wake_up_interruptible(sk->sk_sleep)
+ *                          if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+ *                              wake_up_interruptible(sk_sleep(sk))
  *                          ...
  *                       }
  *
@@ -1368,7 +1372,7 @@ static inline int sk_has_sleeper(struct sock *sk)
 	 * This memory barrier is paired in the sock_poll_wait.
 	 */
 	smp_mb__after_lock();
-	return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
+	return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
 }
 
 /**
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70c5159f4b36..b7d83d204a93 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -939,7 +939,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 
 		tp->ucopy.memory = 0;
 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
-		wake_up_interruptible_sync_poll(sk->sk_sleep,
+		wake_up_interruptible_sync_poll(sk_sleep(sk),
 					   POLLIN | POLLRDNORM | POLLRDBAND);
 		if (!inet_csk_ack_scheduled(sk))
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
diff --git a/net/atm/common.c b/net/atm/common.c
index 97ed94aa0cbc..e3e10e6f8628 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -92,7 +92,7 @@ static void vcc_def_wakeup(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk_has_sleeper(sk))
-		wake_up(sk->sk_sleep);
+		wake_up(sk_sleep(sk));
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -110,7 +110,7 @@ static void vcc_write_space(struct sock *sk)
 
 	if (vcc_writable(sk)) {
 		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk->sk_sleep);
+			wake_up_interruptible(sk_sleep(sk));
 
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
@@ -549,7 +549,7 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 	}
 
 	eff = (size+3) & ~3; /* align to word boundary */
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	error = 0;
 	while (!(skb = alloc_tx(vcc, eff))) {
 		if (m->msg_flags & MSG_DONTWAIT) {
@@ -568,9 +568,9 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 			send_sig(SIGPIPE, current, 0);
 			break;
 		}
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (error)
 		goto out;
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
@@ -595,7 +595,7 @@ unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct atm_vcc *vcc;
 	unsigned int mask;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	vcc = ATM_SD(sock);
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 6ba6e466ee54..509c8ac02b63 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -131,7 +131,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
 		}
 		sk->sk_ack_backlog++;
 		skb_queue_tail(&sk->sk_receive_queue, skb);
-		pr_debug("waking sk->sk_sleep 0x%p\n", sk->sk_sleep);
+		pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
 		sk->sk_state_change(sk);
 as_indicate_complete:
 		release_sock(sk);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 3ba9a45a51ac..754ee4791d96 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -49,14 +49,14 @@ static void svc_disconnect(struct atm_vcc *vcc)
 
 	pr_debug("%p\n", vcc);
 	if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 		sigd_enq(vcc, as_close, NULL, NULL, NULL);
 		while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
 			schedule();
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_UNINTERRUPTIBLE);
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 	/* beware - socket is still in use by atmsigd until the last
 	   as_indicate has been answered */
@@ -125,13 +125,13 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
 	}
 	vcc->local = *addr;
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
 	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 		schedule();
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
 	if (!sigd) {
 		error = -EUNATCH;
@@ -201,10 +201,10 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
 		}
 		vcc->remote = *addr;
 		set_bit(ATM_VF_WAITING, &vcc->flags);
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
 		if (flags & O_NONBLOCK) {
-			finish_wait(sk->sk_sleep, &wait);
+			finish_wait(sk_sleep(sk), &wait);
 			sock->state = SS_CONNECTING;
 			error = -EINPROGRESS;
 			goto out;
@@ -213,7 +213,7 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
 		while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 			schedule();
 			if (!signal_pending(current)) {
-				prepare_to_wait(sk->sk_sleep, &wait,
+				prepare_to_wait(sk_sleep(sk), &wait,
 						TASK_INTERRUPTIBLE);
 				continue;
 			}
@@ -232,14 +232,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
 			 */
 			sigd_enq(vcc, as_close, NULL, NULL, NULL);
 			while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-				prepare_to_wait(sk->sk_sleep, &wait,
+				prepare_to_wait(sk_sleep(sk), &wait,
 						TASK_INTERRUPTIBLE);
 				schedule();
 			}
 			if (!sk->sk_err)
 				while (!test_bit(ATM_VF_RELEASED, &vcc->flags) &&
 				       sigd) {
-					prepare_to_wait(sk->sk_sleep, &wait,
+					prepare_to_wait(sk_sleep(sk), &wait,
 							TASK_INTERRUPTIBLE);
 					schedule();
 				}
@@ -250,7 +250,7 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
 			error = -EINTR;
 			break;
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 		if (error)
 			goto out;
 		if (!sigd) {
@@ -302,13 +302,13 @@ static int svc_listen(struct socket *sock, int backlog)
 		goto out;
 	}
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
 	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 		schedule();
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd) {
 		error = -EUNATCH;
 		goto out;
@@ -343,7 +343,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 	while (1) {
 		DEFINE_WAIT(wait);
 
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		while (!(skb = skb_dequeue(&sk->sk_receive_queue)) &&
 		       sigd) {
 			if (test_bit(ATM_VF_RELEASED, &old_vcc->flags))
@@ -363,10 +363,10 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 				error = -ERESTARTSYS;
 				break;
 			}
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_INTERRUPTIBLE);
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 		if (error)
 			goto out;
 		if (!skb) {
@@ -392,17 +392,17 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 		}
 		/* wait should be short, so we ignore the non-blocking flag */
 		set_bit(ATM_VF_WAITING, &new_vcc->flags);
-		prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait,
+		prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
 				TASK_UNINTERRUPTIBLE);
 		sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
 		while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
 			release_sock(sk);
 			schedule();
 			lock_sock(sk);
-			prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
 					TASK_UNINTERRUPTIBLE);
 		}
-		finish_wait(sk_atm(new_vcc)->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
 		if (!sigd) {
 			error = -EUNATCH;
 			goto out;
@@ -438,14 +438,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
 	DEFINE_WAIT(wait);
 
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
 	while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
 	       !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
 		schedule();
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd)
 		return -EUNATCH;
 	return -sk->sk_err;
@@ -534,20 +534,20 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
 
 	lock_sock(sk);
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	sigd_enq(vcc, as_addparty, NULL, NULL,
 		 (struct sockaddr_atmsvc *) sockaddr);
 	if (flags & O_NONBLOCK) {
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 		error = -EINPROGRESS;
 		goto out;
 	}
 	pr_debug("added wait queue\n");
 	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 		schedule();
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	error = xchg(&sk->sk_err_soft, 0);
 out:
 	release_sock(sk);
@@ -563,13 +563,13 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
 
 	lock_sock(sk);
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
 	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 		schedule();
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd) {
 		error = -EUNATCH;
 		goto out;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 65c5801261f9..cfdfd7e2a172 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1281,7 +1281,7 @@ static int __must_check ax25_connect(struct socket *sock,
 		DEFINE_WAIT(wait);
 
 		for (;;) {
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_INTERRUPTIBLE);
 			if (sk->sk_state != TCP_SYN_SENT)
 				break;
@@ -1294,7 +1294,7 @@ static int __must_check ax25_connect(struct socket *sock,
 			err = -ERESTARTSYS;
 			break;
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 
 		if (err)
 			goto out_release;
@@ -1346,7 +1346,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
 	 *	hooked into the SABM we saved
 	 */
 	for (;;) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (skb)
 			break;
@@ -1364,7 +1364,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
 		err = -ERESTARTSYS;
 		break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 
 	if (err)
 		goto out;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 404a8500fd03..421c45bd1b95 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -288,7 +288,7 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
@@ -378,7 +378,7 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
 
 	BT_DBG("sk %p", sk);
 
-	add_wait_queue(sk->sk_sleep, &wait);
+	add_wait_queue(sk_sleep(sk), &wait);
 	while (sk->sk_state != state) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -401,7 +401,7 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
 			break;
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 	return err;
 }
 EXPORT_SYMBOL(bt_sock_wait_state);
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 8062dad6d10d..f10b41fb05a0 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -474,7 +474,7 @@ static int bnep_session(void *arg)
 	set_user_nice(current, -15);
 
 	init_waitqueue_entry(&wait, current);
-	add_wait_queue(sk->sk_sleep, &wait);
+	add_wait_queue(sk_sleep(sk), &wait);
 	while (!atomic_read(&s->killed)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -496,7 +496,7 @@ static int bnep_session(void *arg)
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	/* Cleanup session */
 	down_write(&bnep_session_sem);
@@ -507,7 +507,7 @@ static int bnep_session(void *arg)
 	/* Wakeup user-space polling for socket errors */
 	s->sock->sk->sk_err = EUNATCH;
 
-	wake_up_interruptible(s->sock->sk->sk_sleep);
+	wake_up_interruptible(sk_sleep(s->sock->sk));
 
 	/* Release the socket */
 	fput(s->sock->file);
@@ -638,7 +638,7 @@ int bnep_del_connection(struct bnep_conndel_req *req)
 
 		/* Kill session thread */
 		atomic_inc(&s->killed);
-		wake_up_interruptible(s->sock->sk->sk_sleep);
+		wake_up_interruptible(sk_sleep(s->sock->sk));
 	} else
 		err = -ENOENT;
 
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index d48b33f4d4ba..0faad5ce6dc4 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -109,7 +109,7 @@ static void bnep_net_set_mc_list(struct net_device *dev)
 	}
 
 	skb_queue_tail(&sk->sk_write_queue, skb);
-	wake_up_interruptible(sk->sk_sleep);
+	wake_up_interruptible(sk_sleep(sk));
 #endif
 }
 
@@ -193,11 +193,11 @@ static netdev_tx_t bnep_net_xmit(struct sk_buff *skb,
 	/*
 	 * We cannot send L2CAP packets from here as we are potentially in a bh.
 	 * So we have to queue them and wake up session thread which is sleeping
-	 * on the sk->sk_sleep.
+	 * on the sk_sleep(sk).
 	 */
 	dev->trans_start = jiffies;
 	skb_queue_tail(&sk->sk_write_queue, skb);
-	wake_up_interruptible(sk->sk_sleep);
+	wake_up_interruptible(sk_sleep(sk));
 
 	if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) {
 		BT_DBG("tx queue is full");
diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h
index e4663aa14d26..785e79e953c5 100644
--- a/net/bluetooth/cmtp/cmtp.h
+++ b/net/bluetooth/cmtp/cmtp.h
@@ -125,7 +125,7 @@ static inline void cmtp_schedule(struct cmtp_session *session)
 {
 	struct sock *sk = session->sock->sk;
 
-	wake_up_interruptible(sk->sk_sleep);
+	wake_up_interruptible(sk_sleep(sk));
 }
 
 /* CMTP init defines */
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 0073ec8495da..d4c6af082d48 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -284,7 +284,7 @@ static int cmtp_session(void *arg)
 	set_user_nice(current, -15);
 
 	init_waitqueue_entry(&wait, current);
-	add_wait_queue(sk->sk_sleep, &wait);
+	add_wait_queue(sk_sleep(sk), &wait);
 	while (!atomic_read(&session->terminate)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -301,7 +301,7 @@ static int cmtp_session(void *arg)
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	down_write(&cmtp_session_sem);
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 280529ad9274..bfe641b7dfaf 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -561,8 +561,8 @@ static int hidp_session(void *arg)
 
 	init_waitqueue_entry(&ctrl_wait, current);
 	init_waitqueue_entry(&intr_wait, current);
-	add_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait);
-	add_wait_queue(intr_sk->sk_sleep, &intr_wait);
+	add_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait);
+	add_wait_queue(sk_sleep(intr_sk), &intr_wait);
 	while (!atomic_read(&session->terminate)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -584,8 +584,8 @@ static int hidp_session(void *arg)
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(intr_sk->sk_sleep, &intr_wait);
-	remove_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait);
+	remove_wait_queue(sk_sleep(intr_sk), &intr_wait);
+	remove_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait);
 
 	down_write(&hidp_session_sem);
 
@@ -609,7 +609,7 @@ static int hidp_session(void *arg)
 
 	fput(session->intr_sock->file);
 
-	wait_event_timeout(*(ctrl_sk->sk_sleep),
+	wait_event_timeout(*(sk_sleep(ctrl_sk)),
 		(ctrl_sk->sk_state == BT_CLOSED), msecs_to_jiffies(500));
 
 	fput(session->ctrl_sock->file);
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index a4e215d50c10..8d934a19da0a 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -164,8 +164,8 @@ static inline void hidp_schedule(struct hidp_session *session)
 	struct sock *ctrl_sk = session->ctrl_sock->sk;
 	struct sock *intr_sk = session->intr_sock->sk;
 
-	wake_up_interruptible(ctrl_sk->sk_sleep);
-	wake_up_interruptible(intr_sk->sk_sleep);
+	wake_up_interruptible(sk_sleep(ctrl_sk));
+	wake_up_interruptible(sk_sleep(intr_sk));
 }
 
 /* HIDP init defines */
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 99d68c34e4f1..c1e60eed5a97 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1147,7 +1147,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int fl
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
 	/* Wait for an incoming connection. (wake-one). */
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	while (!(nsk = bt_accept_dequeue(sk, newsock))) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!timeo) {
@@ -1170,7 +1170,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int fl
 		}
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	if (err)
 		goto done;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 8ed3c37684fa..43fbf6b4b4bf 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -503,7 +503,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
 	/* Wait for an incoming connection. (wake-one). */
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	while (!(nsk = bt_accept_dequeue(sk, newsock))) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!timeo) {
@@ -526,7 +526,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
 		}
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	if (err)
 		goto done;
@@ -621,7 +621,7 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
-	add_wait_queue(sk->sk_sleep, &wait);
+	add_wait_queue(sk_sleep(sk), &wait);
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -640,7 +640,7 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 	return timeo;
 }
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ca6b2ad1c3fc..b406d3eff53a 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -567,7 +567,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flag
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
 	/* Wait for an incoming connection. (wake-one). */
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	while (!(ch = bt_accept_dequeue(sk, newsock))) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!timeo) {
@@ -590,7 +590,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flag
 		}
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	if (err)
 		goto done;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index cdf62b9fefac..90317e7d10b4 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -689,7 +689,7 @@ static unsigned int caif_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
 	u32 mask = 0;
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 	lock_sock(&(cf_sk->sk));
 	if (!STATE_IS_OPEN(cf_sk)) {
 		if (!STATE_IS_PENDING(cf_sk))
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 2dccd4ee591b..5574a5ddf908 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -86,7 +86,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 	int error;
 	DEFINE_WAIT_FUNC(wait, receiver_wake_function);
 
-	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Socket errors? */
 	error = sock_error(sk);
@@ -115,7 +115,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 	error = 0;
 	*timeo_p = schedule_timeout(*timeo_p);
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return error;
 interrupted:
 	error = sock_intr_errno(*timeo_p);
@@ -726,7 +726,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/core/sock.c b/net/core/sock.c
index 7effa1e689df..58ebd146ce5a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1395,7 +1395,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 		if (signal_pending(current))
 			break;
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
 			break;
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
@@ -1404,7 +1404,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 			break;
 		timeo = schedule_timeout(timeo);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return timeo;
 }
 
@@ -1570,11 +1570,11 @@ int sk_wait_data(struct sock *sk, long *timeo)
 	int rc;
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return rc;
 }
 EXPORT_SYMBOL(sk_wait_data);
@@ -1798,7 +1798,7 @@ static void sock_def_wakeup(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk->sk_sleep);
+		wake_up_interruptible_all(sk_sleep(sk));
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1806,7 +1806,7 @@ static void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk_has_sleeper(sk))
-		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
+		wake_up_interruptible_poll(sk_sleep(sk), POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1815,7 +1815,7 @@ static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk_has_sleeper(sk))
-		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
@@ -1830,7 +1830,7 @@ static void sock_def_write_space(struct sock *sk)
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+			wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
diff --git a/net/core/stream.c b/net/core/stream.c
index a37debfeb1b2..7b3c3f30b107 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -32,8 +32,8 @@ void sk_stream_write_space(struct sock *sk)
 	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
-		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible_poll(sk->sk_sleep, POLLOUT |
+		if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+			wake_up_interruptible_poll(sk_sleep(sk), POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
@@ -66,13 +66,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 		if (signal_pending(tsk))
 			return sock_intr_errno(*timeo_p);
 
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		sk->sk_write_pending++;
 		done = sk_wait_event(sk, timeo_p,
 				     !sk->sk_err &&
 				     !((1 << sk->sk_state) &
 				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 		sk->sk_write_pending--;
 	} while (!done);
 	return 0;
@@ -96,13 +96,13 @@ void sk_stream_wait_close(struct sock *sk, long timeout)
 		DEFINE_WAIT(wait);
 
 		do {
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_INTERRUPTIBLE);
 			if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
 				break;
 		} while (!signal_pending(current) && timeout);
 
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 }
 
@@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 	while (1) {
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 			goto do_error;
@@ -157,7 +157,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		*timeo_p = current_timeo;
 	}
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return err;
 
 do_error:
diff --git a/net/dccp/output.c b/net/dccp/output.c
index e98b65e9569f..2d3dcb39851f 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -198,7 +198,7 @@ void dccp_write_space(struct sock *sk)
 	read_lock(&sk->sk_callback_lock);
 
 	if (sk_has_sleeper(sk))
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible(sk_sleep(sk));
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
@@ -225,7 +225,7 @@ static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
 		dccp_pr_debug("delayed send by %d msec\n", delay);
 		jiffdelay = msecs_to_jiffies(delay);
 
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 		sk->sk_write_pending++;
 		release_sock(sk);
@@ -241,7 +241,7 @@ static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
 		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 	} while ((delay = rc) > 0);
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return rc;
 
 do_error:
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index a0e38d8018f5..b03ecf6b2bb0 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,7 +312,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
 	unsigned int mask;
 	struct sock *sk = sock->sk;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == DCCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 55e3b6b0061a..d6b93d19790f 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -832,7 +832,7 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
 	scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS);
 	dn_send_conn_conf(sk, allocation);
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	for(;;) {
 		release_sock(sk);
 		if (scp->state == DN_CC)
@@ -850,9 +850,9 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
 		err = -EAGAIN;
 		if (!*timeo)
 			break;
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (err == 0) {
 		sk->sk_socket->state = SS_CONNECTED;
 	} else if (scp->state != DN_CC) {
@@ -873,7 +873,7 @@ static int dn_wait_run(struct sock *sk, long *timeo)
 	if (!*timeo)
 		return -EALREADY;
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	for(;;) {
 		release_sock(sk);
 		if (scp->state == DN_CI || scp->state == DN_CC)
@@ -891,9 +891,9 @@ static int dn_wait_run(struct sock *sk, long *timeo)
 		err = -ETIMEDOUT;
 		if (!*timeo)
 			break;
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 out:
 	if (err == 0) {
 		sk->sk_socket->state = SS_CONNECTED;
@@ -1040,7 +1040,7 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
 	struct sk_buff *skb = NULL;
 	int err = 0;
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	for(;;) {
 		release_sock(sk);
 		skb = skb_dequeue(&sk->sk_receive_queue);
@@ -1060,9 +1060,9 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
 		err = -EAGAIN;
 		if (!*timeo)
 			break;
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 
 	return skb == NULL ? ERR_PTR(err) : skb;
 }
@@ -1746,11 +1746,11 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
 			goto out;
 		}
 
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 		sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 
 	skb_queue_walk_safe(queue, skb, n) {
@@ -2003,12 +2003,12 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
 				goto out;
 			}
 
-			prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+			prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 			set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 			sk_wait_event(sk, &timeo,
 				      !dn_queue_too_long(scp, queue, flags));
 			clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-			finish_wait(sk->sk_sleep, &wait);
+			finish_wait(sk_sleep(sk), &wait);
 			continue;
 		}
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c5376c725503..5ca7290c2e61 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -548,7 +548,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 {
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
 	 * change state of the socket from TCP_SYN_*.
@@ -561,9 +561,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 		lock_sock(sk);
 		if (signal_pending(current) || !timeo)
 			break;
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return timeo;
 }
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8da6429269dd..e0a3e3537b14 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -234,7 +234,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -253,7 +253,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 		if (!timeo)
 			break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return err;
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0f8caf64caa3..77208334a613 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -378,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct sock *sk = sock->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 2a4efcea3423..79986a674f6e 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -347,7 +347,7 @@ static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
 		self->tx_flow = flow;
 		IRDA_DEBUG(1, "%s(), IrTTP wants us to start again\n",
 			   __func__);
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible(sk_sleep(sk));
 		break;
 	default:
 		IRDA_DEBUG(0, "%s(), Unknown flow command!\n", __func__);
@@ -900,7 +900,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
 		if (flags & O_NONBLOCK)
 			goto out;
 
-		err = wait_event_interruptible(*(sk->sk_sleep),
+		err = wait_event_interruptible(*(sk_sleep(sk)),
 					skb_peek(&sk->sk_receive_queue));
 		if (err)
 			goto out;
@@ -1066,7 +1066,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 
 	err = -ERESTARTSYS;
-	if (wait_event_interruptible(*(sk->sk_sleep),
+	if (wait_event_interruptible(*(sk_sleep(sk)),
 				     (sk->sk_state != TCP_SYN_SENT)))
 		goto out;
 
@@ -1318,7 +1318,7 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	/* Check if IrTTP is wants us to slow down */
 
-	if (wait_event_interruptible(*(sk->sk_sleep),
+	if (wait_event_interruptible(*(sk_sleep(sk)),
 	    (self->tx_flow != FLOW_STOP  ||  sk->sk_state != TCP_ESTABLISHED))) {
 		err = -ERESTARTSYS;
 		goto out;
@@ -1477,7 +1477,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 			if (copied >= target)
 				break;
 
-			prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+			prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 			/*
 			 *	POSIX 1003.1g mandates this order.
@@ -1497,7 +1497,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 				/* Wait process until data arrives */
 				schedule();
 
-			finish_wait(sk->sk_sleep, &wait);
+			finish_wait(sk_sleep(sk), &wait);
 
 			if (err)
 				goto out;
@@ -1787,7 +1787,7 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
 	IRDA_DEBUG(4, "%s()\n", __func__);
 
 	lock_kernel();
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* Exceptional events? */
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index c18286a2167b..9636b7d27b48 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -59,7 +59,7 @@ do {									\
 	DEFINE_WAIT(__wait);						\
 	long __timeo = timeo;						\
 	ret = 0;							\
-	prepare_to_wait(sk->sk_sleep, &__wait, TASK_INTERRUPTIBLE);	\
+	prepare_to_wait(sk_sleep(sk), &__wait, TASK_INTERRUPTIBLE);	\
 	while (!(condition)) {						\
 		if (!__timeo) {						\
 			ret = -EAGAIN;					\
@@ -76,7 +76,7 @@ do {									\
 		if (ret)						\
 			break;						\
 	}								\
-	finish_wait(sk->sk_sleep, &__wait);				\
+	finish_wait(sk_sleep(sk), &__wait);				\
 } while (0)
 
 #define iucv_sock_wait(sk, condition, timeo)				\
@@ -307,7 +307,7 @@ static void iucv_sock_wake_msglim(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk->sk_sleep);
+		wake_up_interruptible_all(sk_sleep(sk));
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -795,7 +795,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 	/* Wait for an incoming connection */
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	while (!(nsk = iucv_accept_dequeue(sk, newsock))) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!timeo) {
@@ -819,7 +819,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
 	}
 
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 
 	if (err)
 		goto done;
@@ -1269,7 +1269,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	unsigned int mask = 0;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	if (sk->sk_state == IUCV_LISTEN)
 		return iucv_accept_poll(sk);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 2db6a9f75913..023ba820236f 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -536,7 +536,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
 	int rc = 0;
 
 	while (1) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
 			break;
 		rc = -ERESTARTSYS;
@@ -547,7 +547,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
 			break;
 		rc = 0;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return rc;
 }
 
@@ -556,13 +556,13 @@ static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
 	DEFINE_WAIT(wait);
 
 	while (1) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
 			break;
 		if (signal_pending(current) || !timeout)
 			break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return timeout;
 }
 
@@ -573,7 +573,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
 	int rc;
 
 	while (1) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		rc = 0;
 		if (sk_wait_event(sk, &timeout,
 				  (sk->sk_shutdown & RCV_SHUTDOWN) ||
@@ -588,7 +588,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
 		if (!timeout)
 			break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return rc;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 8fb0ae616761..7ba06939829f 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -802,7 +802,7 @@ static int sync_thread_backup(void *data)
 		ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
 
 	while (!kthread_should_stop()) {
-		wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
+		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
 			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
 			 || kthread_should_stop());
 
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index fa07f044b599..06cb02796a0e 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -739,7 +739,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
 		DEFINE_WAIT(wait);
 
 		for (;;) {
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_INTERRUPTIBLE);
 			if (sk->sk_state != TCP_SYN_SENT)
 				break;
@@ -752,7 +752,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
 			err = -ERESTARTSYS;
 			break;
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 		if (err)
 			goto out_release;
 	}
@@ -798,7 +798,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
 	 *	hooked into the SABM we saved
 	 */
 	for (;;) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (skb)
 			break;
@@ -816,7 +816,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
 		err = -ERESTARTSYS;
 		break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (err)
 		goto out_release;
 
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 7919a9edb8e9..aebfecbdb841 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -158,7 +158,7 @@ static unsigned int rds_poll(struct file *file, struct socket *sock,
 	unsigned int mask = 0;
 	unsigned long flags;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	if (rs->rs_seen_congestion)
 		poll_wait(file, &rds_poll_waitq, wait);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4bec6e2ed495..c224b5bb3ba9 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -492,7 +492,7 @@ void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
 static inline void __rds_wake_sk_sleep(struct sock *sk)
 {
-	wait_queue_head_t *waitq = sk->sk_sleep;
+	wait_queue_head_t *waitq = sk_sleep(sk);
 
 	if (!sock_flag(sk, SOCK_DEAD) && waitq)
 		wake_up(waitq);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index e2a2b9344f7b..795a00b7f2cb 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -432,7 +432,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 				break;
 			}
 
-			timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
 					(!list_empty(&rs->rs_notify_queue) ||
 					 rs->rs_cong_notify ||
 					 rds_next_incoming(rs, &inc)), timeo);
diff --git a/net/rds/send.c b/net/rds/send.c
index 53d6795ac9d0..9c1c6bcaa6c9 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -915,7 +915,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 			goto out;
 		}
 
-		timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
 					rds_send_queue_rm(rs, conn, rm,
 							  rs->rs_bound_port,
 							  dport,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 4fb711a035f4..8e45e76a95f5 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -845,7 +845,7 @@ rose_try_next_neigh:
 		DEFINE_WAIT(wait);
 
 		for (;;) {
-			prepare_to_wait(sk->sk_sleep, &wait,
+			prepare_to_wait(sk_sleep(sk), &wait,
 					TASK_INTERRUPTIBLE);
 			if (sk->sk_state != TCP_SYN_SENT)
 				break;
@@ -858,7 +858,7 @@ rose_try_next_neigh:
 			err = -ERESTARTSYS;
 			break;
 		}
-		finish_wait(sk->sk_sleep, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 
 		if (err)
 			goto out_release;
@@ -911,7 +911,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
 	 *	hooked into the SABM we saved
 	 */
 	for (;;) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (skb)
@@ -930,7 +930,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
 		err = -ERESTARTSYS;
 		break;
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	if (err)
 		goto out_release;
 
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c060095b27ce..c432d76f415e 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk)
 	read_lock(&sk->sk_callback_lock);
 	if (rxrpc_writable(sk)) {
 		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk->sk_sleep);
+			wake_up_interruptible(sk_sleep(sk));
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	read_unlock(&sk->sk_callback_lock);
@@ -589,7 +589,7 @@ static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
 	unsigned int mask;
 	struct sock *sk = sock->sk;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* the socket is readable if there are any messages waiting on the Rx
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c1941276f6e3..f34adcca8a8c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5702,7 +5702,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct sctp_sock *sp = sctp_sk(sk);
 	unsigned int mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	/* A TCP-style listening socket becomes readable when the accept queue
 	 * is not empty.
@@ -5943,7 +5943,7 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p)
 	int error;
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Socket errors? */
 	error = sock_error(sk);
@@ -5980,14 +5980,14 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p)
 	sctp_lock_sock(sk);
 
 ready:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return 0;
 
 interrupted:
 	error = sock_intr_errno(*timeo_p);
 
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	*err = error;
 	return error;
 }
@@ -6061,8 +6061,8 @@ static void __sctp_write_space(struct sctp_association *asoc)
 			wake_up_interruptible(&asoc->wait);
 
 		if (sctp_writeable(sk)) {
-			if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-				wake_up_interruptible(sk->sk_sleep);
+			if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+				wake_up_interruptible(sk_sleep(sk));
 
 			/* Note that we try to include the Async I/O support
 			 * here by modeling from the current TCP/UDP code.
@@ -6296,7 +6296,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo)
 
 
 	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 
 		if (list_empty(&ep->asocs)) {
@@ -6322,7 +6322,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo)
 			break;
 	}
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 
 	return err;
 }
@@ -6332,7 +6332,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout)
 	DEFINE_WAIT(wait);
 
 	do {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		if (list_empty(&sctp_sk(sk)->ep->asocs))
 			break;
 		sctp_release_sock(sk);
@@ -6340,7 +6340,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout)
 		sctp_lock_sock(sk);
 	} while (!signal_pending(current) && timeout);
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 }
 
 static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a29f259204e6..ce0d5b35c2ac 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -419,8 +419,8 @@ static void svc_udp_data_ready(struct sock *sk, int count)
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
 }
 
 /*
@@ -436,10 +436,10 @@ static void svc_write_space(struct sock *sk)
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
 
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) {
 		dprintk("RPC svc_write_space: someone sleeping on %p\n",
 		       svsk);
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible(sk_sleep(sk));
 	}
 }
 
@@ -757,8 +757,8 @@ static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
 			printk("svc: socket %p: no user data\n", sk);
 	}
 
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_all(sk->sk_sleep);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible_all(sk_sleep(sk));
 }
 
 /*
@@ -777,8 +777,8 @@ static void svc_tcp_state_change(struct sock *sk)
 		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_all(sk->sk_sleep);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible_all(sk_sleep(sk));
 }
 
 static void svc_tcp_data_ready(struct sock *sk, int count)
@@ -791,8 +791,8 @@ static void svc_tcp_data_ready(struct sock *sk, int count)
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		svc_xprt_enqueue(&svsk->sk_xprt);
 	}
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
 }
 
 /*
@@ -1494,8 +1494,8 @@ static void svc_sock_detach(struct svc_xprt *xprt)
 	sk->sk_data_ready = svsk->sk_odata;
 	sk->sk_write_space = svsk->sk_owspace;
 
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
 }
 
 /*
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index cfb20b80b3a1..66e889ba48fd 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -446,7 +446,7 @@ static unsigned int poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	u32 mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
 	    (sock->state == SS_UNCONNECTED) ||
@@ -591,7 +591,7 @@ static int send_msg(struct kiocb *iocb, struct socket *sock,
 			break;
 		}
 		release_sock(sk);
-		res = wait_event_interruptible(*sk->sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(sk),
 					       !tport->congested);
 		lock_sock(sk);
 		if (res)
@@ -650,7 +650,7 @@ static int send_packet(struct kiocb *iocb, struct socket *sock,
 			break;
 		}
 		release_sock(sk);
-		res = wait_event_interruptible(*sk->sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(sk),
 			(!tport->congested || !tport->connected));
 		lock_sock(sk);
 		if (res)
@@ -931,7 +931,7 @@ restart:
 			goto exit;
 		}
 		release_sock(sk);
-		res = wait_event_interruptible(*sk->sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(sk),
 			(!skb_queue_empty(&sk->sk_receive_queue) ||
 			 (sock->state == SS_DISCONNECTING)));
 		lock_sock(sk);
@@ -1064,7 +1064,7 @@ restart:
 			goto exit;
 		}
 		release_sock(sk);
-		res = wait_event_interruptible(*sk->sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(sk),
 			(!skb_queue_empty(&sk->sk_receive_queue) ||
 			 (sock->state == SS_DISCONNECTING)));
 		lock_sock(sk);
@@ -1271,8 +1271,8 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
 		tipc_disconnect_port(tipc_sk_port(sk));
 	}
 
-	if (waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+	if (waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
 	return TIPC_OK;
 }
 
@@ -1343,8 +1343,8 @@ static void wakeupdispatch(struct tipc_port *tport)
 {
 	struct sock *sk = (struct sock *)tport->usr_handle;
 
-	if (waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+	if (waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
 }
 
 /**
@@ -1426,7 +1426,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
 	/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
 
 	release_sock(sk);
-	res = wait_event_interruptible_timeout(*sk->sk_sleep,
+	res = wait_event_interruptible_timeout(*sk_sleep(sk),
 			(!skb_queue_empty(&sk->sk_receive_queue) ||
 			(sock->state != SS_CONNECTING)),
 			sk->sk_rcvtimeo);
@@ -1521,7 +1521,7 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags)
 			goto exit;
 		}
 		release_sock(sk);
-		res = wait_event_interruptible(*sk->sk_sleep,
+		res = wait_event_interruptible(*sk_sleep(sk),
 				(!skb_queue_empty(&sk->sk_receive_queue)));
 		lock_sock(sk);
 		if (res)
@@ -1632,8 +1632,8 @@ restart:
 		/* Discard any unreceived messages; wake up sleeping tasks */
 
 		discard_rx_queue(sk);
-		if (waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible(sk->sk_sleep);
+		if (waitqueue_active(sk_sleep(sk)))
+			wake_up_interruptible(sk_sleep(sk));
 		res = 0;
 		break;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3d9122e78f41..87c0360eaa25 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -316,7 +316,7 @@ static void unix_write_space(struct sock *sk)
 	read_lock(&sk->sk_callback_lock);
 	if (unix_writable(sk)) {
 		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk->sk_sleep);
+			wake_up_interruptible_sync(sk_sleep(sk));
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	read_unlock(&sk->sk_callback_lock);
@@ -1736,7 +1736,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo)
 	unix_state_lock(sk);
 
 	for (;;) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 		if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		    sk->sk_err ||
@@ -1752,7 +1752,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo)
 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	}
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	unix_state_unlock(sk);
 	return timeo;
 }
@@ -1991,7 +1991,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
@@ -2028,7 +2028,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index cbddd0cb83f1..6cffbc4da029 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -718,7 +718,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk)
 	DECLARE_WAITQUEUE(wait, current);
 	int rc;
 
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	for (;;) {
 		__set_current_state(TASK_INTERRUPTIBLE);
 		rc = -ERESTARTSYS;
@@ -738,7 +738,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk)
 			break;
 	}
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 	return rc;
 }
 
@@ -838,7 +838,7 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
 	DECLARE_WAITQUEUE(wait, current);
 	int rc = 0;
 
-	add_wait_queue_exclusive(sk->sk_sleep, &wait);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	for (;;) {
 		__set_current_state(TASK_INTERRUPTIBLE);
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
@@ -858,7 +858,7 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
 			break;
 	}
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk->sk_sleep, &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 	return rc;
 }
 
-- 
cgit v1.2.3


From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Apr 2010 15:13:20 -0700
Subject: net: sk_add_backlog() take rmem_alloc into account

Current socket backlog limit is not enough to really stop DDOS attacks,
because user thread spend many time to process a full backlog each
round, and user might crazy spin on socket lock.

We should add backlog size and receive_queue size (aka rmem_alloc) to
pace writers, and let user run without being slow down too much.

Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in
stress situations.

Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp
receiver can now process ~200.000 pps (instead of ~100 pps before the
patch) on a 8 core machine.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 13 +++++++++++--
 net/core/sock.c    |  5 ++++-
 net/ipv4/udp.c     |  4 ++++
 net/ipv6/udp.c     |  8 ++++++++
 net/sctp/socket.c  |  3 ---
 5 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 07822280d953..cf12b1e61fa6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -256,7 +256,6 @@ struct sock {
 		struct sk_buff *head;
 		struct sk_buff *tail;
 		int len;
-		int limit;
 	} sk_backlog;
 	wait_queue_head_t	*sk_sleep;
 	struct dst_entry	*sk_dst_cache;
@@ -608,10 +607,20 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 	skb->next = NULL;
 }
 
+/*
+ * Take into account size of receive queue and backlog queue
+ */
+static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
+{
+	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
+
+	return qsize + skb->truesize > sk->sk_rcvbuf;
+}
+
 /* The per-socket spinlock must be held here. */
 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1))
+	if (sk_rcvqueues_full(sk, skb))
 		return -ENOBUFS;
 
 	__sk_add_backlog(sk, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 58ebd146ce5a..51041759517e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -327,6 +327,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 
 	skb->dev = NULL;
 
+	if (sk_rcvqueues_full(sk, skb)) {
+		atomic_inc(&sk->sk_drops);
+		goto discard_and_relse;
+	}
 	if (nested)
 		bh_lock_sock_nested(sk);
 	else
@@ -1885,7 +1889,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_allocation	=	GFP_KERNEL;
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
 	sk->sk_sndbuf		=	sysctl_wmem_default;
-	sk->sk_backlog.limit	=	sk->sk_rcvbuf << 1;
 	sk->sk_state		=	TCP_CLOSE;
 	sk_set_socket(sk, sock);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fa3d2874db41..63eb56b2d873 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1372,6 +1372,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 	}
 
+
+	if (sk_rcvqueues_full(sk, skb))
+		goto drop;
+
 	rc = 0;
 
 	bh_lock_sock(sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2850e35cee3d..3ead20ad9d07 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -584,6 +584,10 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		sk = stack[i];
 		if (skb1) {
+			if (sk_rcvqueues_full(sk, skb)) {
+				kfree_skb(skb1);
+				goto drop;
+			}
 			bh_lock_sock(sk);
 			if (!sock_owned_by_user(sk))
 				udpv6_queue_rcv_skb(sk, skb1);
@@ -759,6 +763,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 
 	/* deliver */
 
+	if (sk_rcvqueues_full(sk, skb)) {
+		sock_put(sk);
+		goto discard;
+	}
 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk))
 		udpv6_queue_rcv_skb(sk, skb);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f34adcca8a8c..13d8229f3a9c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3721,9 +3721,6 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	SCTP_DBG_OBJCNT_INC(sock);
 	percpu_counter_inc(&sctp_sockets_allocated);
 
-	/* Set socket backlog limit. */
-	sk->sk_backlog.limit = sysctl_sctp_rmem[1];
-
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	local_bh_enable();
-- 
cgit v1.2.3


From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 11:01:49 +0000
Subject: net: sock_def_readable() and friends RCU conversion

sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming
packet.

RCU conversion is pretty much needed :

1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).

[Future patch will add a list anchor for wakeup coalescing]

2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().

3) Respect RCU grace period when freeing a "struct socket_wq"

4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"

5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep

6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to :
  - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
  - Use wq_has_sleeper() to eventually wakeup tasks.
  - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) sock_wake_async() is modified to use rcu protection as well.

9) Exceptions :
  macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq"
instead of dynamically allocated ones. They dont need rcu freeing.

Some cleanups or followups are probably needed, (possible
sk_callback_lock conversion to a spinlock for example...).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c | 13 +++++++++----
 drivers/net/tun.c     | 21 ++++++++++++---------
 include/linux/net.h   | 14 +++++++++-----
 include/net/af_unix.h | 20 +++++++++++---------
 include/net/sock.h    | 38 +++++++++++++++++++-------------------
 net/atm/common.c      | 22 ++++++++++++++--------
 net/core/sock.c       | 50 +++++++++++++++++++++++++++++++-------------------
 net/core/stream.c     | 10 +++++++---
 net/dccp/output.c     | 10 ++++++----
 net/iucv/af_iucv.c    | 11 +++++++----
 net/phonet/pep.c      |  8 ++++----
 net/phonet/socket.c   |  2 +-
 net/rxrpc/af_rxrpc.c  | 10 ++++++----
 net/sctp/socket.c     |  2 +-
 net/socket.c          | 47 ++++++++++++++++++++++++++++++++++++-----------
 net/unix/af_unix.c    | 17 ++++++++---------
 16 files changed, 181 insertions(+), 114 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index d97e1fd234ba..1c4110df343e 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -37,6 +37,7 @@
 struct macvtap_queue {
 	struct sock sk;
 	struct socket sock;
+	struct socket_wq wq;
 	struct macvlan_dev *vlan;
 	struct file *file;
 	unsigned int flags;
@@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 
 static void macvtap_sock_write_space(struct sock *sk)
 {
+	wait_queue_head_t *wqueue;
+
 	if (!sock_writeable(sk) ||
 	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
 static int macvtap_open(struct inode *inode, struct file *file)
@@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	if (!q)
 		goto out;
 
-	init_waitqueue_head(&q->sock.wait);
+	q->sock.wq = &q->wq;
+	init_waitqueue_head(&q->wq.wait);
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
@@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
 		goto out;
 
 	mask = 0;
-	poll_wait(file, &q->sock.wait, wait);
+	poll_wait(file, &q->wq.wait, wait);
 
 	if (!skb_queue_empty(&q->sk.sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 20a17938c62b..e525a6cf5587 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -109,7 +109,7 @@ struct tun_struct {
 
 	struct tap_filter       txflt;
 	struct socket		socket;
-
+	struct socket_wq	wq;
 #ifdef TUN_DEBUG
 	int debug;
 #endif
@@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev)
 	/* Inform the methods they need to stop using the dev.
 	 */
 	if (tfile) {
-		wake_up_all(&tun->socket.wait);
+		wake_up_all(&tun->wq.wait);
 		if (atomic_dec_and_test(&tfile->count))
 			__tun_detach(tun);
 	}
@@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
+	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
 				   POLLRDNORM | POLLRDBAND);
 	return NETDEV_TX_OK;
 
@@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 
 	DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-	poll_wait(file, &tun->socket.wait, wait);
+	poll_wait(file, &tun->wq.wait, wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
@@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 
 	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
 
-	add_wait_queue(&tun->socket.wait, &wait);
+	add_wait_queue(&tun->wq.wait, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
@@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&tun->socket.wait, &wait);
+	remove_wait_queue(&tun->wq.wait, &wait);
 
 	return ret;
 }
@@ -861,6 +861,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
 static void tun_sock_write_space(struct sock *sk)
 {
 	struct tun_struct *tun;
+	wait_queue_head_t *wqueue;
 
 	if (!sock_writeable(sk))
 		return;
@@ -868,8 +869,9 @@ static void tun_sock_write_space(struct sock *sk)
 	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 	tun = tun_sk(sk)->tun;
@@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (!sk)
 			goto err_free_dev;
 
-		init_waitqueue_head(&tun->socket.wait);
+		tun->socket.wq = &tun->wq;
+		init_waitqueue_head(&tun->wq.wait);
 		tun->socket.ops = &tun_socket_ops;
 		sock_init_data(&tun->socket, sk);
 		sk->sk_write_space = tun_sock_write_space;
diff --git a/include/linux/net.h b/include/linux/net.h
index 4157b5d42bd6..2b4deeeb8646 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -59,6 +59,7 @@ typedef enum {
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/kmemcheck.h>
+#include <linux/rcupdate.h>
 
 struct poll_table_struct;
 struct pipe_inode_info;
@@ -116,6 +117,12 @@ enum sock_shutdown_cmd {
 	SHUT_RDWR	= 2,
 };
 
+struct socket_wq {
+	wait_queue_head_t	wait;
+	struct fasync_struct	*fasync_list;
+	struct rcu_head		rcu;
+} ____cacheline_aligned_in_smp;
+
 /**
  *  struct socket - general BSD socket
  *  @state: socket state (%SS_CONNECTED, etc)
@@ -135,11 +142,8 @@ struct socket {
 	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
-	/*
-	 * Please keep fasync_list & wait fields in the same cache line
-	 */
-	struct fasync_struct	*fasync_list;
-	wait_queue_head_t	wait;
+
+	struct socket_wq	*wq;
 
 	struct file		*file;
 	struct sock		*sk;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1614d78c60ed..20725e213aee 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -30,7 +30,7 @@ struct unix_skb_parms {
 #endif
 };
 
-#define UNIXCB(skb) 	(*(struct unix_skb_parms*)&((skb)->cb))
+#define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
 #define UNIXCREDS(skb)	(&UNIXCB((skb)).creds)
 #define UNIXSID(skb)	(&UNIXCB((skb)).secid)
 
@@ -45,21 +45,23 @@ struct unix_skb_parms {
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
 	struct sock		sk;
-        struct unix_address     *addr;
-        struct dentry		*dentry;
-        struct vfsmount		*mnt;
+	struct unix_address     *addr;
+	struct dentry		*dentry;
+	struct vfsmount		*mnt;
 	struct mutex		readlock;
-        struct sock		*peer;
-        struct sock		*other;
+	struct sock		*peer;
+	struct sock		*other;
 	struct list_head	link;
-        atomic_long_t           inflight;
-        spinlock_t		lock;
+	atomic_long_t		inflight;
+	spinlock_t		lock;
 	unsigned int		gc_candidate : 1;
 	unsigned int		gc_maybe_cycle : 1;
-        wait_queue_head_t       peer_wait;
+	struct socket_wq	peer_wq;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 
+#define peer_wait peer_wq.wait
+
 #ifdef CONFIG_SYSCTL
 extern int unix_sysctl_register(struct net *net);
 extern void unix_sysctl_unregister(struct net *net);
diff --git a/include/net/sock.h b/include/net/sock.h
index e1777db5b9ab..cc7f91ec972c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -159,7 +159,7 @@ struct sock_common {
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
-  *	@sk_sleep: sock wait queue
+  *	@sk_wq: sock wait queue and async head
   *	@sk_dst_cache: destination cache
   *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
@@ -257,7 +257,7 @@ struct sock {
 		struct sk_buff *tail;
 		int len;
 	} sk_backlog;
-	wait_queue_head_t	*sk_sleep;
+	struct socket_wq	*sk_wq;
 	struct dst_entry	*sk_dst_cache;
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
@@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 
 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
 {
-	return sk->sk_sleep;
+	return &sk->sk_wq->wait;
 }
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
@@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk)
 	write_lock_bh(&sk->sk_callback_lock);
 	sock_set_flag(sk, SOCK_DEAD);
 	sk_set_socket(sk, NULL);
-	sk->sk_sleep  = NULL;
+	sk->sk_wq  = NULL;
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_sleep = &parent->wait;
+	rcu_assign_pointer(sk->sk_wq, parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	security_sock_graft(sk, parent);
@@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk)
 }
 
 /**
- * sk_has_sleeper - check if there are any waiting processes
- * @sk: socket
+ * wq_has_sleeper - check if there are any waiting processes
+ * @sk: struct socket_wq
  *
- * Returns true if socket has waiting processes
+ * Returns true if socket_wq has waiting processes
  *
- * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
+ * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
  * barrier call. They were added due to the race found within the tcp code.
  *
  * Consider following tcp code paths:
@@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk)
  *   ...                 ...
  *   tp->rcv_nxt check   sock_def_readable
  *   ...                 {
- *   schedule               ...
- *                          if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- *                              wake_up_interruptible(sk_sleep(sk))
+ *   schedule               rcu_read_lock();
+ *                          wq = rcu_dereference(sk->sk_wq);
+ *                          if (wq && waitqueue_active(&wq->wait))
+ *                              wake_up_interruptible(&wq->wait)
  *                          ...
  *                       }
  *
@@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk)
  * could then endup calling schedule and sleep forever if there are no more
  * data on the socket.
  *
- * The sk_has_sleeper is always called right after a call to read_lock, so we
- * can use smp_mb__after_lock barrier.
  */
-static inline int sk_has_sleeper(struct sock *sk)
+static inline bool wq_has_sleeper(struct socket_wq *wq)
 {
+
 	/*
 	 * We need to be sure we are in sync with the
 	 * add_wait_queue modifications to the wait queue.
 	 *
 	 * This memory barrier is paired in the sock_poll_wait.
 	 */
-	smp_mb__after_lock();
-	return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
+	smp_mb();
+	return wq && waitqueue_active(&wq->wait);
 }
 
 /**
@@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk)
  * @wait_address:   socket wait queue
  * @p:              poll_table
  *
- * See the comments in the sk_has_sleeper function.
+ * See the comments in the wq_has_sleeper function.
  */
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
@@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp,
 		 * We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-		 * This memory barrier is paired in the sk_has_sleeper.
+		 * This memory barrier is paired in the wq_has_sleeper.
 		*/
 		smp_mb();
 	}
diff --git a/net/atm/common.c b/net/atm/common.c
index e3e10e6f8628..b43feb1a3995 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -90,10 +90,13 @@ static void vcc_sock_destruct(struct sock *sk)
 
 static void vcc_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up(&wq->wait);
+	rcu_read_unlock();
 }
 
 static inline int vcc_writable(struct sock *sk)
@@ -106,16 +109,19 @@ static inline int vcc_writable(struct sock *sk)
 
 static void vcc_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	if (vcc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static struct proto vcc_proto = {
diff --git a/net/core/sock.c b/net/core/sock.c
index 51041759517e..94c4affdda9b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1211,7 +1211,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		 */
 		sk_refcnt_debug_inc(newsk);
 		sk_set_socket(newsk, NULL);
-		newsk->sk_sleep	 = NULL;
+		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
@@ -1800,41 +1800,53 @@ EXPORT_SYMBOL(sock_no_sendpage);
 
 static void sock_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
+	rcu_read_unlock();
 }
 
 static void sock_def_error_report(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLERR);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_poll(&wq->wait, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_readable(struct sock *sk, int len)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN |
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
@@ -1842,7 +1854,7 @@ static void sock_def_write_space(struct sock *sk)
 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1896,10 +1908,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	if (sock) {
 		sk->sk_type	=	sock->type;
-		sk->sk_sleep	=	&sock->wait;
+		sk->sk_wq	=	sock->wq;
 		sock->sk	=	sk;
 	} else
-		sk->sk_sleep	=	NULL;
+		sk->sk_wq	=	NULL;
 
 	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 7b3c3f30b107..cc196f42b8d8 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -28,15 +28,19 @@
 void sk_stream_write_space(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
 
 	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
-		if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-			wake_up_interruptible_poll(sk_sleep(sk), POLLOUT |
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
-		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
 	}
 }
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 2d3dcb39851f..aadbdb58758b 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -195,15 +195,17 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss);
 
 void dccp_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
 
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible(sk_sleep(sk));
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible(&wq->wait);
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /**
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 9636b7d27b48..8be324fe08b9 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -305,11 +305,14 @@ static inline int iucv_below_msglim(struct sock *sk)
  */
 static void iucv_sock_wake_msglim(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* Timers */
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e2a95762abd3..af4d38bc3b22 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -664,12 +664,12 @@ static int pep_wait_connreq(struct sock *sk, int noblock)
 		if (signal_pending(tsk))
 			return sock_intr_errno(timeo);
 
-		prepare_to_wait_exclusive(&sk->sk_socket->wait, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 						TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 
 	return 0;
@@ -910,10 +910,10 @@ disabled:
 			goto out;
 		}
 
-		prepare_to_wait(&sk->sk_socket->wait, &wait,
+		prepare_to_wait(sk_sleep(sk), &wait,
 				TASK_INTERRUPTIBLE);
 		done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 
 		if (sk->sk_state != TCP_ESTABLISHED)
 			goto disabled;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c785bfd0744f..6e9848bf0370 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -265,7 +265,7 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
 	struct pep_sock *pn = pep_sk(sk);
 	unsigned int mask = 0;
 
-	poll_wait(file, &sock->wait, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	switch (sk->sk_state) {
 	case TCP_LISTEN:
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c432d76f415e..0b9bb2085ce4 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -62,13 +62,15 @@ static inline int rxrpc_writable(struct sock *sk)
 static void rxrpc_write_space(struct sock *sk)
 {
 	_enter("%p", sk);
-	read_lock(&sk->sk_callback_lock);
+	rcu_read_lock();
 	if (rxrpc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13d8229f3a9c..d54700af927a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6065,7 +6065,7 @@ static void __sctp_write_space(struct sctp_association *asoc)
 			 * here by modeling from the current TCP/UDP code.
 			 * We have not tested with it yet.
 			 */
-			if (sock->fasync_list &&
+			if (sock->wq->fasync_list &&
 			    !(sk->sk_shutdown & SEND_SHUTDOWN))
 				sock_wake_async(sock,
 						SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/socket.c b/net/socket.c
index cb7c1f6c0d6e..dae8c6b84a09 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -252,9 +252,14 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	init_waitqueue_head(&ei->socket.wait);
+	ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
+	if (!ei->socket.wq) {
+		kmem_cache_free(sock_inode_cachep, ei);
+		return NULL;
+	}
+	init_waitqueue_head(&ei->socket.wq->wait);
+	ei->socket.wq->fasync_list = NULL;
 
-	ei->socket.fasync_list = NULL;
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
 	ei->socket.ops = NULL;
@@ -264,10 +269,21 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+
+static void wq_free_rcu(struct rcu_head *head)
+{
+	struct socket_wq *wq = container_of(head, struct socket_wq, rcu);
+
+	kfree(wq);
+}
+
 static void sock_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(sock_inode_cachep,
-			container_of(inode, struct socket_alloc, vfs_inode));
+	struct socket_alloc *ei;
+
+	ei = container_of(inode, struct socket_alloc, vfs_inode);
+	call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
+	kmem_cache_free(sock_inode_cachep, ei);
 }
 
 static void init_once(void *foo)
@@ -513,7 +529,7 @@ void sock_release(struct socket *sock)
 		module_put(owner);
 	}
 
-	if (sock->fasync_list)
+	if (sock->wq->fasync_list)
 		printk(KERN_ERR "sock_release: fasync list not empty!\n");
 
 	percpu_sub(sockets_in_use, 1);
@@ -1080,9 +1096,9 @@ static int sock_fasync(int fd, struct file *filp, int on)
 
 	lock_sock(sk);
 
-	fasync_helper(fd, filp, on, &sock->fasync_list);
+	fasync_helper(fd, filp, on, &sock->wq->fasync_list);
 
-	if (!sock->fasync_list)
+	if (!sock->wq->fasync_list)
 		sock_reset_flag(sk, SOCK_FASYNC);
 	else
 		sock_set_flag(sk, SOCK_FASYNC);
@@ -1091,12 +1107,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
 	return 0;
 }
 
-/* This function may be called only under socket lock or callback_lock */
+/* This function may be called only under socket lock or callback_lock or rcu_lock */
 
 int sock_wake_async(struct socket *sock, int how, int band)
 {
-	if (!sock || !sock->fasync_list)
+	struct socket_wq *wq;
+
+	if (!sock)
 		return -1;
+	rcu_read_lock();
+	wq = rcu_dereference(sock->wq);
+	if (!wq || !wq->fasync_list) {
+		rcu_read_unlock();
+		return -1;
+	}
 	switch (how) {
 	case SOCK_WAKE_WAITD:
 		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
@@ -1108,11 +1132,12 @@ int sock_wake_async(struct socket *sock, int how, int band)
 		/* fall through */
 	case SOCK_WAKE_IO:
 call_kill:
-		kill_fasync(&sock->fasync_list, SIGIO, band);
+		kill_fasync(&wq->fasync_list, SIGIO, band);
 		break;
 	case SOCK_WAKE_URG:
-		kill_fasync(&sock->fasync_list, SIGURG, band);
+		kill_fasync(&wq->fasync_list, SIGURG, band);
 	}
+	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 87c0360eaa25..fef2cc5e9d2b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk)
 
 static void unix_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 	if (unix_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
@@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion)
 				skpair->sk_err = ECONNRESET;
 			unix_state_unlock(skpair);
 			skpair->sk_state_change(skpair);
-			read_lock(&skpair->sk_callback_lock);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
-			read_unlock(&skpair->sk_callback_lock);
 		}
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
@@ -1142,7 +1143,7 @@ restart:
 	newsk->sk_peercred.pid	= task_tgid_vnr(current);
 	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
 	newu = unix_sk(newsk);
-	newsk->sk_sleep		= &newu->peer_wait;
+	newsk->sk_wq		= &newu->peer_wq;
 	otheru = unix_sk(other);
 
 	/* copy address information from listening to new sock*/
@@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			other->sk_shutdown |= peer_mode;
 			unix_state_unlock(other);
 			other->sk_state_change(other);
-			read_lock(&other->sk_callback_lock);
 			if (peer_mode == SHUTDOWN_MASK)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
 			else if (peer_mode & RCV_SHUTDOWN)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
-			read_unlock(&other->sk_callback_lock);
 		}
 		if (other)
 			sock_put(other);
-- 
cgit v1.2.3


From a465419b1febb603821f924805529cff89cafeed Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 16 May 2010 00:36:33 -0700
Subject: net: Introduce sk_route_nocaps

TCP-MD5 sessions have intermittent failures, when route cache is
invalidated. ip_queue_xmit() has to find a new route, calls
sk_setup_caps(sk, &rt->u.dst), destroying the

sk->sk_route_caps &= ~NETIF_F_GSO_MASK

that MD5 desperately try to make all over its way (from
tcp_transmit_skb() for example)

So we send few bad packets, and everything is fine when
tcp_transmit_skb() is called again for this socket.

Since ip_queue_xmit() is at a lower level than TCP-MD5, I chose to use a
socket field, sk_route_nocaps, containing bits to mask on sk_route_caps.

Reported-by: Bhaskar Dutta <bhaskie@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    | 8 ++++++++
 net/core/sock.c       | 1 +
 net/ipv4/tcp_ipv4.c   | 6 +++---
 net/ipv4/tcp_output.c | 2 +-
 net/ipv6/tcp_ipv6.c   | 4 ++--
 5 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 328e03f47dd1..aed16eb9db4b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -177,6 +177,7 @@ struct sock_common {
   *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
   *	@sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
   *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
+  *	@sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
   *	@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
   *	@sk_gso_max_size: Maximum GSO segment size to build
   *	@sk_lingertime: %SO_LINGER l_linger setting
@@ -276,6 +277,7 @@ struct sock {
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
 	int			sk_route_caps;
+	int			sk_route_nocaps;
 	int			sk_gso_type;
 	unsigned int		sk_gso_max_size;
 	int			sk_rcvlowat;
@@ -1335,6 +1337,12 @@ static inline int sk_can_gso(const struct sock *sk)
 
 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
 
+static inline void sk_nocaps_add(struct sock *sk, int flags)
+{
+	sk->sk_route_nocaps |= flags;
+	sk->sk_route_caps &= ~flags;
+}
+
 static inline int skb_copy_to_page(struct sock *sk, char __user *from,
 				   struct sk_buff *skb, struct page *page,
 				   int off, int copy)
diff --git a/net/core/sock.c b/net/core/sock.c
index 94c4affdda9b..63530a03b8c2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1231,6 +1231,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+	sk->sk_route_caps &= ~sk->sk_route_nocaps;
 	if (sk_can_gso(sk)) {
 		if (dst->header_len) {
 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 771f8146a2e5..202cf09c4cd4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -891,7 +891,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 				kfree(newkey);
 				return -ENOMEM;
 			}
-			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		}
 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
 			kfree(newkey);
@@ -1021,7 +1021,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 			return -EINVAL;
 
 		tp->md5sig_info = p;
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 	}
 
 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
@@ -1462,7 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		if (newkey != NULL)
 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
 					  newkey, key->keylen);
-		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
 	}
 #endif
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5db3a2c6cb33..18a3302480cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -873,7 +873,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		tp->af_specific->calc_md5_hash(opts.hash_location,
 					       md5, sk, NULL, skb);
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6603511e3673..2b7c3a100e2c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -604,7 +604,7 @@ static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer,
 				kfree(newkey);
 				return -ENOMEM;
 			}
-			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		}
 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
 			kfree(newkey);
@@ -741,7 +741,7 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
 			return -ENOMEM;
 
 		tp->md5sig_info = p;
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 	}
 
 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
-- 
cgit v1.2.3


From 7fee226ad2397b635e2fd565a59ca3ae08a164cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 11 May 2010 23:19:48 +0000
Subject: net: add a noref bit on skb dst

Use low order bit of skb->_skb_dst to tell dst is not refcounted.

Change _skb_dst to _skb_refdst to make sure all uses are catched.

skb_dst() returns the dst, regardless of noref bit set or not, but
with a lockdep check to make sure a noref dst is not given if current
user is not rcu protected.

New skb_dst_set_noref() helper to set an notrefcounted dst on a skb.
(with lockdep check)

skb_dst_drop() drops a reference only if skb dst was refcounted.

skb_dst_force() helper is used to force a refcount on dst, when skb
is queued and not anymore RCU protected.

Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if
!IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in
sock_queue_rcv_skb(), in __nf_queue().

Use skb_dst_force() in dev_requeue_skb().

Note: dst_use_noref() still dirties dst, we might transform it
later to do one dirtying per jiffies.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h   | 58 ++++++++++++++++++++++++++++++++++++++++++++----
 include/net/dst.h        | 48 ++++++++++++++++++++++++++++++++++++---
 include/net/sock.h       | 13 ++++++-----
 net/core/dev.c           |  3 +++
 net/core/skbuff.c        |  2 +-
 net/core/sock.c          |  6 +++++
 net/ipv4/icmp.c          |  6 ++---
 net/ipv4/ip_options.c    |  9 ++++----
 net/ipv4/netfilter.c     |  6 ++---
 net/ipv4/route.c         |  2 +-
 net/netfilter/nf_queue.c |  2 ++
 net/sched/sch_generic.c  |  4 +++-
 12 files changed, 134 insertions(+), 25 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9525bce80f6..7cdfb4d52847 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -264,7 +264,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@transport_header: Transport layer header
  *	@network_header: Network layer header
  *	@mac_header: Link layer header
- *	@_skb_dst: destination entry
+ *	@_skb_refdst: destination entry (with norefcount bit)
  *	@sp: the security path, used for xfrm
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@len: Length of actual data
@@ -328,7 +328,7 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_dst;
+	unsigned long		_skb_refdst;
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
@@ -419,14 +419,64 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+/*
+ * skb might have a dst pointer attached, refcounted or not.
+ * _skb_refdst low order bit is set if refcount was _not_ taken
+ */
+#define SKB_DST_NOREF	1UL
+#define SKB_DST_PTRMASK	~(SKB_DST_NOREF)
+
+/**
+ * skb_dst - returns skb dst_entry
+ * @skb: buffer
+ *
+ * Returns skb dst_entry, regardless of reference taken or not.
+ */
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
-	return (struct dst_entry *)skb->_skb_dst;
+	/* If refdst was not refcounted, check we still are in a 
+	 * rcu_read_lock section
+	 */
+	WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
+		!rcu_read_lock_held() &&
+		!rcu_read_lock_bh_held());
+	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
 }
 
+/**
+ * skb_dst_set - sets skb dst
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was taken on dst and should
+ * be released by skb_dst_drop()
+ */
 static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 {
-	skb->_skb_dst = (unsigned long)dst;
+	skb->_skb_refdst = (unsigned long)dst;
+}
+
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+}
+
+/**
+ * skb_dst_is_noref - Test if skb dst isnt refcounted
+ * @skb: buffer
+ */
+static inline bool skb_dst_is_noref(const struct sk_buff *skb)
+{
+	return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
 }
 
 static inline struct rtable *skb_rtable(const struct sk_buff *skb)
diff --git a/include/net/dst.h b/include/net/dst.h
index aac5a5fcfda9..27207a13f2a6 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time)
 	dst->lastuse = time;
 }
 
+static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
+{
+	dst->__use++;
+	dst->lastuse = time;
+}
+
 static inline
 struct dst_entry * dst_clone(struct dst_entry * dst)
 {
@@ -177,11 +183,47 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
 }
 
 extern void dst_release(struct dst_entry *dst);
+
+static inline void refdst_drop(unsigned long refdst)
+{
+	if (!(refdst & SKB_DST_NOREF))
+		dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
+}
+
+/**
+ * skb_dst_drop - drops skb dst
+ * @skb: buffer
+ *
+ * Drops dst reference count if a reference was taken.
+ */
 static inline void skb_dst_drop(struct sk_buff *skb)
 {
-	if (skb->_skb_dst)
-		dst_release(skb_dst(skb));
-	skb->_skb_dst = 0UL;
+	if (skb->_skb_refdst) {
+		refdst_drop(skb->_skb_refdst);
+		skb->_skb_refdst = 0UL;
+	}
+}
+
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+	nskb->_skb_refdst = oskb->_skb_refdst;
+	if (!(nskb->_skb_refdst & SKB_DST_NOREF))
+		dst_clone(skb_dst(nskb));
+}
+
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		WARN_ON(!rcu_read_lock_held());
+		skb->_skb_refdst &= ~SKB_DST_NOREF;
+		dst_clone(skb_dst(skb));
+	}
 }
 
 /* Children define the path of the packet through the
diff --git a/include/net/sock.h b/include/net/sock.h
index aed16eb9db4b..5697caf8cc76 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -600,12 +600,15 @@ static inline int sk_stream_memory_free(struct sock *sk)
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (!sk->sk_backlog.tail) {
-		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
-	} else {
+	/* dont let skb dst not refcounted, we are going to leave rcu lock */
+	skb_dst_force(skb);
+
+	if (!sk->sk_backlog.tail)
+		sk->sk_backlog.head = skb;
+	else
 		sk->sk_backlog.tail->next = skb;
-		sk->sk_backlog.tail = skb;
-	}
+
+	sk->sk_backlog.tail = skb;
 	skb->next = NULL;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index cdcb9cbedf41..6c820650b80f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2052,6 +2052,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
+		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
 			__qdisc_run(q);
@@ -2060,6 +2062,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
+		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
 		qdisc_run(q);
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a9b0e1f77806..c543dd252433 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->transport_header	= old->transport_header;
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
-	skb_dst_set(new, dst_clone(skb_dst(old)));
+	skb_dst_copy(new, old);
 	new->rxhash		= old->rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
diff --git a/net/core/sock.c b/net/core/sock.c
index 63530a03b8c2..bf88a167c8f2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
+	/* we escape from rcu protected region, make sure we dont leak
+	 * a norefcounted dst
+	 */
+	skb_dst_force(skb);
+
 	spin_lock_irqsave(&list->lock, flags);
 	skb->dropcount = atomic_read(&sk->sk_drops);
 	__skb_queue_tail(list, skb);
@@ -1536,6 +1541,7 @@ static void __release_sock(struct sock *sk)
 		do {
 			struct sk_buff *next = skb->next;
 
+			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f3d339f728b0..d65e9215bcd7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			err = __ip_route_output_key(net, &rt2, &fl);
 		else {
 			struct flowi fl2 = {};
-			struct dst_entry *odst;
+			unsigned long orefdst;
 
 			fl2.fl4_dst = fl.fl4_src;
 			if (ip_route_output_key(net, &rt2, &fl2))
 				goto relookup_failed;
 
 			/* Ugh! */
-			odst = skb_dst(skb_in);
+			orefdst = skb_in->_skb_refdst; /* save old refdst */
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
 					     RT_TOS(tos), rt2->u.dst.dev);
 
 			dst_release(&rt2->u.dst);
 			rt2 = skb_rtable(skb_in);
-			skb_dst_set(skb_in, odst);
+			skb_in->_skb_refdst = orefdst; /* restore old refdst */
 		}
 
 		if (err)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4c09a31fd140..3244133c24f6 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	unsigned char *optptr = skb_network_header(skb) + opt->srr;
 	struct rtable *rt = skb_rtable(skb);
 	struct rtable *rt2;
+	unsigned long orefdst;
 	int err;
 
 	if (!opt->srr)
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 		}
 		memcpy(&nexthop, &optptr[srrptr-1], 4);
 
-		rt = skb_rtable(skb);
+		orefdst = skb->_skb_refdst;
 		skb_dst_set(skb, NULL);
 		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
 		rt2 = skb_rtable(skb);
 		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
-			ip_rt_put(rt2);
-			skb_dst_set(skb, &rt->u.dst);
+			skb_dst_drop(skb);
+			skb->_skb_refdst = orefdst;
 			return -EINVAL;
 		}
-		ip_rt_put(rt);
+		refdst_drop(orefdst);
 		if (rt2->rt_type != RTN_LOCAL)
 			break;
 		/* Superfast 8) loopback forward */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 82fb43c5c59e..07de855e2175 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi fl = {};
-	struct dst_entry *odst;
+	unsigned long orefdst;
 	unsigned int hh_len;
 	unsigned int type;
 
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
-		odst = skb_dst(skb);
+		orefdst = skb->_skb_refdst;
 		if (ip_route_input(skb, iph->daddr, iph->saddr,
 				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
 			dst_release(&rt->u.dst);
 			return -1;
 		}
 		dst_release(&rt->u.dst);
-		dst_release(odst);
+		refdst_drop(orefdst);
 	}
 
 	if (skb_dst(skb)->error)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dea3f9264250..705eccfb4769 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3033,7 +3033,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set_noref(skb, &rt->u.dst);
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 0b1103c0b1f3..78b3cf9c519c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <net/protocol.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
 
 #include "nf_internals.h"
 
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb,
 			dev_hold(physoutdev);
 	}
 #endif
+	skb_dst_force(skb);
 	afinfo->saveroute(skb, entry);
 	status = qh->outfn(entry, queuenum);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a969b111bd76..a63029ef3edd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <net/pkt_sched.h>
+#include <net/dst.h>
 
 /* Main transmission queue. */
 
@@ -40,6 +41,7 @@
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
+	skb_dst_force(skb);
 	q->gso_skb = skb;
 	q->qstats.requeues++;
 	q->q.qlen++;	/* it's still part of the queue */
@@ -179,7 +181,7 @@ static inline int qdisc_restart(struct Qdisc *q)
 	skb = dequeue_skb(q);
 	if (unlikely(!skb))
 		return 0;
-
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-- 
cgit v1.2.3