summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-03-22 15:12:57 -0400
committerDavid S. Miller <davem@davemloft.net>2018-03-22 15:12:57 -0400
commit1a2e10a24039147047cc21759ec57314c333c1ac (patch)
tree0b09f0de0ff410fef91ad0224a241b322bd0eb05
parentf2d254fac13cc7c86871ea607c4ab1afa7f13e2e (diff)
parentd9ff3049739e349b5380b96226f9ad766741773d (diff)
downloadlinux-1a2e10a24039147047cc21759ec57314c333c1ac.tar.bz2
Merge branch 'Rework-ip_ra_chain-protection'
Kirill Tkhai says: ==================== Rework ip_ra_chain protection Commit 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control" made rtnl_lock() be used in raw_close(). This function is called on every RAW socket destruction, so that rtnl_mutex is taken every time. This scales very sadly. I observe cleanup_net() spending a lot of time in rtnl_lock() and raw_close() is one of the biggest rtnl user (since we have percpu net->ipv4.icmp_sk). This patchset reworks the locking: reverts the problem commit and its descendant, and introduces rtnl-independent locking. This may have a continuation, and someone may work on killing rtnl_lock() in mrtsock_destruct() in the future. v3: Change patches order: [2/5] and [3/5]. v2: Fix sparse warning [4/5], as reported by kbuild test robot. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/ip.h13
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--net/core/net_namespace.c1
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_sockglue.c34
-rw-r--r--net/ipv4/ipmr.c11
-rw-r--r--net/ipv4/raw.c2
7 files changed, 38 insertions, 30 deletions
diff --git a/include/net/ip.h b/include/net/ip.h
index fe63ba95d12b..d53b5a9eae34 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
return 0;
}
+/* Special input handler for packets caught by router alert option.
+ They are selected only by protocol field, and then processed likely
+ local ones; but only if someone wants them! Otherwise, router
+ not running rsvpd will kill RSVP.
+
+ It is user level problem, what it will make with them.
+ I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+ but receiver should be enough clever f.e. to forward mtrace requests,
+ sent to multicast group to reach destination designated router.
+ */
+
struct ip_ra_chain {
struct ip_ra_chain __rcu *next;
struct sock *sk;
@@ -101,8 +112,6 @@ struct ip_ra_chain {
struct rcu_head rcu;
};
-extern struct ip_ra_chain __rcu *ip_ra_chain;
-
/* IP flags. */
#define IP_CE 0x8000 /* Flag: "Congestion" */
#define IP_DF 0x4000 /* Flag: "Don't Fragment" */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 382bfd7583cf..8491bc9c86b1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,6 +49,8 @@ struct netns_ipv4 {
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
+ struct ip_ra_chain __rcu *ra_chain;
+ struct mutex ra_mutex;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c340d5cfbdec..95ba2c53bd9a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->user_ns = user_ns;
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 57fc13c6ab2b..7582713dd18f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+ for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
*/
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
- sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), net)) {
+ sk->sk_bound_dev_if == dev->ifindex)) {
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 74c962b9b09c..5ad2d8ed3a3f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
return 0;
}
-
-/* Special input handler for packets caught by router alert option.
- They are selected only by protocol field, and then processed likely
- local ones; but only if someone wants them! Otherwise, router
- not running rsvpd will kill RSVP.
-
- It is user level problem, what it will make with them.
- I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
- but receiver should be enough clever f.e. to forward mtrace requests,
- sent to multicast group to reach destination designated router.
- */
-struct ip_ra_chain __rcu *ip_ra_chain;
-
-
static void ip_ra_destroy_rcu(struct rcu_head *head)
{
struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
{
struct ip_ra_chain *ra, *new_ra;
struct ip_ra_chain __rcu **rap;
+ struct net *net = sock_net(sk);
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
- for (rap = &ip_ra_chain;
- (ra = rtnl_dereference(*rap)) != NULL;
+ mutex_lock(&net->ipv4.ra_mutex);
+ for (rap = &net->ipv4.ra_chain;
+ (ra = rcu_dereference_protected(*rap,
+ lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
+ mutex_unlock(&net->ipv4.ra_mutex);
kfree(new_ra);
return -EADDRINUSE;
}
/* dont let ip_call_ra_chain() use sk again */
ra->sk = NULL;
RCU_INIT_POINTER(*rap, ra->next);
+ mutex_unlock(&net->ipv4.ra_mutex);
if (ra->destructor)
ra->destructor(sk);
@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0;
}
}
- if (!new_ra)
+ if (!new_ra) {
+ mutex_unlock(&net->ipv4.ra_mutex);
return -ENOBUFS;
+ }
new_ra->sk = sk;
new_ra->destructor = destructor;
RCU_INIT_POINTER(new_ra->next, ra);
rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
+ mutex_unlock(&net->ipv4.ra_mutex);
return 0;
}
@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_UNBLOCK_SOURCE:
- case IP_ROUTER_ALERT:
return true;
}
return false;
@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* If optlen==0, it is equivalent to val == 0 */
+ if (optname == IP_ROUTER_ALERT)
+ return ip_ra_control(sk, val ? 1 : 0, NULL);
if (ip_mroute_opt(optname))
return ip_mroute_setsockopt(sk, optname, optval, optlen);
@@ -1149,9 +1144,6 @@ mc_msf_out:
goto e_inval;
inet->mc_all = val;
break;
- case IP_ROUTER_ALERT:
- err = ip_ra_control(sk, val ? 1 : 0, NULL);
- break;
case IP_FREEBIND:
if (optlen < 1)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d752a70855d8..f6be5db16da2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
struct net *net = sock_net(sk);
struct mr_table *mrt;
- ASSERT_RTNL();
+ rtnl_lock();
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
mroute_clean_tables(mrt, false);
}
}
+ rtnl_unlock();
}
/* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (sk != rcu_access_pointer(mrt->mroute_sk)) {
ret = -EACCES;
} else {
+ /* We need to unlock here because mrtsock_destruct takes
+ * care of rtnl itself and we can't change that due to
+ * the IP_ROUTER_ALERT setsockopt which runs without it.
+ */
+ rtnl_unlock();
ret = ip_ra_control(sk, 0, NULL);
- goto out_unlock;
+ goto out;
}
break;
case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
}
out_unlock:
rtnl_unlock();
+out:
return ret;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 54648d20bf0f..720bef7da2f6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
/*
* Raw sockets may have direct kernel references. Kill them.
*/
- rtnl_lock();
ip_ra_control(sk, 0, NULL);
- rtnl_unlock();
sk_common_release(sk);
}