From f828ad0ce22474820255c6faeb88b05724a8c1eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Apr 2015 16:20:58 -0700 Subject: tcp_westwood: fix tcp_westwood_info() I forgot to update tcp_westwood when changing get_info() behavior, this patch should fix this. Fixes: 64f40ff5bbdb ("tcp: prepare CC get_info() access from getsockopt()") Reported-by: kbuild test robot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b3c57cceb990..c10732e39837 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,18 +256,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rtt = jiffies_to_usecs(ca->rtt), - .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = 0; + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } -- cgit v1.2.3 From 355b590ca24a53f5e2f6999218b5321eaf730a7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2015 10:37:49 -0700 Subject: ipv4: speedup ip_idents_reserve() Under stress, ip_idents_reserve() is accessing a contended cache line twice, with non optimal MESI transactions. If we place timestamps in separate location, we reduce this pressure by ~50% and allow atomic_add_return() to issue a Request for Ownership. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a78540f28276..81f513d6f27f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -457,12 +457,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, } #define IP_IDENTS_SZ 2048u -struct ip_ident_bucket { - atomic_t id; - u32 stamp32; -}; -static struct ip_ident_bucket *ip_idents __read_mostly; +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker @@ -470,15 +467,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(bucket->stamp32); + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 delta = 0; - if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, &bucket->id) - segs; + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -2741,6 +2739,10 @@ int __init ip_rt_init(void) prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) + panic("IP: failed to allocate ip_tstamps\n"); + for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); -- cgit v1.2.3 From 7eee8cd4d8d6ce3f70e6bea716c19b28635a5232 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 30 Apr 2015 17:25:12 +0800 Subject: ipv4: remove the unnecessary codes in fib_info_hash_move The whole hlist will be moved, so not need to call hlist_del before add the hlist_node to other hlist_head. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8d695b6659c7..28ec3c1823bf 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -713,8 +713,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -731,8 +729,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); -- cgit v1.2.3 From 196da974758550a3933c8b0244ef98148df10552 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:57 +0200 Subject: tcp: move struct tcp_sacktag_state to tcp_ack() Later patch passes two values set in tcp_sacktag_one() to tcp_clean_rtx_queue(). Prepare passing them via struct tcp_sacktag_state. Acked-by: Yuchung Cheng Cc: Eric Dumazet Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bc790ea9960f..9902cf1abdf4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1634,7 +1634,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, long *sack_rtt_us) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1642,7 +1642,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; - struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; @@ -1650,9 +1649,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int i, j; int first_sack_index; - state.flag = 0; - state.reord = tp->packets_out; - state.rtt_us = -1L; + state->flag = 0; + state->reord = tp->packets_out; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1663,7 +1661,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - state.flag |= FLAG_DSACKING_ACK; + state->flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1728,7 +1726,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - state.fack_count = 0; + state->fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1762,10 +1760,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, &state, + skb = tcp_sacktag_skip(skb, sk, state, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, - &state, + state, start_seq, cache->start_seq, dup_sack); @@ -1776,7 +1774,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - &state, + state, cache->end_seq); /* ...tail remains todo... */ @@ -1785,12 +1783,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1800,12 +1798,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + skb = tcp_sacktag_skip(skb, sk, state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: @@ -1820,9 +1818,9 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state.reord < tp->fackets_out) && + if ((state->reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); @@ -1834,8 +1832,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt_us = state.rtt_us; - return state.flag; + return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than @@ -3459,6 +3456,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sacktag_state sack_state; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3467,7 +3465,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - long sack_rtt_us = -1L; + + sack_state.rtt_us = -1L; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3531,7 +3530,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; @@ -3556,7 +3555,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_rtt_us); + sack_state.rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ @@ -3608,7 +3607,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } -- cgit v1.2.3 From 31231a8a873026410eab438c5757430546a517d1 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:58 +0200 Subject: tcp: improve RTT from SACK for CC tcp_sacktag_one() always picks the earliest sequence SACKed for RTT. This might not make sense for congestion control in cases where: 1. ACKs are lost, i.e. a SACK following a lost SACK covers both new and old segments at the receiver. 2. The receiver disregards the RFC 5681 recommendation to immediately ACK out-of-order segments. Give congestion control a RTT for the latest segment SACKed, which is the most accurate RTT estimate, but preserve the conservative RTT for RTO. Removes the call to skb_mstamp_get() in tcp_sacktag_one(). Cc: Yuchung Cheng Cc: Eric Dumazet Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9902cf1abdf4..f563d2a88809 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sacktag_state { int reord; int fack_count; - long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + struct skb_mstamp first_sackt; + struct skb_mstamp last_sackt; int flag; }; @@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - /* Pick the earliest sequence sacked for RTT */ - if (state->rtt_us < 0) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - state->rtt_us = skb_mstamp_us_delta(&now, - xmit_time); - } + if (state->first_sackt.v64 == 0) + state->first_sackt = *xmit_time; + state->last_sackt = *xmit_time; } if (sacked & TCPCB_LOST) { @@ -3049,7 +3049,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, long sack_rtt_us) + u32 prior_snd_una, + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt, now; @@ -3057,8 +3058,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; - long ca_seq_rtt_us = -1L; + long sack_rtt_us = -1L; long seq_rtt_us = -1L; + long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; bool rtt_update; @@ -3147,7 +3149,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, skb_mstamp_get(&now); if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + if (sack->first_sackt.v64) { + sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); @@ -3178,10 +3184,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { @@ -3466,7 +3470,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - sack_state.rtt_us = -1L; + sack_state.first_sackt.v64 = 0; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3555,7 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_state.rtt_us); + &sack_state); acked -= tp->packets_out; /* Advance cwnd if state allows */ -- cgit v1.2.3 From 138998fdd12e7362756e158d00856a2aabd5f0c1 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 1 May 2015 01:10:59 +0200 Subject: tcp: invoke pkts_acked hook on every ACK Invoking pkts_acked is currently conditioned on FLAG_ACKED: receiving a cumulative ACK of new data, or ACK with SYN flag set. Remove this condition so that CC may get RTT measurements from all SACKs. Cc: Yuchung Cheng Cc: Eric Dumazet Cc: Neal Cardwell Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f563d2a88809..09bdc4abfcbb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3159,9 +3159,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { - const struct tcp_congestion_ops *ca_ops - = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { @@ -3184,9 +3181,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) - ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3196,6 +3190,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); -- cgit v1.2.3 From 6a21165480a066a27c1f1dbd32aec581c612ba23 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 1 May 2015 16:39:54 +0200 Subject: net: ipv4: route: Fix sending IGMP messages with link address In setups with a global scope address on an interface, and a lesser scope address on an interface sending IGMP reports, the reports can be sent using the other interfaces global scope address rather than the local interface address. RFC 2236 suggests: Ignore the Report if you cannot identify the source address of the packet as belonging to a subnet assigned to the interface on which the packet was received. since such reports could be forged. Look at the protocol when deciding if a RT_SCOPE_LINK address should be used for the packet. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7cad0bf1c71c..9e15f5ca4495 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2091,7 +2091,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto out; } if (ipv4_is_local_multicast(fl4->daddr) || - ipv4_is_lbcast(fl4->daddr)) { + ipv4_is_lbcast(fl4->daddr) || + fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); -- cgit v1.2.3 From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 2 May 2015 14:01:07 +0200 Subject: net: Export IGMP/MLD message validation code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch, the IGMP and MLD message validation functions are moved from the bridge code to IPv4/IPv6 multicast files. Some small refactoring was done to enhance readibility and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did). Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 + include/linux/skbuff.h | 3 + include/net/addrconf.h | 1 + net/bridge/br_multicast.c | 218 +++++++--------------------------------------- net/core/skbuff.c | 87 ++++++++++++++++++ net/ipv4/igmp.c | 162 ++++++++++++++++++++++++++++++++++ net/ipv6/Makefile | 1 + net/ipv6/mcast_snoop.c | 213 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 498 insertions(+), 188 deletions(-) create mode 100644 net/ipv6/mcast_snoop.c (limited to 'net/ipv4') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 2c677afeea47..193ad488d3e2 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed); #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acb83e249e3f..9c2f793573fa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 80456f72d70a..def59d3a34d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); void addrconf_dad_failure(struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b52f4cb8aee9..2d69d5cab52f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. */ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - if (skb_checksum_simple_validate(skb2)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - __skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } @@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; + err = ipv6_mc_check_mld(skb, &skb_trimmed); - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); - - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; + } else if (err < 0) { + return err; } - ip6h = ipv6_hdr(skb2); - - if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cfff2a3d651..1e4278a4dd7e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) { + kfree_skb(skb); + return NULL; + } else if (skb->len == len) { + return skb; + } + + skb_chk = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + int ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + return NULL; + + if (!pskb_may_pull(skb_chk, offset)) { + kfree_skb(skb_chk); + return NULL; + } + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a3a697f5ffba..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1339,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? */ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2e8c06108ab9..0f3f1999719a 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000000..1a2cbc13a7d3 --- /dev/null +++ b/net/ipv6/mcast_snoop.c @@ -0,0 +1,213 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki + * Copyright (C) 2015: Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include +#include +#include +#include +#include + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + transport_len = ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); -- cgit v1.2.3 From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 May 2015 21:34:46 -0700 Subject: tcp: provide SYN headers for passive connections This patch allows a server application to get the TCP SYN headers for its passive connections. This is useful if the server is doing fingerprinting of clients based on SYN packet contents. Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN. The first is used on a socket to enable saving the SYN headers for child connections. This can be set before or after the listen() call. The latter is used to retrieve the SYN headers for passive connections, if the parent listener has enabled TCP_SAVE_SYN. TCP_SAVED_SYN is read once, it frees the saved SYN headers. The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP headers. Original patch was written by Tom Herbert, I changed it to not hold a full skb (and associated dst and conntracking reference). We have used such patch for about 3 years at Google. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 8 ++++++++ include/net/request_sock.h | 4 +++- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 18 ++++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 3 +++ 7 files changed, 70 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3b2911502a8c..e6fb5df22db1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -199,6 +199,7 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ @@ -326,6 +327,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock *fastopen_rsk; + u32 *saved_syn; }; enum tsq_flags { @@ -393,4 +395,10 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog) return 0; } +static inline void tcp_saved_syn_free(struct tcp_sock *tp) +{ + kfree(tp->saved_syn); + tp->saved_syn = NULL; +} + #endif /* _LINUX_TCP_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 9f4265ce8892..87935cad2f7b 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -64,6 +64,7 @@ struct request_sock { struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; + u32 *saved_syn; u32 secid; u32 peer_secid; }; @@ -77,7 +78,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) req->rsk_ops = ops; sock_hold(sk_listener); req->rsk_listener = sk_listener; - + req->saved_syn = NULL; /* Following is temporary. It is coupled with debugging * helpers in reqsk_put() & reqsk_free() */ @@ -104,6 +105,7 @@ static inline void reqsk_free(struct request_sock *req) req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); + kfree(req->saved_syn); kmem_cache_free(req->rsk_ops->slab, req); } diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index faa72f4fa547..51ebedba577f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -113,6 +113,8 @@ enum { #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46efa03d2b11..ecccfdc50d76 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2482,6 +2482,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2818,6 +2825,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + len = min_t(unsigned int, tp->saved_syn[0], len); + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09bdc4abfcbb..df2ca615cd0c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6060,6 +6060,23 @@ static bool tcp_syn_flood_action(struct sock *sk, return want_cookie; } +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(©[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -6192,6 +6209,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } + tcp_reqsk_record_syn(sk, req, skb); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc1c658ec6c1..91cb4768a860 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1802,6 +1802,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e5d7649136fc..ebe2ab2596ed 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -536,6 +536,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->saved_syn = req->saved_syn; + req->saved_syn = NULL; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; -- cgit v1.2.3 From d181ddca89cf8680db6c767ae38e3b66224d1657 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:59 -0700 Subject: ipv4/ip_tunnel_core: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce63ab21b6cd..3998b1822d85 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -98,7 +98,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return -ENOMEM; eh = (struct ethhdr *)skb->data; - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); -- cgit v1.2.3 From 21c8fe9915276d923f8c1e43434fd6d37a3b9aef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2015 14:26:24 -0700 Subject: tcp: adjust window probe timers to safer values With the advent of small rto timers in datacenter TCP, (ip route ... rto_min x), the following can happen : 1) Qdisc is full, transmit fails. TCP sets a timer based on icsk_rto to retry the transmit, without exponential backoff. With low icsk_rto, and lot of sockets, all cpus are servicing timer interrupts like crazy. Intent of the code was to retry with a timer between 200 (TCP_RTO_MIN) and 500ms (TCP_RESOURCE_PROBE_INTERVAL) 2) Receivers can send zero windows if they don't drain their receive queue. TCP sends zero window probes, based on icsk_rto current value, with exponential backoff. With /proc/sys/net/ipv4/tcp_retries2 being 15 (or even smaller in some cases), sender can abort in less than one or two minutes ! If receiver stops the sender, it obviously doesn't care of very tight rto. Probability of dropping the ACK reopening the window is not worth the risk. Lets change the base timer to be at least 200ms (TCP_RTO_MIN) for these events (but not normal RTO based retransmits) A followup patch adds a new SNMP counter, as it would have helped a lot diagnosing this issue. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 27 ++++++++++++++++++++++----- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d204f3f9df8..7a2248a35b13 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1043,14 +1043,31 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) return tp->is_cwnd_limited; } -static inline void tcp_check_probe_timer(struct sock *sk) +/* Something is really bad, we could not queue an additional packet, + * because qdisc is full or receiver sent a 0 window. + * We do not want to add fuel to the fire, or abort too early, + * so make sure the timer we arm now is at least 200ms in the future, + * regardless of current icsk_rto value (as it could be ~2ms) + */ +static inline unsigned long tcp_probe0_base(const struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); + return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); +} - if (!tp->packets_out && !icsk->icsk_pending) +/* Variant of inet_csk_rto_backoff() used for zero window probes */ +static inline unsigned long tcp_probe0_when(const struct sock *sk, + unsigned long max_when) +{ + u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; + + return (unsigned long)min_t(u64, when, max_when); +} + +static inline void tcp_check_probe_timer(struct sock *sk) +{ + if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - icsk->icsk_rto, TCP_RTO_MAX); + tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df2ca615cd0c..cf8b20ff6658 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3233,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a369e8a70b2c..b76c719e1979 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3490,7 +3490,7 @@ void tcp_send_probe0(struct sock *sk) probe_max = TCP_RESOURCE_PROBE_INTERVAL; } inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - inet_csk_rto_backoff(icsk, probe_max), + tcp_probe0_when(sk, probe_max), TCP_RTO_MAX); } -- cgit v1.2.3 From e520af48c7e5acae5f17f82a79ba7ab7cf156f3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2015 14:26:25 -0700 Subject: tcp: add TCPWinProbe and TCPKeepAlive SNMP counters Diagnosing problems related to Window Probes has been hard because we lack a counter. TCPWinProbe counts the number of ACK packets a sender has to send at regular intervals to make sure a reverse ACK packet opening back a window had not been lost. TCPKeepAlive counts the number of ACK packets sent to keep TCP flows alive (SO_KEEPALIVE) Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- include/uapi/linux/snmp.h | 2 ++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp_output.c | 13 +++++++------ net/ipv4/tcp_timer.c | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a2248a35b13..b8ea12880fd9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -527,7 +527,7 @@ int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); -int tcp_write_wakeup(struct sock *); +int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 6a6fb747c78d..eee8968407f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */ LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */ LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ + LINUX_MIB_TCPWINPROBE, /* TCPWinProbe */ + LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e1f3b911dd1e..da5d483e236a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), + SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), + SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b76c719e1979..7386d32cd670 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3382,7 +3382,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack); * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3400,6 +3400,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS_BH(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3407,12 +3408,12 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ -int tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3449,8 +3450,8 @@ int tcp_write_wakeup(struct sock *sk) return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) - tcp_xmit_probe_skb(sk, 1); - return tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 1, mib); + return tcp_xmit_probe_skb(sk, 0, mib); } } @@ -3464,7 +3465,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = tcp_write_wakeup(sk); + err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8c65dc147d8b..65bf670e8714 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data) tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk) <= 0) { + if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { -- cgit v1.2.3 From eeb1bd5c40edb0e2fd925c8535e2fdebdbc5cef2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:08:05 -0500 Subject: net: Add a struct net parameter to sock_create_kern This is long overdue, and is part of cleaning up how we allocate kernel sockets that don't reference count struct net. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_receiver.c | 4 ++-- fs/afs/rxrpc.c | 2 +- fs/dlm/lowcomms.c | 16 ++++++++-------- include/linux/net.h | 2 +- net/bluetooth/rfcomm/core.c | 2 +- net/ceph/messenger.c | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- net/l2tp/l2tp_core.c | 4 ++-- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/rxrpc/ar-local.c | 4 ++-- net/socket.c | 4 ++-- 13 files changed, 26 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index cee20354ac37..c097909c589c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -598,7 +598,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; @@ -693,7 +693,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce memcpy(&my_addr, &connection->my_addr, my_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 3a57a1b0fb51..b50642870a43 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -85,7 +85,7 @@ int afs_open_socket(void) return -ENOMEM; } - ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); if (ret < 0) { destroy_workqueue(afs_async_calls); _leave(" = %d [socket]", ret); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d08e079ea5d3..754fd6c0b747 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &newsock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; @@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out; /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) goto out_err; @@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, addr_len = sizeof(struct sockaddr_in6); /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) { log_print("Can't create listening comms socket"); goto create_out; @@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, - IPPROTO_SCTP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; diff --git a/include/linux/net.h b/include/linux/net.h index 8a5e81d2bdf7..04aa06852771 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -207,7 +207,7 @@ void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); -int sock_create_kern(int family, int type, int proto, struct socket **res); +int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4fea24275b17..29709fbfd1f5 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock) BT_DBG(""); - err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); if (!err) { struct sock *sk = (*sock)->sk; sk->sk_data_ready = rfcomm_l2data_ready; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 967080a9f043..073262fea6dd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -480,8 +480,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) int ret; BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + ret = sock_create_kern(&init_net, con->peer_addr.in_addr.ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b47a4d79d04..09f4d024dfe5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(family, type, protocol, &sock); + int rc = sock_create_kern(&init_net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6bb98cc193c9..4e2837476967 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,7 +15,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index bba8903e871f..478576b61214 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,7 +19,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index a29a504492af..ae513a2fe7f3 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1399,7 +1399,7 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, + err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; @@ -1429,7 +1429,7 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(AF_INET, SOCK_DGRAM, + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 19b9cce6c210..2e9a5b5d1239 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1458,7 +1458,7 @@ static struct socket *make_send_sock(struct net *net, int id) int result; /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1518,7 +1518,7 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index ca904ed5400a..78483b4602bf 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -73,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local) _enter("%p{%d}", local, local->srx.transport_type); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, - &local->socket); + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; diff --git a/net/socket.c b/net/socket.c index b5f1f43ed8f4..9963a0b53a64 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1210,9 +1210,9 @@ int sock_create(int family, int type, int protocol, struct socket **res) } EXPORT_SYMBOL(sock_create); -int sock_create_kern(int family, int type, int protocol, struct socket **res) +int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { - return __sock_create(&init_net, family, type, protocol, res, 1); + return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); -- cgit v1.2.3 From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:09:13 -0500 Subject: net: Pass kern from net_proto_family.create to sk_alloc In preparation for changing how struct net is refcounted on kernel sockets pass the knowledge that we are creating a kernel socket from sock_create_kern through to sk_alloc. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- crypto/af_alg.c | 4 ++-- drivers/isdn/mISDN/socket.c | 12 ++++++------ drivers/net/macvtap.c | 2 +- drivers/net/ppp/pppoe.c | 4 ++-- drivers/net/ppp/pppox.c | 2 +- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 2 +- include/linux/if_pppox.h | 2 +- include/net/af_vsock.h | 2 +- include/net/llc_conn.h | 2 +- include/net/sock.h | 2 +- net/appletalk/ddp.c | 2 +- net/atm/common.c | 4 ++-- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap_sock.c | 10 +++++----- net/bluetooth/rfcomm/sock.c | 8 ++++---- net/bluetooth/sco.c | 8 ++++---- net/caif/caif_socket.c | 2 +- net/can/af_can.c | 2 +- net/core/sock.c | 3 ++- net/decnet/af_decnet.c | 8 ++++---- net/ieee802154/socket.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/iucv/af_iucv.c | 10 +++++----- net/key/af_key.c | 2 +- net/l2tp/l2tp_ppp.c | 4 ++-- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 6 +++--- net/netlink/af_netlink.c | 11 +++++------ net/netrom/af_netrom.c | 4 ++-- net/nfc/af_nfc.c | 2 +- net/nfc/llcp.h | 2 +- net/nfc/llcp_core.c | 2 +- net/nfc/llcp_sock.c | 8 ++++---- net/nfc/nfc.h | 2 +- net/nfc/rawsock.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/af_phonet.c | 2 +- net/phonet/pep.c | 2 +- net/rds/af_rds.c | 2 +- net/rose/af_rose.c | 4 ++-- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 8 ++++---- net/vmw_vsock/af_vsock.c | 7 ++++--- net/vmw_vsock/vmci_transport.c | 2 +- net/x25/af_x25.c | 8 ++++---- 59 files changed, 109 insertions(+), 108 deletions(-) (limited to 'net/ipv4') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index f22cc56fd1b3..5ad0d5354535 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -244,7 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) if (!type) goto unlock; - sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto); + sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0); err = -ENOMEM; if (!sk2) goto unlock; @@ -324,7 +324,7 @@ static int alg_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto); + sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern); if (!sk) goto out; diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 8dc7290089bb..0d29b5a6356d 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -601,14 +601,14 @@ static const struct proto_ops data_sock_ops = { }; static int -data_sock_create(struct net *net, struct socket *sock, int protocol) +data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto); + sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; @@ -756,14 +756,14 @@ static const struct proto_ops base_sock_ops = { static int -base_sock_create(struct net *net, struct socket *sock, int protocol) +base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto); + sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; @@ -785,7 +785,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) switch (proto) { case ISDN_P_BASE: - err = base_sock_create(net, sock, proto); + err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: @@ -799,7 +799,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: - err = data_sock_create(net, sock, proto); + err = data_sock_create(net, sock, proto, kern); break; default: return err; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 8c350c5d54ad..0398631a3c24 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -476,7 +476,7 @@ static int macvtap_open(struct inode *inode, struct file *file) err = -ENOMEM; q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, - &macvtap_proto); + &macvtap_proto, 0); if (!q) goto out; diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index aa1dd926623a..f86c5ab334aa 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -546,11 +546,11 @@ static struct proto pppoe_sk_proto __read_mostly = { * Initialize a new struct sock. * **********************************************************************/ -static int pppoe_create(struct net *net, struct socket *sock) +static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 2940e9fe351b..0e1b30622477 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -118,7 +118,7 @@ static int pppox_create(struct net *net, struct socket *sock, int protocol, !try_module_get(pppox_protos[protocol]->owner)) goto out; - rc = pppox_protos[protocol]->create(net, sock); + rc = pppox_protos[protocol]->create(net, sock, kern); module_put(pppox_protos[protocol]->owner); out: diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index e3bfbd4d0136..14839bc0aaf5 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -561,14 +561,14 @@ static void pptp_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); } -static int pptp_create(struct net *net, struct socket *sock) +static int pptp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; struct pppox_sock *po; struct pptp_opt *opt; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern); if (!sk) goto out; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3262f3e2b8b2..1a1c4f7b3ec5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2148,7 +2148,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) DBG1(KERN_INFO, "tunX: tun_chr_open\n"); tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, - &tun_proto); + &tun_proto, 0); if (!tfile) return -ENOMEM; RCU_INIT_POINTER(tfile->tun, NULL); diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 66a7d7600f43..b49cf923becc 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -74,7 +74,7 @@ static inline struct sock *sk_pppox(struct pppox_sock *po) struct module; struct pppox_proto { - int (*create)(struct net *net, struct socket *sock); + int (*create)(struct net *net, struct socket *sock, int kern); int (*ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); struct module *owner; diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 172632dd9930..db639a4c5ab8 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -74,7 +74,7 @@ void vsock_pending_work(struct work_struct *work); struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, - gfp_t priority, unsigned short type); + gfp_t priority, unsigned short type, int kern); /**** TRANSPORT ****/ diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index 0134681acc4c..fe994d2e5286 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -96,7 +96,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb) } struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot); + struct proto *prot, int kern); void llc_sk_free(struct sock *sk); void llc_sk_reset(struct sock *sk); diff --git a/include/net/sock.h b/include/net/sock.h index 3a4898ec8c67..d8dcf91732b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1514,7 +1514,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot); + struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_release_kernel(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3b7ad43c7dad..d5871ac493eb 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/atm/common.c b/net/atm/common.c index ed0466637e13..49a872db7e42 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -141,7 +141,7 @@ static struct proto vcc_proto = { .release_cb = vcc_release_cb, }; -int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern) { struct sock *sk; struct atm_vcc *vcc; @@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family) sock->sk = NULL; if (sock->type == SOCK_STREAM) return -EINVAL; - sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); diff --git a/net/atm/common.h b/net/atm/common.h index 4d6f5b2068ac..959436b87182 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -10,7 +10,7 @@ #include /* for poll_table */ -int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern); int vcc_release(struct socket *sock); int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index ae0324021407..040207ec399f 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; - return vcc_create(net, sock, protocol, PF_ATMPVC); + return vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { diff --git a/net/atm/svc.c b/net/atm/svc.c index 1ba23f5018e7..3fa0a9ee98d1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; - error = vcc_create(net, sock, protocol, AF_ATMSVC); + error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 330c1f4a5a0b..4273533d22b1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,7 +855,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; @@ -881,7 +881,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25, *oax25; - sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index bde2bdd9e929..b5116fa9835e 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index d82787d417bd..ce86a7bae844 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 56f9edbf3d05..5b14dcafcd08 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1377,7 +1377,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index cb3fdde1968a..008ba439bd62 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a7278f05eafb..244287706f91 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = { static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio); + int proto, gfp_t prio, int kern); bool l2cap_is_socket(struct socket *sock) { @@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, - GFP_ATOMIC); + GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return NULL; @@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = { }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio) + int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); if (!sk) return NULL; @@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &l2cap_sock_ops; - sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 825e8fb5114b..b2338e971b33 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -269,12 +269,12 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, sock->ops = &rfcomm_sock_ops; - sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -969,7 +969,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * goto done; } - sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 4322c833e748..6b6e59dc54cf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -460,11 +460,11 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); if (!sk) return NULL; @@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &sco_sock_ops; - sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn) bh_lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, - BTPROTO_SCO, GFP_ATOMIC); + BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { bh_unlock_sock(parent); sco_conn_unlock(conn); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4ec0c803aef1..78a04ebb113c 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1047,7 +1047,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ - sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; diff --git a/net/can/af_can.c b/net/can/af_can.c index 32d710eaf1fc..d4d404bdfc9a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -179,7 +179,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock->ops = cp->ops; - sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; diff --git a/net/core/sock.c b/net/core/sock.c index e891bcf325ca..cbc3789b830c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1396,9 +1396,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 754484b3cd0e..675cf94e04f8 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -468,10 +468,10 @@ static struct proto dn_proto = { .obj_size = sizeof(struct dn_sock), }; -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) { struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); if (!sk) goto out; @@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, } - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) return -ENOBUFS; sk->sk_protocol = protocol; @@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b60c65f70346..7aaaf967df58 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1014,7 +1014,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, } rc = -ENOMEM; - sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09f4d024dfe5..e2dd9cb99d61 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -317,7 +317,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4632afa57e05..f3866c0b6cfe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,7 +167,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 4ea5d7497b5f..48d0dc89b58d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); if (!sk) goto out; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ee0ea25c8e7a..fae6822cc367 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6daa52a18d40..918151c11348 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; } -static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct iucv_sock *iucv; - sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern); if (!sk) return NULL; iucv = iucv_sk(sk); @@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); if (!sk) return -ENOMEM; @@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/net/key/af_key.c b/net/key/af_key.c index f0d52d721b3a..9e834ec475a9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) goto out; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e9b0dec56b8e..f56c9f69e9f2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) /* socket() handler. Initialize a new struct sock. */ -static int pppol2tp_create(struct net *net, struct socket *sock) +static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 17a8dff06090..8fd9febaa5ba 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; - sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 81a61fce3afb..3e821daf9dd4 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, - sk->sk_prot); + sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) @@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk) * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one */ -struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { - struct sock *sk = sk_alloc(net, family, priority, prot); + struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a5fff75accf8..be6665ab7f40 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1119,14 +1119,15 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1188,7 +1189,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -2515,14 +2516,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* * We have to just have a reference on the net from sk, but don't * get_net it. Besides, we cannot get and then put the net here. * So we create one inside init_net and the move it to net. */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0) goto out_sock_release_nosk; sk = sock->sk; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b987fd56c3c5..ed212ffc1d9d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; @@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 2277276f52bc..54e40fa47822 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { - rc = proto_tab[proto]->create(net, sock, proto_tab[proto]); + rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index de1789e3cc82..1f68724d44d3 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction); /* Sock API */ -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp); +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern); void nfc_llcp_sock_free(struct nfc_llcp_sock *sock); void nfc_llcp_accept_unlink(struct sock *sk); void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index b18f07ccb504..98876274a1ee 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, sock->ssap = ssap; } - new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC); + new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9578bd6a4f3e..b7de0da46acd 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk) } } -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto); + sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; @@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) } static int llcp_sock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock, else sock->ops = &llcp_sock_ops; - sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC); + sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index a8ce80b47720..5c93e8412a26 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -30,7 +30,7 @@ struct nfc_protocol { struct proto *proto; struct module *owner; int (*create)(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto); + const struct nfc_protocol *nfc_proto, int kern); }; struct nfc_rawsock { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 82b4e8024778..e9a91488fe3d 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -334,7 +334,7 @@ static void rawsock_destruct(struct sock *sk) } static int rawsock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -348,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock, else sock->ops = &rawsock_ops; - sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); + sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5102c3cc4eec..94713276a1d9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2832,7 +2832,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 32ab87d34828..10d42f3220ab 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, goto out; } - sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 6de2aeb98a1f..850a86cde0b3 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 10443377fb9d..3d83641f2861 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -440,7 +440,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8ae603069a1a..36dbc2da3661 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -520,7 +520,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; @@ -559,7 +559,7 @@ static struct sock *rose_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0095b9a0b779..25d60ed15284 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; - sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0e4198ee2370..e703ff7fed40 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -635,7 +635,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 53b7acde9aa3..59e80356672b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot); + sk->sk_prot, 0); struct inet_sock *newinet; if (!newsk) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9074b5cede38..8f3c8e2cef8e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -342,7 +342,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5266ea7b922b..941b3d26e3bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -620,7 +620,7 @@ static struct proto unix_proto = { */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; struct unix_sock *u; @@ -629,7 +629,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -688,7 +688,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -1088,7 +1088,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2ec86e652a19..df5fc6b340f1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, - unsigned short type) + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; - sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; @@ -1866,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM; + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; } static const struct net_proto_family vsock_family_ops = { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c294da095461..1f63daff3965 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type); + sk->sk_type, 0); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c3ab230e4493..a750f330b8dd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -515,10 +515,10 @@ static struct proto x25_proto = { .obj_size = sizeof(struct x25_sock), }; -static struct sock *x25_alloc_socket(struct net *net) +static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; - struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; @@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOBUFS; - if ((sk = x25_alloc_socket(net)) == NULL) + if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); @@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) goto out; - if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); -- cgit v1.2.3 From 26abe14379f8e2fa3fd1bcf97c9a7ad9364886fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:10:31 -0500 Subject: net: Modify sk_alloc to not reference count the netns of kernel sockets. Now that sk_alloc knows when a kernel socket is being allocated modify it to not reference count the network namespace of kernel sockets. Keep track of if a socket needs reference counting by adding a flag to struct sock called sk_net_refcnt. Update all of the callers of sock_create_kern to stop using sk_change_net and sk_release_kernel as those hacks are no longer needed, to avoid reference counting a kernel socket. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/net/inet_common.h | 2 +- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++-- net/ipv4/af_inet.c | 4 +--- net/ipv4/udp_tunnel.c | 8 +++----- net/ipv6/ip6_udp_tunnel.c | 6 ++---- net/l2tp/l2tp_core.c | 15 ++++++--------- net/netfilter/ipvs/ip_vs_sync.c | 30 +++++++++--------------------- 8 files changed, 30 insertions(+), 45 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 4a92423eefa5..279f83591971 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -41,7 +41,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, static inline void inet_ctl_sock_destroy(struct sock *sk) { - sk_release_kernel(sk); + sock_release(sk->sk_socket); } #endif diff --git a/include/net/sock.h b/include/net/sock.h index d8dcf91732b0..9e6b2c0b4741 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -184,6 +184,7 @@ struct sock_common { unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; + unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -323,6 +324,7 @@ struct sock { #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only +#define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot diff --git a/net/core/sock.c b/net/core/sock.c index cbc3789b830c..9e8968e9ebeb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1412,7 +1412,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1446,7 +1449,8 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e2dd9cb99d61..235d36afece3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(&init_net, family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1440,8 +1440,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 4e2837476967..933ea903f7b8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 478576b61214..e1a1136bda7c 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); @@ -55,7 +53,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ae513a2fe7f3..f6b090df3930 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) if (sock) inet_shutdown(sock, 2); } else { - if (sock) + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + sock_release(sock); + } } l2tp_tunnel_sock_put(sk); @@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1462,7 +1459,7 @@ out: *sockp = sock; if ((err < 0) && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); *sockp = NULL; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 2e9a5b5d1239..b08ba9538d12 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id) struct socket *sock; int result; - /* First create a socket move it to right name space later */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1692,7 +1680,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } -- cgit v1.2.3 From ebb9a03a590e2325f747be43c8db450e92509501 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 10 May 2015 09:47:46 -0700 Subject: switchdev: s/netdev_switch_/switchdev_/ and s/NETDEV_SWITCH_/SWITCHDEV_/ Turned out that "switchdev" sticks. So just unify all related terms to use this prefix. Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Acked-by: Roopa Prabhu Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +- drivers/net/ethernet/rocker/rocker.c | 10 +-- drivers/net/team/team.c | 4 +- include/net/switchdev.h | 111 +++++++++++++++-------------- net/bridge/br.c | 22 +++--- net/bridge/br_netlink.c | 6 +- net/bridge/br_stp.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 2 +- net/ipv4/fib_trie.c | 40 +++++------ net/switchdev/switchdev.c | 134 +++++++++++++++++------------------ 11 files changed, 166 insertions(+), 171 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2ee13be7551b..a475a0389cc7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4039,8 +4039,8 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, - .ndo_bridge_setlink = ndo_dflt_netdev_switch_port_bridge_setlink, - .ndo_bridge_dellink = ndo_dflt_netdev_switch_port_bridge_dellink, + .ndo_bridge_setlink = ndo_dflt_switchdev_port_bridge_setlink, + .ndo_bridge_dellink = ndo_dflt_switchdev_port_bridge_dellink, .ndo_features_check = passthru_features_check, }; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 0c6f0a8b42dd..418f62b19545 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3377,17 +3377,17 @@ static void rocker_port_fdb_learn_work(struct work_struct *work) container_of(work, struct rocker_fdb_learn_work, work); bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE); bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED); - struct netdev_switch_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info; info.addr = lw->addr; info.vid = lw->vid; if (learned && removing) - call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_DEL, - lw->dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_DEL, + lw->dev, &info.info); else if (learned && !removing) - call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_ADD, - lw->dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_ADD, + lw->dev, &info.info); kfree(work); } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 6928448f6b7f..cfe84965afb6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1977,8 +1977,8 @@ static const struct net_device_ops team_netdev_ops = { .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, .ndo_change_carrier = team_change_carrier, - .ndo_bridge_setlink = ndo_dflt_netdev_switch_port_bridge_setlink, - .ndo_bridge_dellink = ndo_dflt_netdev_switch_port_bridge_dellink, + .ndo_bridge_setlink = ndo_dflt_switchdev_port_bridge_setlink, + .ndo_bridge_dellink = ndo_dflt_switchdev_port_bridge_dellink, .ndo_features_check = passthru_features_check, }; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d2e69ee3019a..cd921fa0d63a 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -43,124 +43,125 @@ struct swdev_ops { u8 tos, u8 type, u32 tb_id); }; -enum netdev_switch_notifier_type { - NETDEV_SWITCH_FDB_ADD = 1, - NETDEV_SWITCH_FDB_DEL, +enum switchdev_notifier_type { + SWITCHDEV_FDB_ADD = 1, + SWITCHDEV_FDB_DEL, }; -struct netdev_switch_notifier_info { +struct switchdev_notifier_info { struct net_device *dev; }; -struct netdev_switch_notifier_fdb_info { - struct netdev_switch_notifier_info info; /* must be first */ +struct switchdev_notifier_fdb_info { + struct switchdev_notifier_info info; /* must be first */ const unsigned char *addr; u16 vid; }; static inline struct net_device * -netdev_switch_notifier_info_to_dev(const struct netdev_switch_notifier_info *info) +switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info) { return info->dev; } #ifdef CONFIG_NET_SWITCHDEV -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid); -int netdev_switch_port_stp_update(struct net_device *dev, u8 state); -int register_netdev_switch_notifier(struct notifier_block *nb); -int unregister_netdev_switch_notifier(struct notifier_block *nb); -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info); -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id); -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id); -void netdev_switch_fib_ipv4_abort(struct fib_info *fi); +int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid); +int switchdev_port_stp_update(struct net_device *dev, u8 state); +int register_switchdev_notifier(struct notifier_block *nb); +int unregister_switchdev_notifier(struct notifier_block *nb); +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info); +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id); +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +void switchdev_fib_ipv4_abort(struct fib_info *fi); #else -static inline int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static inline int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { return -EOPNOTSUPP; } -static inline int netdev_switch_port_stp_update(struct net_device *dev, - u8 state) +static inline int switchdev_port_stp_update(struct net_device *dev, + u8 state) { return -EOPNOTSUPP; } -static inline int register_netdev_switch_notifier(struct notifier_block *nb) +static inline int register_switchdev_notifier(struct notifier_block *nb) { return 0; } -static inline int unregister_netdev_switch_notifier(struct notifier_block *nb) +static inline int unregister_switchdev_notifier(struct notifier_block *nb) { return 0; } -static inline int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +static inline int call_switchdev_notifiers(unsigned long val, + struct net_device *dev, + struct switchdev_notifier_info *info) { return NOTIFY_DONE; } -static inline int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return -EOPNOTSUPP; } -static inline int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return -EOPNOTSUPP; } -static inline int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return 0; } -static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return 0; } -static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, u32 tb_id) +static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 nlflags, u32 tb_id) { return 0; } -static inline int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +static inline int switchdev_fib_ipv4_del(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { return 0; } -static inline void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +static inline void switchdev_fib_ipv4_abort(struct fib_info *fi) { } diff --git a/net/bridge/br.c b/net/bridge/br.c index 02c24cf63c34..a1abe4936fe1 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,13 +121,13 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; -static int br_netdev_switch_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int br_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) { - struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; - struct netdev_switch_notifier_fdb_info *fdb_info; + struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; rtnl_lock(); @@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused, br = p->br; switch (event) { - case NETDEV_SWITCH_FDB_ADD: + case SWITCHDEV_FDB_ADD: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); if (err) err = notifier_from_errno(err); break; - case NETDEV_SWITCH_FDB_DEL: + case SWITCHDEV_FDB_DEL: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid); @@ -159,8 +159,8 @@ out: return err; } -static struct notifier_block br_netdev_switch_notifier = { - .notifier_call = br_netdev_switch_event, +static struct notifier_block br_switchdev_notifier = { + .notifier_call = br_switchdev_event, }; static void __net_exit br_net_exit(struct net *net) @@ -214,7 +214,7 @@ static int __init br_init(void) if (err) goto err_out3; - err = register_netdev_switch_notifier(&br_netdev_switch_notifier); + err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; @@ -235,7 +235,7 @@ static int __init br_init(void) return 0; err_out5: - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -253,7 +253,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4b5c236998ff..dc234533e204 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -631,8 +631,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* set bridge attributes in hardware if supported */ - ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_setlink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error setting attrs on port %u(%s)\n", (unsigned int)p->port_no, p->dev->name); @@ -671,8 +670,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* del bridge attributes in hardware */ - ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_dellink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error deleting attrs on port %u (%s)\n", (unsigned int)p->port_no, p->dev->name); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index fb3ebe615513..28e3f4bc01e0 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -42,7 +42,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) int err; p->state = state; - err = netdev_switch_port_stp_update(p->dev, state); + err = switchdev_port_stp_update(p->dev, state); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4238d6da5c60..be86a7ce9282 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -460,7 +460,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct netdev_phys_item_id ppid; - ret = netdev_switch_parent_id_get(netdev, &ppid); + ret = switchdev_parent_id_get(netdev, &ppid); if (!ret) ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 83e08323fdcd..fcd41fcd7e70 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1006,7 +1006,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) int err; struct netdev_phys_item_id psid; - err = netdev_switch_parent_id_get(dev, &psid); + err = switchdev_parent_id_get(dev, &psid); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc602da2..03444c6ae342 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1165,13 +1165,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; - err = netdev_switch_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); kmem_cache_free(fn_alias_kmem, new_fa); goto out; } @@ -1215,12 +1215,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->tb_id = tb->tb_id; /* (Optionally) offload fib entry to switch hardware. */ - err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); goto out_free_new_fa; } @@ -1239,7 +1237,7 @@ succeeded: return 0; out_sw_fib_del: - netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1517,8 +1515,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1767,10 +1765,9 @@ void fib_table_flush_external(struct fib_table *tb) if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) continue; - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); } /* update leaf slen */ @@ -1835,10 +1832,9 @@ int fib_table_flush(struct fib_table *tb) continue; } - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b85c333..52613ed49a8c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -19,14 +19,14 @@ #include /** - * netdev_switch_parent_id_get - Get ID of a switch + * switchdev_parent_id_get - Get ID of a switch * @dev: port device * @psid: switch ID * * Get ID of a switch this port is part of. */ -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { const struct swdev_ops *ops = dev->swdev_ops; @@ -34,17 +34,17 @@ int netdev_switch_parent_id_get(struct net_device *dev, return -EOPNOTSUPP; return ops->swdev_parent_id_get(dev, psid); } -EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); +EXPORT_SYMBOL_GPL(switchdev_parent_id_get); /** - * netdev_switch_port_stp_update - Notify switch device port of STP + * switchdev_port_stp_update - Notify switch device port of STP * state change * @dev: port device * @state: port STP state * * Notify switch device port of bridge port STP state change. */ -int netdev_switch_port_stp_update(struct net_device *dev, u8 state) +int switchdev_port_stp_update(struct net_device *dev, u8 state) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -55,57 +55,57 @@ int netdev_switch_port_stp_update(struct net_device *dev, u8 state) return ops->swdev_port_stp_update(dev, state); netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_stp_update(lower_dev, state); + err = switchdev_port_stp_update(lower_dev, state); if (err && err != -EOPNOTSUPP) return err; } return err; } -EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update); +EXPORT_SYMBOL_GPL(switchdev_port_stp_update); -static DEFINE_MUTEX(netdev_switch_mutex); -static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain); +static DEFINE_MUTEX(switchdev_mutex); +static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** - * register_netdev_switch_notifier - Register notifier + * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. This should be used by code * which needs to monitor events happening in particular device. * Return values are same as for atomic_notifier_chain_register(). */ -int register_netdev_switch_notifier(struct notifier_block *nb) +int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_register(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(register_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** - * unregister_netdev_switch_notifier - Unregister notifier + * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. * Return values are same as for atomic_notifier_chain_unregister(). */ -int unregister_netdev_switch_notifier(struct notifier_block *nb) +int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** - * call_netdev_switch_notifiers - Call notifiers + * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data @@ -114,21 +114,21 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). */ -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) { int err; info->dev = dev; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); +EXPORT_SYMBOL_GPL(call_switchdev_notifiers); /** - * netdev_switch_port_bridge_setlink - Notify switch device port of bridge + * switchdev_port_bridge_setlink - Notify switch device port of bridge * port attributes * * @dev: port device @@ -137,8 +137,8 @@ EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); * * Notify switch device port of bridge port attributes */ -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -150,10 +150,10 @@ int netdev_switch_port_bridge_setlink(struct net_device *dev, return ops->ndo_bridge_setlink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * netdev_switch_port_bridge_dellink - Notify switch device port of bridge + * switchdev_port_bridge_dellink - Notify switch device port of bridge * port attribute delete * * @dev: port device @@ -162,8 +162,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); * * Notify switch device port of bridge port attribute delete */ -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -175,11 +175,11 @@ int netdev_switch_port_bridge_dellink(struct net_device *dev, return ops->ndo_bridge_dellink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); /** - * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink - * op for master devices + * ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -187,8 +187,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); * * Notify master device slaves of bridge port attributes */ -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -198,18 +198,18 @@ int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags); + err = switchdev_port_bridge_setlink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink - * op for master devices + * ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -217,8 +217,8 @@ EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); * * Notify master device slaves of bridge port attribute deletes */ -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -228,16 +228,16 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags); + err = switchdev_port_bridge_dellink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink); -static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) +static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -253,7 +253,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = netdev_switch_get_lowest_dev(lower_dev); + port_dev = switchdev_get_lowest_dev(lower_dev); if (port_dev) return port_dev; } @@ -261,7 +261,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return NULL; } -static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) +static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { struct netdev_phys_item_id psid; struct netdev_phys_item_id prev_psid; @@ -276,11 +276,11 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) if (!nh->nh_dev) return NULL; - dev = netdev_switch_get_lowest_dev(nh->nh_dev); + dev = switchdev_get_lowest_dev(nh->nh_dev); if (!dev) return NULL; - if (netdev_switch_parent_id_get(dev, &psid)) + if (switchdev_parent_id_get(dev, &psid)) return NULL; if (nhsel > 0) { @@ -297,7 +297,7 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) } /** - * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch + * switchdev_fib_ipv4_add - Add IPv4 route entry to switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -309,8 +309,8 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) * * Add IPv4 route entry to switch device. */ -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -328,7 +328,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (fi->fib_net->ipv4.fib_offload_disabled) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -343,10 +343,10 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); /** - * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch + * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -357,8 +357,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); * * Delete IPv4 route entry from switch device. */ -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -367,7 +367,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!(fi->fib_flags & RTNH_F_EXTERNAL)) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -381,14 +381,14 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); /** - * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation + * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation * * @fi: route FIB info structure */ -void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +void switchdev_fib_ipv4_abort(struct fib_info *fi) { /* There was a problem installing this route to the offload * device. For now, until we come up with more refined @@ -401,4 +401,4 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi) fib_flush_external(fi->fib_net); fi->fib_net->ipv4.fib_offload_disabled = true; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -- cgit v1.2.3 From 216f8bb9f67ff636769064f86812e1ce6466e4af Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2015 06:22:56 -0700 Subject: tcp/dccp: tw_timer_handler() is static tw_timer_handler() is only used from net/ipv4/inet_timewait_sock.c Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 00ec8d5d7e7e..2ffbd16b79e0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -170,7 +170,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -void tw_timer_handler(unsigned long data) +static void tw_timer_handler(unsigned long data) { struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; -- cgit v1.2.3 From 7d771aaac7b2459013e9a246f16c06d4f2b819e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2015 06:31:48 -0700 Subject: ipv4: __ip_local_out_sk() is static __ip_local_out_sk() is only used from net/ipv4/ip_output.c net/ipv4/ip_output.c:94:5: warning: symbol '__ip_local_out_sk' was not declared. Should it be static? Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c65b93a7b711..2acc5dc32807 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -91,7 +91,7 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); -- cgit v1.2.3 From 125907ae5ef000a3855b33f6545d672275413836 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:26 -0400 Subject: geneve: remove MODULE_ALIAS_RTNL_LINK from net/ipv4/geneve.c This file is essentially a library for implementing the geneve encapsulation protocol. The file does not register any rtnl_link_ops, so the MODULE_ALIAS_RTNL_LINK macro is inappropriate here. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 8986e63f3bda..8e6a7fe27a4c 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -450,4 +450,3 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross "); MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); -MODULE_ALIAS_RTNL_LINK("geneve"); -- cgit v1.2.3 From 35d32e8fe4ab44180e46a0dd54abea6985398d00 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:27 -0400 Subject: geneve: move definition of geneve_hdr() to geneve.h This is a static inline with identical definitions in multiple places... Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- include/net/geneve.h | 5 +++++ net/ipv4/geneve.c | 5 ----- net/openvswitch/vport-geneve.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/geneve.h b/include/net/geneve.h index 14fb8d3390b4..2a0543a1899d 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -62,6 +62,11 @@ struct genevehdr { struct geneve_opt options[]; }; +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + #ifdef CONFIG_INET struct geneve_sock; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 8e6a7fe27a4c..001843d41135 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -60,11 +60,6 @@ struct geneve_net { static int geneve_net_id; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - static struct geneve_sock *geneve_find_sock(struct net *net, sa_family_t family, __be16 port) { diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index bf02fd5808c9..208c576bd1b6 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -46,11 +46,6 @@ static inline struct geneve_port *geneve_vport(const struct vport *vport) return vport_priv(vport); } -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - /* Convert 64 bit tunnel ID to 24 bit VNI. */ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { -- cgit v1.2.3 From 11e1fa46b43216458e0f67f1f0b257586c5d8e5c Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:28 -0400 Subject: geneve: Rename support library as geneve_core net/ipv4/geneve.c -> net/ipv4/geneve_core.c This name better reflects the purpose of the module. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 4 +- net/ipv4/Makefile | 2 +- net/ipv4/geneve.c | 447 ------------------------------------------------ net/ipv4/geneve_core.c | 447 ++++++++++++++++++++++++++++++++++++++++++++++++ net/openvswitch/Kconfig | 2 +- 5 files changed, 451 insertions(+), 451 deletions(-) delete mode 100644 net/ipv4/geneve.c create mode 100644 net/ipv4/geneve_core.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index bd2901604842..d83071dccd74 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,8 +331,8 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" +config GENEVE_CORE + tristate "Generic Network Virtualization Encapsulation library" depends on INET select NET_UDP_TUNNEL ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 518c04ed666e..b36236dd6014 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE) += geneve.o +obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c deleted file mode 100644 index 001843d41135..000000000000 --- a/net/ipv4/geneve.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#include -#include -#include -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. - */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve driver\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c new file mode 100644 index 000000000000..001843d41135 --- /dev/null +++ b/net/ipv4/geneve_core.c @@ -0,0 +1,447 @@ +/* + * Geneve: Generic Network Virtualization Encapsulation + * + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#include +#endif + +/* Protects sock_list and refcounts. */ +static DEFINE_MUTEX(geneve_mutex); + +/* per-network namespace private data for this module */ +struct geneve_net { + struct list_head sock_list; +}; + +static int geneve_net_id; + +static struct geneve_sock *geneve_find_sock(struct net *net, + sa_family_t family, __be16 port) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + + list_for_each_entry(gs, &gn->sock_list, list) { + if (inet_sk(gs->sock->sk)->inet_sport == port && + inet_sk(gs->sock->sk)->sk.sk_family == family) + return gs; + } + + return NULL; +} + +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + +/* Transmit a fully formatted Geneve frame. + * + * When calling this function. The skb->data should point + * to the geneve header which is fully formed. + * + * This function will add other UDP tunnel headers. + */ +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum, bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; + + skb = udp_tunnel_handle_offloads(skb, csum); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + + return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, + tos, ttl, df, src_port, dst_port, xnet, + !csum); +} +EXPORT_SYMBOL_GPL(geneve_xmit_skb); + +static int geneve_hlen(struct genevehdr *gh) +{ + return sizeof(*gh) + gh->opt_len * 4; +} + +static struct sk_buff **geneve_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) +{ + struct sk_buff *p, **pp = NULL; + struct genevehdr *gh, *gh2; + unsigned int hlen, gh_len, off_gnv; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_gnv = skb_gro_offset(skb); + hlen = off_gnv + sizeof(*gh); + gh = skb_gro_header_fast(skb, off_gnv); + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + if (gh->ver != GENEVE_VER || gh->oam) + goto out; + gh_len = geneve_hlen(gh); + + hlen = off_gnv + gh_len; + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + gh2 = (struct genevehdr *)(p->data + off_gnv); + if (gh->opt_len != gh2->opt_len || + memcmp(gh, gh2, gh_len)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, gh_len); + skb_gro_postpull_rcsum(skb, gh, gh_len); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) +{ + struct genevehdr *gh; + struct packet_offload *ptype; + __be16 type; + int gh_len; + int err = -ENOSYS; + + udp_tunnel_gro_complete(skb, nhoff); + + gh = (struct genevehdr *)(skb->data + nhoff); + gh_len = geneve_hlen(gh); + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); + + rcu_read_unlock(); + return err; +} + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +static void geneve_notify_del_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + + if (sa_family == AF_INET) + udp_del_offload(&gs->udp_offloads); +} + +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + gs->rcv(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static struct socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + gs->refcnt = 1; + gs->rcv = rcv; + gs->rcv_data = data; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; + gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; + geneve_notify_add_rx_port(gs); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + list_add(&gs->list, &gn->sock_list); + + return gs; +} + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) +{ + struct geneve_sock *gs; + + mutex_lock(&geneve_mutex); + + gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); + if (gs) { + if (!no_share && gs->rcv == rcv) + gs->refcnt++; + else + gs = ERR_PTR(-EBUSY); + } else { + gs = geneve_socket_create(net, port, rcv, data, ipv6); + } + + mutex_unlock(&geneve_mutex); + + return gs; +} +EXPORT_SYMBOL_GPL(geneve_sock_add); + +void geneve_sock_release(struct geneve_sock *gs) +{ + mutex_lock(&geneve_mutex); + + if (--gs->refcnt) + goto unlock; + + list_del(&gs->list); + geneve_notify_del_rx_port(gs); + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); + +unlock: + mutex_unlock(&geneve_mutex); +} +EXPORT_SYMBOL_GPL(geneve_sock_release); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + + INIT_LIST_HEAD(&gn->sock_list); + + return 0; +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +static int __init geneve_init_module(void) +{ + int rc; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + return rc; + + pr_info("Geneve driver\n"); + + return 0; +} +module_init(geneve_init_module); + +static void __exit geneve_cleanup_module(void) +{ + unregister_pernet_subsys(&geneve_net_ops); +} +module_exit(geneve_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jesse Gross "); +MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ed6b0f8dd1bb..15840401a2ce 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -59,7 +59,7 @@ config OPENVSWITCH_VXLAN config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE + depends on GENEVE_CORE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. -- cgit v1.2.3 From d37d29c30509854d15401277f39aab0336525f27 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:29 -0400 Subject: geneve_core: identify as driver library in modules description Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/ipv4/geneve_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c index 001843d41135..311a4ba6950a 100644 --- a/net/ipv4/geneve_core.c +++ b/net/ipv4/geneve_core.c @@ -430,7 +430,7 @@ static int __init geneve_init_module(void) if (rc) return rc; - pr_info("Geneve driver\n"); + pr_info("Geneve core logic\n"); return 0; } @@ -444,4 +444,4 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); +MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); -- cgit v1.2.3 From c24a59649f3c8f4f78adc2d0e31423fa883b012b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 14 May 2015 14:31:28 -0700 Subject: ip_tunnel: Report Rx dropped in ip_tunnel_get_stats64 The rx_dropped stat wasn't being reported when ip_tunnel_get_stats64 was called. This was leading to some confusing results in my debug as I was seeing rx_errors increment but no other value which pointed me toward the type of error being seen. This change corrects that by using netdev_stats_to_stats64 to copy all available dev stats instead of just the few that were hand picked. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 3998b1822d85..6a51a71a6c67 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -165,6 +165,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, { int i; + netdev_stats_to_stats64(tot, &dev->stats); + for_each_possible_cpu(i) { const struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, i); @@ -185,22 +187,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->tx_bytes += tx_bytes; } - tot->multicast = dev->stats.multicast; - - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - tot->collisions = dev->stats.collisions; - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); -- cgit v1.2.3 From 55917a21d0cc012bb6073bb05bb768fd51d8e237 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 May 2015 14:57:23 +0200 Subject: netfilter: x_tables: add context to know if extension runs from nft_compat Currently, we have four xtables extensions that cannot be used from the xt over nft compat layer. The problem is that they need real access to the full blown xt_entry to validate that the rule comes with the right dependencies. This check was introduced to overcome the lack of sufficient userspace dependency validation in iptables. To resolve this problem, this patch introduces a new field to the xt_tgchk_param structure that tell us if the extension is run from nft_compat context. The three affected extensions are: 1) CLUSTERIP, this target has been superseded by xt_cluster. So just bail out by returning -EINVAL. 2) TCPMSS. Relax the checking when used from nft_compat. If used with the wrong configuration, it will corrupt !syn packets by adding TCP MSS option. 3) ebt_stp. Relax the check to make sure it uses the reserved destination MAC address for STP. Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- include/linux/netfilter/x_tables.h | 2 ++ net/bridge/netfilter/ebt_stp.c | 6 ++++-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 5 +++++ net/netfilter/nft_compat.c | 2 ++ net/netfilter/xt_TCPMSS.c | 6 ++++++ 5 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a3e215bb0241..09f38206c18f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -62,6 +62,7 @@ struct xt_mtchk_param { void *matchinfo; unsigned int hook_mask; u_int8_t family; + bool nft_compat; }; /** @@ -92,6 +93,7 @@ struct xt_tgchk_param { void *targinfo; unsigned int hook_mask; u_int8_t family; + bool nft_compat; }; /* Target destructor parameters */ diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 071d87214dde..0c40570069ba 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) !(info->bitmask & EBT_STP_MASK)) return -EINVAL; /* Make sure the match only receives stp frames */ - if (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) + if (!par->nft_compat && + (!ether_addr_equal(e->destmac, bridge_ula) || + !ether_addr_equal(e->destmsk, msk) || + !(e->bitmask & EBT_DESTMAC))) return -EINVAL; return 0; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 771ab3d01ad3..45cb16a6a4a3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct clusterip_config *config; int ret; + if (par->nft_compat) { + pr_err("cannot use CLUSTERIP target from nftables compat\n"); + return -EOPNOTSUPP; + } + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 7f29cfc76349..66def315eb56 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) @@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e762de5ee89b..8c3190e2fc6a 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; @@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; -- cgit v1.2.3 From 252a8fbe819d041b29789e2035cd1760f373345f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 08:58:45 -0700 Subject: ipip: fix one sparse error make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/ipip.o CHECK net/ipv4/ipip.c net/ipv4/ipip.c:254:27: warning: incorrect type in assignment (different base types) net/ipv4/ipip.c:254:27: expected restricted __be32 [addressable] [usertype] o_key net/ipv4/ipip.c:254:27: got restricted __be16 [addressable] [usertype] i_flags Fixes: 3b7b514f44bf ("ipip: fix a regression in ioctl") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ff96396ebec5..254238daf58b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -251,7 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + p.i_key = p.o_key = 0; + p.i_flags = p.o_flags = 0; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); -- cgit v1.2.3 From ba6d05641c8a22272a181ffbdacde139d4b986b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 09:07:31 -0700 Subject: netfilter: synproxy: fix sparse errors Fix verbose sparse errors : make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/netfilter/ipt_SYNPROXY.o Signed-off-by: Eric Dumazet Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_SYNPROXY.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index e9e67793055f..fe8cc183411e 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,7 @@ #include static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; @@ -220,7 +220,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; -- cgit v1.2.3 From a6c5ea4ccf0033591e6e476d7a273c0074c07aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:26 -0700 Subject: tcp: rename sk_forced_wmem_schedule() to sk_forced_mem_schedule() We plan to use sk_forced_wmem_schedule() in input path as well, so make it non static and rename it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7ace6acbf5fd..841691a296dc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -311,6 +311,8 @@ static inline bool tcp_out_of_memory(struct sock *sk) return false; } +void sk_forced_mem_schedule(struct sock *sk, int size); + static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { struct percpu_counter *ocp = sk->sk_prot->orphan_count; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7386d32cd670..bac1a950d087 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2816,8 +2816,10 @@ begin_fwd: * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() */ -static void sk_forced_wmem_schedule(struct sock *sk, int size) +void sk_forced_mem_schedule(struct sock *sk, int size) { int amt, status; @@ -2864,7 +2866,7 @@ coalesce: return; } skb_reserve(skb, MAX_TCP_HEADER); - sk_forced_wmem_schedule(sk, skb->truesize); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit v1.2.3 From b8da51ebb1aa93908350f95efae73aecbc2e266c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:27 -0700 Subject: tcp: introduce tcp_under_memory_pressure() Introduce an optimized version of sk_under_memory_pressure() for TCP. Our intent is to use it in fast paths. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 841691a296dc..0d85223efa4c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -286,6 +286,14 @@ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern int tcp_memory_pressure; +/* optimized version of sk_under_memory_pressure() for TCP sockets */ +static inline bool tcp_under_memory_pressure(const struct sock *sk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + return !!sk->sk_cgrp->memory_pressure; + + return tcp_memory_pressure; +} /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cf8b20ff6658..093779f7e893 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -4781,7 +4781,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4825,7 +4825,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bac1a950d087..08c2cc40b26d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2843,7 +2843,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 65bf670e8714..5b752f58a900 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } -- cgit v1.2.3 From 8e4d980ac21596a9b91d8e720c77ad081975a0a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:28 -0700 Subject: tcp: fix behavior for epoll edge trigger Under memory pressure, tcp_sendmsg() can fail to queue a packet while no packet is present in write queue. If we return -EAGAIN with no packet in write queue, no ACK packet will ever come to raise EPOLLOUT. We need to allow one skb per TCP socket, and make sure that tcp sockets can release their forward allocations under pressure. This is a followup to commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ecccfdc50d76..9eabfd3e0925 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_schedule; + + if (skb_queue_len(&sk->sk_write_queue) == 0) { + mem_schedule = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_schedule = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_schedule)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes -- cgit v1.2.3 From 76dfa6082032b5c179864816fa508879421678eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:29 -0700 Subject: tcp: allow one skb to be received per socket under memory pressure While testing tight tcp_mem settings, I found tcp sessions could be stuck because we do not allow even one skb to be received on them. By allowing one skb to be received, we introduce fairness and eventuallu force memory hogs to release their allocation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 093779f7e893..40c435997e54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); -- cgit v1.2.3 From b66e91ccbc34ebd5a2f90f9e1bc1597e2924a500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:30 -0700 Subject: tcp: halves tcp_mem[] limits Allowing tcp to use ~19% of physical memory is way too much, and allowed bugs to be hidden. Add to this that some drivers use a full page per incoming frame, so real cost can be twice the advertized one. Reduce tcp_mem by 50 % as a first step to sanity. tcp_mem[0,1,2] defaults are now 4.68%, 6.25%, 9.37% of physical memory. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9eabfd3e0925..c724195e5862 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3068,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) -- cgit v1.2.3 From 5cf422808244ca8f1177c72fe6e1ce8322794b57 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:35 -0700 Subject: ipv4: introduce frag_expire_skip_icmp() Improve readability of skip ICMP for de-fragmentation expiration logic. This change will also make the logic easier to maintain when the following patches in this series are applied. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/ip.h | 10 ++++++++++ net/ipv4/ip_fragment.c | 13 +++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 0ed6d768e606..43f6f39df9fc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -478,6 +478,16 @@ enum ip_defrag_users { IP_DEFRAG_MACVLAN, }; +/* Return true if the value of 'user' is between 'lower_bond' + * and 'upper_bond' inclusively. + */ +static inline bool ip_defrag_user_in_between(u32 user, + enum ip_defrag_users lower_bond, + enum ip_defrag_users upper_bond) +{ + return user >= lower_bond && user <= upper_bond; +} + int ip_defrag(struct sk_buff *skb, u32 user); #ifdef CONFIG_INET struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cc1da6d9cb35..83424f1742b8 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -173,6 +173,13 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } +static bool frag_expire_skip_icmp(u32 user) +{ + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END); +} + /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -217,10 +224,8 @@ static void ip_expire(unsigned long arg) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_AF_PACKET || - ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && - (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && - (skb_rtable(head)->rt_type != RTN_LOCAL))) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. */ -- cgit v1.2.3 From 8bc04864ac89616e55fc8a196dd32b7066433ea8 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:36 -0700 Subject: IPv4: skip ICMP for bridge contrack users when defrag expires users in [IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BR_IN] should not ICMP message also. Reported-by: Florian Westphal Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 83424f1742b8..47fa64ee82b1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -177,7 +177,9 @@ static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, - __IP_DEFRAG_CONNTRACK_IN_END); + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* -- cgit v1.2.3 From 49d16b23cd1e61c028ee088c5a64e9ac6a9c6147 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 15 May 2015 14:15:37 -0700 Subject: bridge_netfilter: No ICMP packet on IPv4 fragmentation error When bridge netfilter re-fragments an IP packet for output, all packets that can not be re-fragmented to their original input size should be silently discarded. However, current bridge netfilter output path generates an ICMP packet with 'size exceeded MTU' message for such packets, this is a bug. This patch refactors the ip_fragment() API to allow two separate use cases. The bridge netfilter user case will not send ICMP, the routing output will, as before. Signed-off-by: Andy Zhou Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++-- net/bridge/br_netfilter.c | 21 ++++++++++++++++++++- net/ipv4/ip_output.c | 40 ++++++++++++++++++++++++++++------------ 3 files changed, 50 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 43f6f39df9fc..cd7a6a458bb6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -108,8 +108,8 @@ int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); int ip_output(struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct sock *sk, struct sk_buff *skb); -int ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)); +int ip_do_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)); int ip_do_nat(struct sk_buff *skb); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct sk_buff *skb); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2b0e8bb49944..1d2eb32d8270 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -853,6 +853,25 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) return br_dev_queue_push_xmit(sk, skb); } +static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) +{ + unsigned int mtu = ip_skb_dst_mtu(skb); + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { int ret; @@ -886,7 +905,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - ret = ip_fragment(sk, skb, br_nf_push_frag_xmit); + ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); } else { nf_bridge_info_free(skb); ret = br_dev_queue_push_xmit(sk, skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2acc5dc32807..8d91b922fcfe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,6 +83,9 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)); + /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { @@ -478,6 +481,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int mtu = ip_skb_dst_mtu(skb); + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -485,8 +510,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -507,15 +532,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - kfree_skb(skb); - return -EMSGSIZE; - } /* * Setup starting values. @@ -751,7 +767,7 @@ fail: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } -EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) -- cgit v1.2.3 From aea0929e516a1ff4c4458203d1f3375eee9acc26 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Mon, 18 May 2015 14:35:58 -0400 Subject: tcp: Return error instead of partial read for saved syn headers Currently the getsockopt() requesting the cached contents of the syn packet headers will fail silently if the caller uses a buffer that is too small to contain the requested data. Rather than fail silently and discard the headers, getsockopt() should return an error and report the required size to hold the data. Signed-off-by: Eric B Munson Cc: Eric Dumazet Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c724195e5862..bb9bb844204f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2845,7 +2845,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); if (tp->saved_syn) { - len = min_t(unsigned int, tp->saved_syn[0], len); + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; -- cgit v1.2.3 From 492135557dc090a1abb2cfbe1a412757e3ed68ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2015 21:04:22 +0200 Subject: tcp: add rfc3168, section 6.1.1.1. fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work as a follow-up of commit f7b3bec6f516 ("net: allow setting ecn via routing table") and adds RFC3168 section 6.1.1.1. fallback for outgoing ECN connections. In other words, this work adds a retry with a non-ECN setup SYN packet, as suggested from the RFC on the first timeout: [...] A host that receives no reply to an ECN-setup SYN within the normal SYN retransmission timeout interval MAY resend the SYN and any subsequent SYN retransmissions with CWR and ECE cleared. [...] Schematic client-side view when assuming the server is in tcp_ecn=2 mode, that is, Linux default since 2009 via commit 255cac91c3c9 ("tcp: extend ECN sysctl to allow server-side only ECN"): 1) Normal ECN-capable path: SYN ECE CWR -----> <----- SYN ACK ECE ACK -----> 2) Path with broken middlebox, when client has fallback: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN -----> <----- SYN ACK ACK -----> In case we would not have the fallback implemented, the middlebox drop point would basically end up as: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) In any case, it's rather a smaller percentage of sites where there would occur such additional setup latency: it was found in end of 2014 that ~56% of IPv4 and 65% of IPv6 servers of Alexa 1 million list would negotiate ECN (aka tcp_ecn=2 default), 0.42% of these webservers will fail to connect when trying to negotiate with ECN (tcp_ecn=1) due to timeouts, which the fallback would mitigate with a slight latency trade-off. Recent related paper on this topic: Brian Trammell, Mirja Kühlewind, Damiano Boppart, Iain Learmonth, Gorry Fairhurst, and Richard Scheffenegger: "Enabling Internet-Wide Deployment of Explicit Congestion Notification." Proc. PAM 2015, New York. http://ecn.ethz.ch/ecn-pam15.pdf Thus, when net.ipv4.tcp_ecn=1 is being set, the patch will perform RFC3168, section 6.1.1.1. fallback on timeout. For users explicitly not wanting this which can be in DC use case, we add a net.ipv4.tcp_ecn_fallback knob that allows for disabling the fallback. tp->ecn_flags are not being cleared in tcp_ecn_clear_syn() on output, but rather we let tcp_ecn_rcv_synack() take that over on input path in case a SYN ACK ECE was delayed. Thus a spurious SYN retransmission will not prevent ECN being negotiated eventually in that case. Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Mirja Kühlewind Signed-off-by: Brian Trammell Cc: Eric Dumazet Cc: Dave That Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/dctcp.txt | 1 + Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 ++ net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 5 ++++- net/ipv4/tcp_output.c | 13 +++++++++++++ 7 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt index 0d5dfbc89ec9..13a857753208 100644 --- a/Documentation/networking/dctcp.txt +++ b/Documentation/networking/dctcp.txt @@ -8,6 +8,7 @@ the data center network to provide multi-bit feedback to the end hosts. To enable it on end hosts: sysctl -w net.ipv4.tcp_congestion_control=dctcp + sysctl -w net.ipv4.tcp_ecn_fallback=0 (optional) All switches in the data center network running DCTCP must support ECN marking and be configured for marking when reaching defined switch buffer diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5095c63e50ed..cb083e0d682c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -267,6 +267,15 @@ tcp_ecn - INTEGER but do not request ECN on outgoing connections. Default: 2 +tcp_ecn_fallback - BOOLEAN + If the kernel detects that ECN connection misbehaves, enable fall + back to non-ECN. Currently, this knob implements the fallback + from RFC3168, section 6.1.1.1., but we reserve that in future, + additional detection mechanisms could be implemented under this + knob. The value is not used, if tcp_ecn or per route (or congestion + control) ECN settings are disabled. + Default: 1 (fallback enabled) + tcp_fack - BOOLEAN Enable FACK congestion avoidance and fast retransmission. The value is not used, if tcp_sack is not enabled. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 614a49be68a9..6848b8bb2e63 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -77,6 +77,8 @@ struct netns_ipv4 { struct local_ports ip_local_ports; int sysctl_tcp_ecn; + int sysctl_tcp_ecn_fallback; + int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; int sysctl_ip_nonlocal_bind; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0d85223efa4c..2bb2bad21d5c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -712,6 +712,8 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 +#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c3852a7ff3c7..841de32f1fee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -820,6 +820,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_ecn_fallback", + .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 91cb4768a860..0cc4b5a630cd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2411,12 +2411,15 @@ static int __net_init tcp_sk_init(struct net *net) goto fail; *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } + net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_fallback = 1; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; - return 0; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 08c2cc40b26d..e29d43b5a0bb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) } } +static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) @@ -2615,6 +2624,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } } + /* RFC3168, section 6.1.1.1. ECN fallback */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made -- cgit v1.2.3 From eb9344781a2f8381ed60cd9e662d9ced2d168ecb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2015 13:26:55 -0700 Subject: tcp: add a force_schedule argument to sk_stream_alloc_skb() In commit 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger") we fixed a possible hang of TCP sockets under memory pressure, by allowing sk_stream_alloc_skb() to use sk_forced_mem_schedule() if no packet is in socket write queue. It turns out there are other cases where we want to force memory schedule : tcp_fragment() & tso_fragment() need to split a big TSO packet into two smaller ones. If we block here because of TCP memory pressure, we can effectively block TCP socket from sending new data. If no further ACK is coming, this hang would be definitive, and socket has no chance to effectively reduce its memory usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- net/ipv4/tcp.c | 19 +++++++++++-------- net/ipv4/tcp_output.c | 10 +++++----- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock.h b/include/net/sock.h index 4581a60636f8..26c1c3171e00 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2025,7 +2025,8 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) } } -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule); /** * sk_page_frag - return an appropriate page_frag diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bb9bb844204f..ca1d476c80ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -808,7 +808,8 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; @@ -820,15 +821,15 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (likely(skb)) { - bool mem_schedule; + bool mem_scheduled; - if (skb_queue_len(&sk->sk_write_queue) == 0) { - mem_schedule = true; + if (force_schedule) { + mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { - mem_schedule = sk_wmem_schedule(sk, skb->truesize); + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } - if (likely(mem_schedule)) { + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -918,7 +919,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1154,7 +1156,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e29d43b5a0bb..3aebe0157dfa 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1172,7 +1172,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp); + buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ @@ -1731,7 +1731,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp); + buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; @@ -1950,7 +1950,7 @@ static int tcp_mtu_probe(struct sock *sk) } /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC); + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk->sk_wmem_queued += nskb->truesize; @@ -3190,7 +3190,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; @@ -3256,7 +3256,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; -- cgit v1.2.3 From ce5ec440994b7b2b714633b516b9e0a4f6a98dfe Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 20 May 2015 15:52:53 +0000 Subject: tcp: ensure epoll edge trigger wakeup when write queue is empty We currently rely on the setting of SOCK_NOSPACE in the write() path to ensure that we wake up any epoll edge trigger waiters when acks return to free space in the write queue. However, if we fail to allocate even a single skb in the write queue, we could end up waiting indefinitely. Fix this by explicitly issuing a wakeup when we detect the condition of an empty write queue and a return value of -EAGAIN. This allows userspace to re-try as we expect this to be a temporary failure. I've tested this approach by artificially making sk_stream_alloc_skb() return NULL periodically. In that case, epoll edge trigger waiters will hang indefinitely in epoll_wait() without this patch. Signed-off-by: Jason Baron Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ca1d476c80ef..0a3f9a00565b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -999,6 +999,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1288,6 +1291,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } -- cgit v1.2.3 From f5af1f57a2914e290de40e2c93716da8885c4965 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 May 2015 10:59:01 -0700 Subject: inet_hashinfo: remove bsocket counter We no longer need bsocket atomic counter, as inet_csk_get_port() calls bind_conflict() regardless of its value, after commit 2b05ad33e1e624e ("tcp: bind() fix autoselection to share ports") This patch removes overhead of maintaining this counter and double inet_csk_get_port() calls under pressure. Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Flavio Leitner Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 2 -- net/ipv4/inet_connection_sock.c | 5 ----- net/ipv4/inet_hashtables.c | 7 ------- 3 files changed, 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 73fe0f9525d9..774d24151d4a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -148,8 +148,6 @@ struct inet_hashinfo { */ struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] ____cacheline_aligned_in_smp; - - atomic_t bsockets; }; static inline struct inet_ehash_bucket *inet_ehash_bucket( diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8976ca423a07..b95fb263a13f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -127,11 +127,6 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = smallest_rover; - goto tb_found; - } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c6fb80bd5826..3766bddb3e8a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -90,10 +90,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; @@ -111,8 +107,6 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; - atomic_dec(&hashinfo->bsockets); - spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -608,7 +602,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; - atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, -- cgit v1.2.3 From 946f9eb226c296da0d6c630bdad282ca11d77f60 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 May 2015 10:59:02 -0700 Subject: tcp: improve REUSEADDR/NOREUSEADDR cohabitation inet_csk_get_port() randomization effort tends to spread sockets on all the available range (ip_local_port_range) This is unfortunate because SO_REUSEADDR sockets have less requirements than non SO_REUSEADDR ones. If an application uses SO_REUSEADDR hint, it is to try to allow source ports being shared. So instead of picking a random port number in ip_local_port_range, lets try first in first half of the range. This gives more chances to use upper half of the range for the sockets with strong requirements (not using SO_REUSEADDR) Note this patch does not add a new sysctl, and only changes the way we try to pick port number. Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Flavio Leitner Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b95fb263a13f..60021d0d9326 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; kuid_t uid = sock_i_uid(sk); + int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; local_bh_disable(); if (!snum) { @@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); + if (attempt_half) { + int half = low + ((high - low) >> 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } remaining = (high - low) + 1; smallest_rover = rover = prandom_u32() % remaining + low; @@ -154,6 +163,11 @@ again: snum = smallest_rover; goto have_snum; } + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto again; + } goto fail; } /* OK, here is the one we will use. HEAD is -- cgit v1.2.3 From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 20 May 2015 16:35:41 -0700 Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info This patch tracks the total number of inbound and outbound segments on a TCP socket. One may use this number to have an idea on connection quality when compared against the retransmissions. RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut These are a 32bit field each and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->segs_out was placed near tp->snd_nxt for good data locality and minimal performance impact, while tp->segs_in was placed near tp->bytes_received for the same reason. Join work with Eric Dumazet. Note that received SYN are accounted on the listener, but sent SYNACK are not accounted. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 ++++++- include/uapi/linux/tcp.h | 4 +++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 7 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e6fb5df22db1..f0212026c77f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -149,11 +149,16 @@ struct tcp_sock { * sum(delta(rcv_nxt)), or how many bytes * were acked. */ + u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ - + u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 51ebedba577f..65a77b071e22 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -192,8 +192,10 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; - __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0a3f9a00565b..7f3e721b9e69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2695,6 +2695,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) spin_lock_bh(&sk->sk_lock.slock); info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0cc4b5a630cd..feb875769b8d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1626,6 +1626,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ebe2ab2596ed..b62d15c86946 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -448,6 +448,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 0; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3aebe0157dfa..534e5fdb04c1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1027,6 +1027,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + tp->segs_out += tcp_skb_pcount(skb); /* OK, its time to fill skb_shinfo(skb)->gso_segs */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b6575d665568..beac6bf840b9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1421,6 +1421,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) -- cgit v1.2.3 From be12a1fe298e8be04d5215364f94654dff81b0bc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 16:59:58 +0200 Subject: net: skbuff: add skb_append_pagefrags and use it Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 18 ++++++++++++++++++ net/ipv4/ip_output.c | 15 ++++----------- 3 files changed, 25 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b617095adb88..f708936cdd23 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -861,6 +861,9 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int len, int odd, struct sk_buff *skb), void *from, int length); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size); + struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f3fe9bd9e672..4f2babeaf18d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2915,6 +2915,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d91b922fcfe..451b009dae75 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,11 +1233,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, } while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. */ len = mtu - skb->len; @@ -1289,15 +1287,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } -- cgit v1.2.3 From a60e3cc7c92973a31fad0fd04dc5cf4355d3d1ef Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:00 +0200 Subject: net: make skb_splice_bits more configureable Prepare skb_splice_bits to be able to deal with AF_UNIX sockets. AF_UNIX sockets don't use lock_sock/release_sock and thus we have to use a callback to make the locking and unlocking configureable. Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- net/core/skbuff.c | 45 ++++++++++++++++++++++++++++----------------- net/ipv4/tcp.c | 5 +++-- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f708936cdd23..6b41c15efa27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -35,6 +35,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. * @@ -2699,9 +2700,15 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, - unsigned int flags); + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4f2babeaf18d..02769fa4f5c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1870,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1891,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1914,20 +1937,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90afcec3f427..65f791f74845 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -695,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; -- cgit v1.2.3 From cf826244322443215c156c0751fa2c982969a9c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 00:06:40 +0200 Subject: ip: reject too-big defragmented DF-skb when forwarding Send icmp pmtu error if we find that the largest fragment of df-skb exceeded the output path mtu. The ip output path will still catch this later on but we can avoid the forward/postrouting hook traversal by rejecting right away. This is what ipv6 already does. Acked-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3674484946a5..2d3aa408fbdc 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,17 +39,21 @@ #include #include -static bool ip_may_fragment(const struct sk_buff *skb) -{ - return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->ignore_df; -} - static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; + if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) + return false; + + /* original fragment exceeds mtu and DF is set */ + if (unlikely(IPCB(skb)->frag_max_size > mtu)) + return true; + + if (skb->ignore_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -114,7 +118,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { + if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -- cgit v1.2.3 From 2f06550b3b0e26f54045337e34ec2a1b666bb6c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 May 2015 01:00:41 +0200 Subject: netfilter: remove unused comefrom hookmask argument Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ip_tables.c | 4 +--- net/ipv6/netfilter/ip6_tables.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c69db7fa25ee..583779fc3425 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1441,7 +1441,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1510,8 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ip, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ip, &off); if (ret != 0) goto release_matches; ++j; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1a732a1d3c8e..d54f04970ee0 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1456,7 +1456,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1525,8 +1524,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ipv6, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; -- cgit v1.2.3 From 095dc8e0c3686d586a01a50abc3e1bb9ac633054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2015 07:55:34 -0700 Subject: tcp: fix/cleanup inet_ehash_locks_alloc() If tcp ehash table is constrained to a very small number of buckets (eg boot parameter thash_entries=128), then we can crash if spinlock array has more entries. While we are at it, un-inline inet_ehash_locks_alloc() and make following changes : - Budget 2 cache lines per cpu worth of 'spinlocks' - Try to kmalloc() the array to avoid extra TLB pressure. (Most servers at Google allocate 8192 bytes for this hash table) - Get rid of various #ifdef Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 47 +++---------------------------------------- net/ipv4/inet_hashtables.c | 31 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 44 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 774d24151d4a..b73c88a19dd4 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -164,52 +163,12 @@ static inline spinlock_t *inet_ehash_lockp( return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } -static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) -{ - unsigned int i, size = 256; -#if defined(CONFIG_PROVE_LOCKING) - unsigned int nr_pcpus = 2; -#else - unsigned int nr_pcpus = num_possible_cpus(); -#endif - if (nr_pcpus >= 4) - size = 512; - if (nr_pcpus >= 8) - size = 1024; - if (nr_pcpus >= 16) - size = 2048; - if (nr_pcpus >= 32) - size = 4096; - if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - hashinfo->ehash_locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - hashinfo->ehash_locks = kmalloc(size * sizeof(spinlock_t), - GFP_KERNEL); - if (!hashinfo->ehash_locks) - return ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); - } - hashinfo->ehash_locks_mask = size - 1; - return 0; -} +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { - if (hashinfo->ehash_locks) { -#ifdef CONFIG_NUMA - unsigned int size = (hashinfo->ehash_locks_mask + 1) * - sizeof(spinlock_t); - if (size > PAGE_SIZE) - vfree(hashinfo->ehash_locks); - else -#endif - kfree(hashinfo->ehash_locks); - hashinfo->ehash_locks = NULL; - } + kvfree(hashinfo->ehash_locks); + hashinfo->ehash_locks = NULL; } struct inet_bind_bucket * diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3766bddb3e8a..185efef0f125 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -609,3 +610,33 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); + +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, nblocks = 1; + + if (sizeof(spinlock_t) != 0) { + /* allocate 2 cache lines or at least one spinlock per cpu */ + nblocks = max_t(unsigned int, + 2 * L1_CACHE_BYTES / sizeof(spinlock_t), + 1); + nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + + /* no more locks than number of hash buckets */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + hashinfo->ehash_locks = kmalloc_array(nblocks, sizeof(spinlock_t), + GFP_KERNEL | __GFP_NOWARN); + if (!hashinfo->ehash_locks) + hashinfo->ehash_locks = vmalloc(nblocks * sizeof(spinlock_t)); + + if (!hashinfo->ehash_locks) + return -ENOMEM; + + for (i = 0; i < nblocks; i++) + spin_lock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = nblocks - 1; + return 0; +} +EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); -- cgit v1.2.3 From d6a4e26afb80c049e7f94e1b7b506dcda61eee88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2015 08:55:28 -0700 Subject: tcp: tcp_tso_autosize() minimum is one packet By making sure sk->sk_gso_max_segs minimal value is one, and sysctl_tcp_min_tso_segs minimal value is one as well, tcp_tso_autosize() will return a non zero value. We can then revert 843925f33fcc293d80acf2c5c8a78adf3344d49b ("tcp: Do not apply TSO segment limit to non-TSO packets") and save few cpu cycles in fast path. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Herbert Xu Acked-by: Neal Cardwell Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/core/sock.c b/net/core/sock.c index 29124fcdc42a..e72633c346b1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1581,6 +1581,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + u32 max_segs = 1; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1592,9 +1594,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 841de32f1fee..e64892769607 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -702,7 +702,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 534e5fdb04c1..190538a2a88c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2088,7 +2088,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) @@ -2101,7 +2101,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, -- cgit v1.2.3 From ffa915d071ce4a05dcd866409df26513d25786f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 May 2015 00:19:03 -0400 Subject: ipv4: Fix fib_trie.c build, missing linux/vmalloc.h include. We used to get this indirectly I supposed, but no longer do. Either way, an explicit include should have been done in the first place. net/ipv4/fib_trie.c: In function '__node_free_rcu': >> net/ipv4/fib_trie.c:293:3: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration] vfree(n); ^ net/ipv4/fib_trie.c: In function 'tnode_alloc': >> net/ipv4/fib_trie.c:312:3: error: implicit declaration of function 'vzalloc' [-Werror=implicit-function-declaration] return vzalloc(size); ^ >> net/ipv4/fib_trie.c:312:3: warning: return makes pointer from integer without a cast cc1: some warnings being treated as errors Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5a5d9bdeaeb4..01bce1506cd7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From c5501eb3406d0f88b3efb2c437c4c40b35f865d8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 16:32:50 +0200 Subject: net: ipv4: avoid repeated calls to ip_skb_dst_mtu helper ip_skb_dst_mtu is small inline helper, but its called in several places. before: 17061 44 0 17105 42d1 net/ipv4/ip_output.o after: 16805 44 0 16849 41d1 net/ipv4/ip_output.o Signed-off-by: Florian Westphal Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451b009dae75..d6dd8ba04441 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, int (*output)(struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ @@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, + unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + skb_gso_network_seglen(skb) <= mtu) return ip_finish_output2(sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. @@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = ip_fragment(sk, segs, ip_finish_output2); + err = ip_fragment(sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) static int ip_finish_output(struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) return dst_output_sk(sk, skb); } #endif + mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb); + return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > ip_skb_dst_mtu(skb)) - return ip_fragment(sk, skb, ip_finish_output2); + if (skb->len > mtu) + return ip_fragment(sk, skb, mtu, ip_finish_output2); return ip_finish_output2(sk, skb); } @@ -482,10 +487,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) } static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); - unsigned int mtu = ip_skb_dst_mtu(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && -- cgit v1.2.3 From d6b915e29f4adea94bc02ba7675bb4f84e6a1abd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 May 2015 16:32:51 +0200 Subject: ip_fragment: don't forward defragmented DF packet We currently always send fragments without DF bit set. Thus, given following setup: mtu1500 - mtu1500:1400 - mtu1400:1280 - mtu1280 A R1 R2 B Where R1 and R2 run linux with netfilter defragmentation/conntrack enabled, then if Host A sent a fragmented packet _with_ DF set to B, R1 will respond with icmp too big error if one of these fragments exceeded 1400 bytes. However, if R1 receives fragment sizes 1200 and 100, it would forward the reassembled packet without refragmenting, i.e. R2 will send an icmp error in response to a packet that was never sent, citing mtu that the original sender never exceeded. The other minor issue is that a refragmentation on R1 will conceal the MTU of R2-B since refragmentation does not set DF bit on the fragments. This modifies ip_fragment so that we track largest fragment size seen both for DF and non-DF packets, and set frag_max_size to the largest value. If the DF fragment size is larger or equal to the non-df one, we will consider the packet a path mtu probe: We set DF bit on the reassembled skb and also tag it with a new IPCB flag to force refragmentation even if skb fits outdev mtu. We will also set DF bit on each fragment in this case. Joint work with Hannes Frederic Sowa. Reported-by: Jesse Gross Signed-off-by: Florian Westphal Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- include/net/ip.h | 1 + net/ipv4/ip_fragment.c | 31 ++++++++++++++++++++++++++----- net/ipv4/ip_output.c | 12 ++++++++++-- 4 files changed, 38 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 8d1765577acc..e1300b3dd597 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -43,7 +43,7 @@ enum { * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags - * @max_size: (ipv4 only) maximum received fragment size with IP_DF set + * @max_size: maximum received fragment size * @net: namespace that this frag belongs to */ struct inet_frag_queue { diff --git a/include/net/ip.h b/include/net/ip.h index 7921a36b805c..9b976cf99122 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -45,6 +45,7 @@ struct inet_skb_parm { #define IPSKB_FRAG_COMPLETE BIT(3) #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) +#define IPSKB_FRAG_PMTU BIT(6) u16 frag_max_size; }; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 47fa64ee82b1..a50dc6d408d1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -75,6 +75,7 @@ struct ipq { __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; @@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; @@ -481,9 +483,14 @@ found: if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { @@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d6dd8ba04441..f5f5ef1cebd5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -278,7 +278,7 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) if (skb_is_gso(skb)) return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > mtu) + if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) return ip_fragment(sk, skb, mtu, ip_finish_output2); return ip_finish_output2(sk, skb); @@ -492,7 +492,10 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + if ((iph->frag_off & htons(IP_DF)) == 0) + return ip_do_fragment(sk, skb, output); + + if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { struct rtable *rt = skb_rtable(skb); @@ -537,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); + if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) + mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. @@ -732,6 +737,9 @@ slow_path: iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); + if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) + iph->frag_off |= htons(IP_DF); + /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE -- cgit v1.2.3 From 07f4c90062f8fc7c8c26f8f95324cbe8fa3145a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 May 2015 14:49:35 -0700 Subject: tcp/dccp: try to not exhaust ip_local_port_range in connect() A long standing problem on busy servers is the tiny available TCP port range (/proc/sys/net/ipv4/ip_local_port_range) and the default sequential allocation of source ports in connect() system call. If a host is having a lot of active TCP sessions, chances are very high that all ports are in use by at least one flow, and subsequent bind(0) attempts fail, or have to scan a big portion of space to find a slot. In this patch, I changed the starting point in __inet_hash_connect() so that we try to favor even [1] ports, leaving odd ports for bind() users. We still perform a sequential search, so there is no guarantee, but if connect() targets are very different, end result is we leave more ports available to bind(), and we spread them all over the range, lowering time for both connect() and bind() to find a slot. This strategy only works well if /proc/sys/net/ipv4/ip_local_port_range is even, ie if start/end values have different parity. Therefore, default /proc/sys/net/ipv4/ip_local_port_range was changed to 32768 - 60999 (instead of 32768 - 61000) There is no change on security aspects here, only some poor hashing schemes could be eventually impacted by this change. [1] : The odd/even property depends on ip_local_port_range values parity Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 +++++--- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_hashtables.c | 10 ++++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index cb083e0d682c..5fae7704daab 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -751,8 +751,10 @@ IP Variables: ip_local_port_range - 2 INTEGERS Defines the local port range that is used by TCP and UDP to choose the local port. The first number is the first, the - second the last local port number. The default values are - 32768 and 61000 respectively. + second the last local port number. + If possible, it is better these numbers have different parity. + (one even and one odd values) + The default values are 32768 and 60999 respectively. ip_local_reserved_ports - list of comma separated ranges Specify the ports which are reserved for known third-party @@ -775,7 +777,7 @@ ip_local_reserved_ports - list of comma separated ranges ip_local_port_range, e.g.: $ cat /proc/sys/net/ipv4/ip_local_port_range - 32000 61000 + 32000 60999 $ cat /proc/sys/net/ipv4/ip_local_reserved_ports 8080,9148 diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 235d36afece3..6ad0f7a711c9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1595,7 +1595,7 @@ static __net_init int inet_init_net(struct net *net) */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 61000; + net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 185efef0f125..1b336dc179f8 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -502,8 +502,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; + /* By starting with offset being an even number, + * we tend to leave about 50% of ports for other uses, + * like bind(0). + */ + offset &= ~1; + local_bh_disable(); - for (i = 1; i <= remaining; i++) { + for (i = 0; i < remaining; i++) { port = low + (i + offset) % remaining; if (inet_is_local_reserved_port(net, port)) continue; @@ -547,7 +553,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return -EADDRNOTAVAIL; ok: - hint += i; + hint += (i + 2) & ~1; /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); -- cgit v1.2.3 From e2baad9e4b153c67dddc5ccf987395b842329c84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2015 10:46:02 -0700 Subject: tcp: connect() from bound sockets can be faster __inet_hash_connect() does not use its third argument (port_offset) if socket was already bound to a source port. No need to perform useless but expensive md5 computations. Reported-by: Crestez Dan Leonard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 9 +++++++-- net/ipv6/inet6_hashtables.c | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b336dc179f8..5f9b063bbe8a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -394,9 +394,10 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet_sk_port_offset(const struct sock *sk) +static u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); @@ -600,7 +601,11 @@ out: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 871641bc1ed4..b4fd96de97e6 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -257,7 +257,7 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet6_sk_port_offset(const struct sock *sk) +static u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -269,7 +269,11 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet6_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); -- cgit v1.2.3 From ed2dfd900992aa7b6b3d0abd8ec9a7e9d2c7f827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2015 11:34:37 -0700 Subject: tcp/dccp: warn user for preferred ip_local_port_range After commit 07f4c90062f8f ("tcp/dccp: try to not exhaust ip_local_port_range in connect()") it is advised to have an even number of ports described in /proc/sys/net/ipv4/ip_local_port_range This means start/end values should have a different parity. Let's warn sysadmins of this, so that they can update their settings if they want to. Suggested-by: David S. Miller Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6848b8bb2e63..c68926b4899c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -19,6 +19,7 @@ struct sock; struct local_ports { seqlock_t lock; int range[2]; + bool warned; }; struct ping_group_range { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e64892769607..0330ab2e2b63 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,7 +45,13 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { + bool same_parity = !((range[0] ^ range[1]) & 1); + write_seqlock(&net->ipv4.ip_local_ports.lock); + if (same_parity && !net->ipv4.ip_local_ports.warned) { + net->ipv4.ip_local_ports.warned = true; + pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); + } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; write_sequnlock(&net->ipv4.ip_local_ports.lock); -- cgit v1.2.3 From 8133534c760d4083f79d2cde42c636ccc0b2792e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Wed, 27 May 2015 22:16:49 +0300 Subject: net: limit tcp/udp rmem/wmem to SOCK_{RCV,SND}BUF_MIN This is similar to b1cb59cf2efe(net: sysctl_net_core: check SNDBUF and RCVBUF for min length). I don't think too small values can cause crashes in the case of udp and tcp, but I've seen this set to too small values which triggered awful performance. It also makes the setting consistent across all the wmem/rmem sysctls. Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0330ab2e2b63..433231ccfb17 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -41,6 +41,8 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) @@ -528,7 +530,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "tcp_notsent_lowat", @@ -543,7 +545,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "tcp_app_win", @@ -756,7 +758,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_rcvbuf, }, { .procname = "udp_wmem_min", @@ -764,7 +766,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_sndbuf, }, { } }; -- cgit v1.2.3 From 12e25e1041d044d4204f2b7c54695e14e8ffb282 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Jun 2015 23:49:21 -0700 Subject: tcp: remove redundant checks tcp_v4_rcv() checks the following before calling tcp_v4_do_rcv(): if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; So following check in tcp_v4_do_rcv() is redundant and "goto csum_err;" is wrong anyway. if (skb->len < tcp_hdrlen(skb) || ...) goto csum_err; A second check can be removed after no_tcp_socket label for same reason. Same tests can be removed in tcp_v6_do_rcv() Note : short tcp frames are not properly accounted in tcpInErrs MIB, because pskb_may_pull() failure simply drops incoming skb, we might fix this in a separate patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index feb875769b8d..3130c42240c8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1400,7 +1400,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1647,7 +1647,7 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7be3d858cbf0..b72e1ffc6bb4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1250,7 +1250,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1442,7 +1442,7 @@ no_tcp_socket: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: -- cgit v1.2.3 From c39c4c6abb89d24454b63798ccbae12b538206a5 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Jun 2015 11:10:42 +0100 Subject: tcp: double default TSQ output bytes limit Xen virtual network driver has higher latency than a physical NIC. Having only 128K as limit for TSQ introduced 30% regression in guest throughput. This patch raises the limit to 256K. This reduces the regression to 8%. This buys us more time to work out a proper solution in the long run. Signed-off-by: Wei Liu Cc: David Miller Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 190538a2a88c..eeb59befaf06 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; -/* Default TSQ limit of two TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +/* Default TSQ limit of four TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 262144; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames -- cgit v1.2.3 From 90c337da1524863838658078ec34241f45d8394d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Jun 2015 21:17:57 -0700 Subject: inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations When an application needs to force a source IP on an active TCP socket it has to use bind(IP, port=x). As most applications do not want to deal with already used ports, x is often set to 0, meaning the kernel is in charge to find an available port. But kernel does not know yet if this socket is going to be a listener or be connected. It has very limited choices (no full knowledge of final 4-tuple for a connect()) With limited ephemeral port range (about 32K ports), it is very easy to fill the space. This patch adds a new SOL_IP socket option, asking kernel to ignore the 0 port provided by application in bind(IP, port=0) and only remember the given IP address. The port will be automatically chosen at connect() time, in a way that allows sharing a source port as long as the 4-tuples are unique. This new feature is available for both IPv4 and IPv6 (Thanks Neal) Tested: Wrote a test program and checked its behavior on IPv4 and IPv6. strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by connect(). Also getsockname() show that the port is still 0 right after bind() but properly allocated after connect(). socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5 setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 IPv6 test : socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7 setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 I was able to bind()/connect() a million concurrent IPv4 sockets, instead of ~32000 before patch. lpaa23:~# ulimit -n 1000010 lpaa23:~# ./bind --connect --num-flows=1000000 & 1000000 sockets lpaa23:~# grep TCP /proc/net/sockstat TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66 Check that a given source port is indeed used by many different connections : lpaa23:~# ss -t src :40000 | head -10 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983 ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983 ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983 ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983 ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983 ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983 ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983 ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983 ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + include/uapi/linux/in.h | 1 + net/ipv4/af_inet.c | 3 ++- net/ipv4/ip_sockglue.c | 7 +++++++ net/ipv6/af_inet6.c | 3 ++- 5 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index b6c3737da4e9..47eb67b08abd 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -187,6 +187,7 @@ struct inet_sock { transparent:1, mc_all:1, nodefrag:1; + __u8 bind_address_no_port:1; __u8 rcv_tos; __u8 convert_csum; int uc_index; diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 641338bef651..83d6236a2f08 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -112,6 +112,7 @@ struct in_addr { #define IP_MINTTL 21 #define IP_NODEFRAG 22 #define IP_CHECKSUM 23 +#define IP_BIND_ADDRESS_NO_PORT 24 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6ad0f7a711c9..cc858ef44451 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7cfb0893f263..04ae2992a5cd 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: + case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: @@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->nodefrag = val ? 1 : 0; break; + case IP_BIND_ADDRESS_NO_PORT: + inet->bind_address_no_port = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet->nodefrag; break; + case IP_BIND_ADDRESS_NO_PORT: + val = inet->bind_address_no_port; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f3866c0b6cfe..7de52b65173f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet_reset_saddr(sk); err = -EADDRINUSE; goto out; -- cgit v1.2.3 From 98da81a426a76c351f3c2854d5bc31f17bba3194 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Jun 2015 08:01:12 -0700 Subject: tcp: remove redundant checks II For same reasons than in commit 12e25e1041d0 ("tcp: remove redundant checks"), we can remove redundant checks done for timewait sockets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ---- net/ipv6/tcp_ipv6.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3130c42240c8..d7d4c2b79cf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,10 +1671,6 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b72e1ffc6bb4..45a7176ed460 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1467,10 +1467,6 @@ do_time_wait: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; -- cgit v1.2.3 From b80c0e78582d4a3a004dc1ade4eb06babc6a4eea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Jun 2015 18:30:43 -0700 Subject: tcp: get_cookie_sock() consolidation IPv4 and IPv6 share same implementation of get_cookie_sock(), and there is no point inlining it. We add tcp_ prefix to the common helper name and export it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ net/ipv4/syncookies.c | 10 +++++----- net/ipv6/syncookies.c | 19 +------------------ 3 files changed, 9 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2bb2bad21d5c..978cebedd3fc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -469,6 +469,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index df849e5a10f1..d70b1f603692 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,9 +219,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -235,7 +235,7 @@ static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, } return child; } - +EXPORT_SYMBOL(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -391,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 21bc2eb53c57..0909f4e0d53c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,23 +41,6 @@ static __u16 const msstab[] = { 9000 - 60, }; -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *child; - - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) { - atomic_set(&req->rsk_refcnt, 1); - inet_csk_reqsk_queue_add(sk, req, child); - } else { - reqsk_free(req); - } - return child; -} - static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); @@ -264,7 +247,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - ret = get_cookie_sock(sk, skb, req, dst); + ret = tcp_get_cookie_sock(sk, skb, req, dst); out: return ret; out_free: -- cgit v1.2.3 From f38b24c90528c888915ef6e3bc320bdb30b14cf2 Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Mon, 8 Jun 2015 11:54:51 +0800 Subject: fib_trie: coding style: Use pointer after check As Alexander Duyck pointed out that: struct tnode { ... struct key_vector kv[1]; } The kv[1] member of struct tnode is an arry that refernced by a null pointer will not crash the system, like this: struct tnode *p = NULL; struct key_vector *kv = p->kv; As such p->kv doesn't actually dereference anything, it is simply a means for getting the offset to the array from the pointer p. This patch make the code more regular to avoid making people feel odd when they look at the code. Signed-off-by: Firo Yang Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 01bce1506cd7..3c699c4e90a4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -325,13 +325,15 @@ static inline void empty_child_dec(struct key_vector *n) static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - struct key_vector *l = kv->kv; + struct key_vector *l; + struct tnode *kv; + kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ + l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; @@ -346,24 +348,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) static struct key_vector *tnode_new(t_key key, int pos, int bits) { - struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; - struct key_vector *tn = tnode->kv; + struct key_vector *tn; + struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), - sizeof(struct key_vector *) << bits); - + tnode = tnode_alloc(bits); if (!tnode) return NULL; + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; + tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; @@ -2054,11 +2058,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct key_vector *n, *pn = t->kv; + struct key_vector *n, *pn; if (!t) return NULL; + pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; -- cgit v1.2.3 From f9c2ff22bb2df7b8f153afd2a4bea07176bad144 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jun 2015 22:11:17 -0700 Subject: net: tcp: dctcp_update_alpha() fixes. dctcp_alpha can be read by from dctcp_get_info() without synchro, so use WRITE_ONCE() to prevent compiler from using dctcp_alpha as a temporary variable. Also, playing with small dctcp_shift_g (like 1), can expose an overflow with 32bit values shifted 9 times before divide. Use an u64 field to avoid this problem, and perform the divide only if acked_bytes_ecn is not zero. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4c41c1287197..7092a61c4dc8 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - /* For avoiding denominator == 1. */ - if (ca->acked_bytes_total == 0) - ca->acked_bytes_total = 1; + u64 bytes_ecn = ca->acked_bytes_ecn; + u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ - ca->dctcp_alpha = ca->dctcp_alpha - - (ca->dctcp_alpha >> dctcp_shift_g) + - (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / - ca->acked_bytes_total; - if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) - /* Clamp dctcp_alpha to max. */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; + alpha -= alpha >> dctcp_shift_g; + if (bytes_ecn) { + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 Mbytes. + */ + bytes_ecn <<= (10 - dctcp_shift_g); + do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + } + /* dctcp_alpha can be read from dctcp_get_info() without + * synchro, so we ask compiler to not use dctcp_alpha + * as a temporary variable in prior operations. + */ + WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } -- cgit v1.2.3 From 7782ad8bb55c18f634a6713ec38c8f1ae86ad0b5 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Wed, 10 Jun 2015 19:08:16 +0200 Subject: tcp: export tcp_enter_cwr() Upcoming tcp_cdg uses tcp_enter_cwr() to initiate PRR. Export this function so that CDG can be compiled as a module. Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Stephen Hemminger Cc: Neal Cardwell Cc: David Hayes Cc: Andreas Petlund Cc: Dave Taht Cc: Nicolas Kuhn Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 15c4536188a4..d4f76ab6e136 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2552,6 +2552,7 @@ void tcp_enter_cwr(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_CWR); } } +EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { -- cgit v1.2.3 From 2b0a8c9eee81882fc0001ccf6d9af62cdc682f9e Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Wed, 10 Jun 2015 19:08:17 +0200 Subject: tcp: add CDG congestion control CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies the TCP sender in order to [1]: o Use the delay gradient as a congestion signal. o Back off with an average probability that is independent of the RTT. o Coexist with flows that use loss-based congestion control, i.e., flows that are unresponsive to the delay signal. o Tolerate packet loss unrelated to congestion. (Disabled by default.) Its FreeBSD implementation was presented for the ICCRG in July 2012; slides are available at http://www.ietf.org/proceedings/84/iccrg.html Running the experiment scenarios in [1] suggests that our implementation achieves more goodput compared with FreeBSD 10.0 senders, although it also causes more queueing delay for a given backoff factor. The loss tolerance heuristic is disabled by default due to safety concerns for its use in the Internet [2, p. 45-46]. We use a variant of the Hybrid Slow start algorithm in tcp_cubic to reduce the probability of slow start overshoot. [1] D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011, pages 328-341. Springer, 2011. [2] K.K. Jonassen. "Implementing CAIA Delay-Gradient in Linux." MSc thesis. Department of Informatics, University of Oslo, 2015. Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Stephen Hemminger Cc: Neal Cardwell Cc: David Hayes Cc: Andreas Petlund Cc: Dave Taht Cc: Nicolas Kuhn Signed-off-by: Kenneth Klette Jonassen Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 20 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_cdg.c | 433 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 454 insertions(+) create mode 100644 net/ipv4/tcp_cdg.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index d83071dccd74..6fb3c90ad726 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -615,6 +615,22 @@ config TCP_CONG_DCTCP For further details see: http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf +config TCP_CONG_CDG + tristate "CAIA Delay-Gradient (CDG)" + default n + ---help--- + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: + + o Use the delay gradient as a congestion signal. + o Back off with an average probability that is independent of the RTT. + o Coexist with flows that use loss-based congestion control. + o Tolerate packet loss unrelated to congestion. + + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -646,6 +662,9 @@ choice config DEFAULT_DCTCP bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_CDG + bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_RENO bool "Reno" endchoice @@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index b36236dd6014..efc43f300b8c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c new file mode 100644 index 000000000000..a52ce2db53f2 --- /dev/null +++ b/net/ipv4/tcp_cdg.c @@ -0,0 +1,433 @@ +/* + * CAIA Delay-Gradient (CDG) congestion control + * + * This implementation is based on the paper: + * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. + * + * Scavenger traffic (Less-than-Best-Effort) should disable coexistence + * heuristics using parameters use_shadow=0 and use_ineff=0. + * + * Parameters window, backoff_beta, and backoff_factor are crucial for + * throughput and delay. Future work is needed to determine better defaults, + * and to provide guidelines for use in different environments/contexts. + * + * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. + * Parameter window is only configurable when loading tcp_cdg as a module. + * + * Notable differences from paper/FreeBSD: + * o Using Hybrid Slow start and Proportional Rate Reduction. + * o Add toggle for shadow window mechanism. Suggested by David Hayes. + * o Add toggle for non-congestion loss tolerance. + * o Scaling parameter G is changed to a backoff factor; + * conversion is given by: backoff_factor = 1000/(G * window). + * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. + * o More accurate e^-x. + */ +#include +#include +#include +#include + +#define HYSTART_ACK_TRAIN 1 +#define HYSTART_DELAY 2 + +static int window __read_mostly = 8; +static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ +static unsigned int backoff_factor __read_mostly = 42; +static unsigned int hystart_detect __read_mostly = 3; +static unsigned int use_ineff __read_mostly = 5; +static bool use_shadow __read_mostly = true; +static bool use_tolerance __read_mostly; + +module_param(window, int, 0444); +MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); +module_param(backoff_beta, uint, 0644); +MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); +module_param(backoff_factor, uint, 0644); +MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); +module_param(hystart_detect, uint, 0644); +MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " + "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); +module_param(use_ineff, uint, 0644); +MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); +module_param(use_shadow, bool, 0644); +MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); +module_param(use_tolerance, bool, 0644); +MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); + +struct minmax { + union { + struct { + s32 min; + s32 max; + }; + u64 v64; + }; +}; + +enum cdg_state { + CDG_UNKNOWN = 0, + CDG_NONFULL = 1, + CDG_FULL = 2, + CDG_BACKOFF = 3, +}; + +struct cdg { + struct minmax rtt; + struct minmax rtt_prev; + struct minmax *gradients; + struct minmax gsum; + bool gfilled; + u8 tail; + u8 state; + u8 delack; + u32 rtt_seq; + u32 undo_cwnd; + u32 shadow_wnd; + u16 backoff_cnt; + u16 sample_cnt; + s32 delay_min; + u32 last_ack; + u32 round_start; +}; + +/** + * nexp_u32 - negative base-e exponential + * @ux: x in units of micro + * + * Returns exp(ux * -1e-6) * U32_MAX. + */ +static u32 __pure nexp_u32(u32 ux) +{ + static const u16 v[] = { + /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ + 65535, + 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, + 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, + }; + u32 msb = ux >> 8; + u32 res; + int i; + + /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ + if (msb > U16_MAX) + return 0; + + /* Scale first eight bits linearly: */ + res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); + + /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ + for (i = 1; msb; i++, msb >>= 1) { + u32 y = v[i & -(msb & 1)] + U32_C(1); + + res = ((u64)res * y) >> 16; + } + + return res; +} + +/* Based on the HyStart algorithm (by Ha et al.) that is implemented in + * tcp_cubic. Differences/experimental changes: + * o Using Hayes' delayed ACK filter. + * o Using a usec clock for the ACK train. + * o Reset ACK train when application limited. + * o Invoked at any cwnd (i.e. also when cwnd < 16). + * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). + */ +static void tcp_cdg_hystart_update(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); + if (ca->delay_min == 0) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + u32 now_us = local_clock() / NSEC_PER_USEC; + + if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { + ca->last_ack = now_us; + ca->round_start = now_us; + } else if (before(now_us, ca->last_ack + 3000)) { + u32 base_owd = max(ca->delay_min / 2U, 125U); + + ca->last_ack = now_us; + if (after(now_us, ca->round_start + base_owd)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + return; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + if (ca->sample_cnt < 8) { + ca->sample_cnt++; + } else { + s32 thresh = max(ca->delay_min + ca->delay_min / 8U, + 125U); + + if (ca->rtt.min > thresh) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +static s32 tcp_cdg_grad(struct cdg *ca) +{ + s32 gmin = ca->rtt.min - ca->rtt_prev.min; + s32 gmax = ca->rtt.max - ca->rtt_prev.max; + s32 grad; + + if (ca->gradients) { + ca->gsum.min += gmin - ca->gradients[ca->tail].min; + ca->gsum.max += gmax - ca->gradients[ca->tail].max; + ca->gradients[ca->tail].min = gmin; + ca->gradients[ca->tail].max = gmax; + ca->tail = (ca->tail + 1) & (window - 1); + gmin = ca->gsum.min; + gmax = ca->gsum.max; + } + + /* We keep sums to ignore gradients during cwnd reductions; + * the paper's smoothed gradients otherwise simplify to: + * (rtt_latest - rtt_oldest) / window. + * + * We also drop division by window here. + */ + grad = gmin > 0 ? gmin : gmax; + + /* Extrapolate missing values in gradient window: */ + if (!ca->gfilled) { + if (!ca->gradients && window > 1) + grad *= window; /* Memory allocation failed. */ + else if (ca->tail == 0) + ca->gfilled = true; + else + grad = (grad * window) / (int)ca->tail; + } + + /* Backoff was effectual: */ + if (gmin <= -32 || gmax <= -32) + ca->backoff_cnt = 0; + + if (use_tolerance) { + /* Reduce small variations to zero: */ + gmin = DIV_ROUND_CLOSEST(gmin, 64); + gmax = DIV_ROUND_CLOSEST(gmax, 64); + + if (gmin > 0 && gmax <= 0) + ca->state = CDG_FULL; + else if ((gmin > 0 && gmax > 0) || gmax < 0) + ca->state = CDG_NONFULL; + } + return grad; +} + +static bool tcp_cdg_backoff(struct sock *sk, u32 grad) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + return false; + + if (use_ineff) { + ca->backoff_cnt++; + if (ca->backoff_cnt > use_ineff) + return false; + } + + ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->state = CDG_BACKOFF; + tcp_enter_cwr(sk); + return true; +} + +/* Not called in CWR or Recovery state. */ +static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_cwnd; + u32 incr; + + if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + tcp_cdg_hystart_update(sk); + + if (after(ack, ca->rtt_seq) && ca->rtt.v64) { + s32 grad = 0; + + if (ca->rtt_prev.v64) + grad = tcp_cdg_grad(ca); + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + ca->last_ack = 0; + ca->sample_cnt = 0; + + if (grad > 0 && tcp_cdg_backoff(sk, grad)) + return; + } + + if (!tcp_is_cwnd_limited(sk)) { + ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + return; + } + + prior_snd_cwnd = tp->snd_cwnd; + tcp_reno_cong_avoid(sk, ack, acked); + + incr = tp->snd_cwnd - prior_snd_cwnd; + ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); +} + +static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (rtt_us <= 0) + return; + + /* A heuristic for filtering delayed ACKs, adapted from: + * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support + * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. + */ + if (tp->sacked_out == 0) { + if (num_acked == 1 && ca->delack) { + /* A delayed ACK is only used for the minimum if it is + * provenly lower than an existing non-zero minimum. + */ + ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->delack--; + return; + } else if (num_acked > 1 && ca->delack < 5) { + ca->delack++; + } + } + + ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); + ca->rtt.max = max(ca->rtt.max, rtt_us); +} + +static u32 tcp_cdg_ssthresh(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->undo_cwnd = tp->snd_cwnd; + + if (ca->state == CDG_BACKOFF) + return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); + + if (ca->state == CDG_NONFULL && use_tolerance) + return tp->snd_cwnd; + + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + if (use_shadow) + return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); + return max(2U, tp->snd_cwnd >> 1); +} + +static u32 tcp_cdg_undo_cwnd(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); +} + +static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct minmax *gradients; + + switch (ev) { + case CA_EVENT_CWND_RESTART: + gradients = ca->gradients; + if (gradients) + memset(gradients, 0, window * sizeof(gradients[0])); + memset(ca, 0, sizeof(*ca)); + + ca->gradients = gradients; + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; + break; + case CA_EVENT_COMPLETE_CWR: + ca->state = CDG_UNKNOWN; + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + break; + default: + break; + } +} + +static void tcp_cdg_init(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We silently fall back to window = 1 if allocation fails. */ + if (window > 1) + ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), + GFP_NOWAIT | __GFP_NOWARN); + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; +} + +static void tcp_cdg_release(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + kfree(ca->gradients); +} + +struct tcp_congestion_ops tcp_cdg __read_mostly = { + .cong_avoid = tcp_cdg_cong_avoid, + .cwnd_event = tcp_cdg_cwnd_event, + .pkts_acked = tcp_cdg_acked, + .undo_cwnd = tcp_cdg_undo_cwnd, + .ssthresh = tcp_cdg_ssthresh, + .release = tcp_cdg_release, + .init = tcp_cdg_init, + .owner = THIS_MODULE, + .name = "cdg", +}; + +static int __init tcp_cdg_register(void) +{ + if (backoff_beta > 1024 || window < 1 || window > 256) + return -ERANGE; + if (!is_power_of_2(window)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_cdg); + return 0; +} + +static void __exit tcp_cdg_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_cdg); +} + +module_init(tcp_cdg_register); +module_exit(tcp_cdg_unregister); +MODULE_AUTHOR("Kenneth Klette Jonassen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP CDG"); -- cgit v1.2.3 From a7eea416cb08a514f94c0ca5ff30c18783fab054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:15 -0700 Subject: tcp: reserve tcp_skb_mss() to tcp stack tcp_gso_segment() and tcp_gro_receive() are not strictly part of TCP stack. They should not assume tcp_skb_mss(skb) is in fact skb_shinfo(skb)->gso_size. This will allow us to change tcp_skb_mss() in following patches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3f7c2fca5431..9864a2dbadce 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, oldlen = (u16)~skb->len; __skb_pull(skb, thlen); - mss = tcp_skb_mss(skb); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -242,7 +242,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - mss = tcp_skb_mss(p); + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -- cgit v1.2.3 From 51466a7545b73b7ad7bcfb33410d2823ccfaa501 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:16 -0700 Subject: tcp: fill shinfo->gso_type at last moment Our goal is to touch skb_shinfo(skb) only when absolutely needed, to avoid two cache line misses in TCP output path for last skb that is considered but not sent because of various conditions (cwnd, tso defer, receiver window, TSQ...) A packet is GSO only when skb_shinfo(skb)->gso_size is not zero. We can set skb_shinfo(skb)->gso_type to sk->sk_gso_type even for non GSO packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++------ net/ipv4/tcp_output.c | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d4f76ab6e136..70a6fa8ecbd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) { + if (!skb_shinfo(prev)->gso_size) skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; - } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (tcp_skb_pcount(skb) <= 1) { + if (tcp_skb_pcount(skb) <= 1) skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eeb59befaf06..a51f7aab27d6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -412,7 +412,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; - shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1003,6 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) tcp_ecn_send(sk, skb, tcp_header_size); @@ -1080,11 +1080,9 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; - shinfo->gso_type = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; - shinfo->gso_type = sk->sk_gso_type; } } -- cgit v1.2.3 From 5bbb432c896d23ce8f41f38e88dbd38982df99f9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:17 -0700 Subject: tcp: tcp_set_skb_tso_segs() no longer need struct sock parameter tcp_set_skb_tso_segs() & tcp_init_tso_segs() no longer use the sock pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a51f7aab27d6..d12888581337 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,8 +1066,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -1214,8 +1213,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* If this packet has been sent out already, we must * adjust the various packet counters. @@ -1295,7 +1294,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } @@ -1627,13 +1626,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -1688,7 +1686,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -1757,8 +1755,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); @@ -1992,7 +1990,7 @@ static int tcp_mtu_probe(struct sock *sk) skb->len, 0); } else { __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); } TCP_SKB_CB(skb)->seq += copy; } @@ -2002,7 +2000,7 @@ static int tcp_mtu_probe(struct sock *sk) if (len >= probe_size) break; } - tcp_init_tso_segs(sk, nskb, nskb->len); + tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). @@ -2064,7 +2062,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { @@ -2618,7 +2616,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } @@ -3455,7 +3453,7 @@ int tcp_write_wakeup(struct sock *sk, int mib) if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); -- cgit v1.2.3 From f69ad292cfd13aa7ee00847320c6bb9ba2154e87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:18 -0700 Subject: tcp: fill shinfo->gso_size at last moment In commit cd7d8498c9a5 ("tcp: change tcp_skb_pcount() location") we stored gso_segs in a temporary cache hot location. This patch does the same for gso_size. This allows to save 2 cache line misses in tcp xmit path for the last packet that is considered but not sent because of various conditions (cwnd, tso defer, receiver window, TSQ...) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 13 ++++++++----- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 12 ++++-------- 3 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 978cebedd3fc..950cfecaad3c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -730,11 +730,14 @@ struct tcp_skb_cb { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * - * tcp_gso_segs is used in write queue only, - * cf tcp_skb_pcount() + * tcp_gso_segs/size are used in write queue only, + * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; - __u32 tcp_gso_segs; + struct { + u16 tcp_gso_segs; + u16 tcp_gso_size; + }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ @@ -790,10 +793,10 @@ static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->tcp_gso_segs += segs; } -/* This is valid iff tcp_skb_pcount() > 1. */ +/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_size; + return TCP_SKB_CB(skb)->tcp_gso_size; } /* Events passed to congestion control interface */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 70a6fa8ecbd3..684f095d196e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1316,12 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) - skb_shinfo(prev)->gso_size = mss; + if (!TCP_SKB_CB(prev)->tcp_gso_size) + TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (tcp_skb_pcount(skb) <= 1) - skb_shinfo(skb)->gso_size = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -2248,7 +2248,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) (oldcnt >= packets)) break; - mss = skb_shinfo(skb)->gso_size; + mss = tcp_skb_mss(skb); err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss, GFP_ATOMIC); if (err < 0) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d12888581337..787f57ff87c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -402,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -411,7 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1028,8 +1025,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); - /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1068,8 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Make sure we own this skb before messing gso_size/gso_segs */ WARN_ON_ONCE(skb_cloned(skb)); @@ -1078,10 +1074,10 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) * non-TSO case. */ tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - shinfo->gso_size = mss_now; + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } -- cgit v1.2.3 From b5e2c45783aa785cbb195e43d5f0c0c6b228bde4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2015 09:15:19 -0700 Subject: tcp: remove obsolete check in tcp_set_skb_tso_segs() We had various issues in the past when TCP stack was modifying gso_size/gso_segs while clones were in flight. Commit c52e2421f73 ("tcp: must unclone packets before mangling them") fixed these bugs and added a WARN_ON_ONCE(skb_cloned(skb)); in tcp_set_skb_tso_segs() These bugs are now fixed, and because TCP stack now only sets shinfo->gso_size|segs on the clone itself, the check can be removed. As a result of this change, compiler inlines tcp_set_skb_tso_segs() in tcp_init_tso_segs() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 787f57ff87c4..b1c218df2c85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,9 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. -- cgit v1.2.3 From 33b1f31392861947fa2a2a57c3a39ab63b8c9f9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Jun 2015 13:28:38 +0200 Subject: net: ip_fragment: remove BRIDGE_NETFILTER mtu special handling since commit d6b915e29f4adea9 ("ip_fragment: don't forward defragmented DF packet") the largest fragment size is available in the IPCB. Therefore we no longer need to care about 'encapsulation' overhead of stripped PPPOE/VLAN headers since ip_do_fragment doesn't use device mtu in such cases. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 7 ------- net/bridge/br_netfilter.c | 7 +++++++ net/ipv4/ip_output.c | 4 ---- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index f2fdb5a52070..6d80fc686323 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -20,13 +20,6 @@ enum nf_br_hook_priorities { #define BRNF_BRIDGED_DNAT 0x02 #define BRNF_NF_BRIDGE_PREROUTING 0x08 -static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) -{ - if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) - return PPPOE_SES_HLEN; - return 0; -} - int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 535f9dae743e..1e62ae5d8f4e 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -936,6 +936,13 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, return ip_do_fragment(sk, skb, output); } +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) +{ + if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + return PPPOE_SES_HLEN; + return 0; +} + static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5f5ef1cebd5..19d7e43b5370 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -549,10 +549,6 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) - mtu -= nf_bridge_mtu_reduction(skb); -#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: -- cgit v1.2.3 From 71ae0dff02d756e4d2ca710b79f2ff5390029a5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:54 +0200 Subject: netfilter: xtables: use percpu rule counters The binary arp/ip/ip6tables ruleset is stored per cpu. The only reason left as to why we need percpu duplication are the rule counters embedded into ipt_entry et al -- since each cpu has its own copy of the rules, all counters can be lockless. The downside is that the more cpus are supported, the more memory is required. Rules are not just duplicated per online cpu but for each possible cpu, i.e. if maxcpu is 144, then rule is duplicated 144 times, not for the e.g. 64 cores present. To save some memory and also improve utilization of shared caches it would be preferable to only store the rule blob once. So we first need to separate counters and the rule blob. Instead of using entry->counters, allocate this percpu and store the percpu address in entry->counters.pcnt on CONFIG_SMP. This change makes no sense as-is; it is merely an intermediate step to remove the percpu duplication of the rule set in a followup patch. Suggested-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Reported-by: Marcelo Ricardo Leitner Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 49 ++++++++++++++++++++++++++++++++++++++ net/ipv4/netfilter/arp_tables.c | 32 +++++++++++++++++++++---- net/ipv4/netfilter/ip_tables.c | 31 ++++++++++++++++++++---- net/ipv6/netfilter/ip6_tables.c | 31 ++++++++++++++++++++---- 4 files changed, 129 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 09f38206c18f..b77ab9f17641 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -353,6 +353,55 @@ static inline unsigned long ifname_compare_aligned(const char *_a, return ret; } + +/* On SMP, ip(6)t_entry->counters.pcnt holds address of the + * real (percpu) counter. On !SMP, its just the packet count, + * so nothing needs to be done there. + * + * xt_percpu_counter_alloc returns the address of the percpu + * counter, or 0 on !SMP. + * + * Hence caller must use IS_ERR_VALUE to check for error, this + * allows us to return 0 for single core systems without forcing + * callers to deal with SMP vs. NONSMP issues. + */ +static inline u64 xt_percpu_counter_alloc(void) +{ + if (nr_cpu_ids > 1) { + void __percpu *res = alloc_percpu(struct xt_counters); + + if (res == NULL) + return (u64) -ENOMEM; + + return (__force u64) res; + } + + return 0; +} +static inline void xt_percpu_counter_free(u64 pcnt) +{ + if (nr_cpu_ids > 1) + free_percpu((void __percpu *) pcnt); +} + +static inline struct xt_counters * +xt_get_this_cpu_counter(struct xt_counters *cnt) +{ + if (nr_cpu_ids > 1) + return this_cpu_ptr((void __percpu *) cnt->pcnt); + + return cnt; +} + +static inline struct xt_counters * +xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) +{ + if (nr_cpu_ids > 1) + return per_cpu_ptr((void __percpu *) cnt->pcnt, cpu); + + return cnt; +} + struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *); void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a61200754f4b..0ada09ae6e81 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -289,13 +289,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct xt_entry_target *t; + struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -521,6 +523,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -538,6 +544,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -614,6 +622,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -723,13 +732,15 @@ static void get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1186,7 +1197,10 @@ static int do_add_counters(struct net *net, const void __user *user, loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1416,9 +1430,17 @@ static int translate_compat_table(const char *name, i = 0; xt_entry_foreach(iter1, entry1, newinfo->size) { + iter1->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(iter1->counters.pcnt)) { + ret = -ENOMEM; + break; + } + ret = check_target(iter1, name); - if (ret != 0) + if (ret != 0) { + xt_percpu_counter_free(iter1->counters.pcnt); break; + } ++i; if (strcmp(arpt_get_target(iter1)->u.user.name, XT_ERROR_TARGET) == 0) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e7abf5145edc..d190b10dabfe 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -345,6 +345,7 @@ ipt_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, @@ -361,7 +362,8 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -665,6 +667,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -691,6 +697,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, ret = check_target(e, net, name); if (ret) goto err; + return 0; err: module_put(t->u.kernel.target->me); @@ -700,6 +707,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -784,6 +794,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -888,13 +899,15 @@ get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1374,7 +1387,10 @@ do_add_counters(struct net *net, const void __user *user, loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1608,6 +1624,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) unsigned int j; int ret = 0; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -1632,6 +1652,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index cdd085f8b770..a1190ee14723 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -367,6 +367,7 @@ ip6t_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); acpar.thoff = 0; @@ -384,7 +385,8 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); @@ -679,6 +681,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -714,6 +720,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -797,6 +806,8 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -901,13 +912,15 @@ get_counters(const struct xt_table_info *t, i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -1374,7 +1387,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, goto free; } - local_bh_disable(); private = t->private; if (private->number != num_counters) { @@ -1388,7 +1400,10 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, addend = xt_write_recseq_begin(); loc_cpu_entry = private->entries[curcpu]; xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1621,6 +1636,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; j = 0; mtpar.net = net; mtpar.table = name; @@ -1645,6 +1663,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } -- cgit v1.2.3 From 482cfc318559e2527dfd8513582d2fdb276e47c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jun 2015 01:34:55 +0200 Subject: netfilter: xtables: avoid percpu ruleset duplication We store the rule blob per (possible) cpu. Unfortunately this means we can waste lot of memory on big smp machines. ipt_entry structure ('rule head') is 112 byte, so e.g. with maxcpu=64 one single rule eats close to 8k RAM. Since previous patch made counters percpu it appears there is nothing left in the rule blob that needs to be percpu. On my test system (144 possible cpus, 400k dummy rules) this change saves close to 9 Gigabyte of RAM. Reported-by: Marcelo Ricardo Leitner Acked-by: Jesper Dangaard Brouer Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 4 +-- net/ipv4/netfilter/arp_tables.c | 50 +++++++++-------------------- net/ipv4/netfilter/ip_tables.c | 64 ++++++++++--------------------------- net/ipv6/netfilter/ip6_tables.c | 65 ++++++++++---------------------------- net/netfilter/x_tables.c | 23 +++++--------- 5 files changed, 57 insertions(+), 149 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index b77ab9f17641..9969d79dcde1 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -224,9 +224,9 @@ struct xt_table_info { unsigned int stacksize; unsigned int __percpu *stackptr; void ***jumpstack; - /* ipt_entry tables: one per CPU */ + /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - void *entries[1]; + void *entries; }; #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 0ada09ae6e81..d75c139393fc 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -275,7 +275,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[smp_processor_id()]; + table_base = private->entries; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -711,12 +711,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -731,7 +725,7 @@ static void get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -785,7 +779,7 @@ static int copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -880,10 +874,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1048,7 +1042,7 @@ static int __do_replace(struct net *net, const char *name, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter); @@ -1095,8 +1089,7 @@ static int do_replace(struct net *net, const void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1126,7 +1119,7 @@ static int do_replace(struct net *net, const void __user *user, static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1136,7 +1129,6 @@ static int do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct arpt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1192,11 +1184,9 @@ static int do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1410,7 +1400,7 @@ static int translate_compat_table(const char *name, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1470,11 +1460,6 @@ static int translate_compat_table(const char *name, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1533,8 +1518,7 @@ static int compat_do_replace(struct net *net, void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1631,7 +1615,6 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; unsigned int i = 0; struct arpt_entry *iter; @@ -1639,11 +1622,9 @@ static int compat_copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy on our node/cpu */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -1812,8 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); @@ -1844,7 +1824,7 @@ void arpt_unregister_table(struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d190b10dabfe..6151500c2df8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ipt_entry *e) { - const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; @@ -331,7 +329,7 @@ ipt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -877,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -898,7 +890,7 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -946,17 +938,13 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - const void *loc_cpu_entry; + void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1070,10 +1058,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1194,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; struct ipt_entry *iter; ret = 0; @@ -1237,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1284,8 +1270,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1316,7 +1301,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1326,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct ipt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1382,11 +1366,8 @@ do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1739,7 +1720,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1791,11 +1772,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1842,8 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1914,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ipt_entry *iter; @@ -1922,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2104,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2136,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index a1190ee14723..80a7f0d38aa4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -283,15 +283,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ip6t_entry *e) { - const void *table_base; const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; @@ -357,7 +355,7 @@ ip6t_do_table(struct sk_buff *skb, */ smp_read_barrier_depends(); cpu = smp_processor_id(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -890,12 +888,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -911,7 +903,7 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; @@ -959,17 +951,13 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - const void *loc_cpu_entry; + void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1083,10 +1071,10 @@ static int compat_table_info(const struct xt_table_info *info, if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1207,7 +1195,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - const void *loc_cpu_old_entry; struct ip6t_entry *iter; ret = 0; @@ -1250,8 +1237,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1297,8 +1283,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1329,7 +1314,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1339,7 +1324,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - const void *loc_cpu_entry; struct ip6t_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1395,11 +1379,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); addend = xt_write_recseq_begin(); - loc_cpu_entry = private->entries[curcpu]; - xt_entry_foreach(iter, loc_cpu_entry, private->size) { + xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); @@ -1407,7 +1388,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, ++i; } xt_write_recseq_end(addend); - unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1750,7 +1730,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1802,11 +1782,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1853,8 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1925,7 +1899,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ip6t_entry *iter; @@ -1933,14 +1906,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2115,8 +2083,7 @@ struct xt_table *ip6t_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2146,7 +2113,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 83032464a4bd..6062ce3e862c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -659,7 +659,6 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { struct xt_table_info *newinfo; - int cpu; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) @@ -671,19 +670,14 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) newinfo->size = size; - for_each_possible_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, - cpu_to_node(cpu)); - else - newinfo->entries[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); + if (size <= PAGE_SIZE) + newinfo->entries = kmalloc(size, GFP_KERNEL); + else + newinfo->entries = vmalloc(size); - if (newinfo->entries[cpu] == NULL) { - xt_free_table_info(newinfo); - return NULL; - } + if (newinfo->entries == NULL) { + xt_free_table_info(newinfo); + return NULL; } return newinfo; @@ -694,8 +688,7 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) - kvfree(info->entries[cpu]); + kvfree(info->entries); if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) -- cgit v1.2.3 From b60f2f3d65de3f3e9e63855e5f5070a3fedcccba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Jun 2015 12:12:22 +0200 Subject: net: ipv4: un-inline ip_finish_output2 text data bss dec hex filename old: 16527 44 0 16571 40bb net/ipv4/ip_output.o new: 14935 44 0 14979 3a83 net/ipv4/ip_output.o Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5f5ef1cebd5..55f3c2e94357 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; -- cgit v1.2.3 From 758f0d4b16e0508aa47a4a4d06eacba0d66e24d5 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Fri, 12 Jun 2015 17:24:03 +0200 Subject: tcp: cdg: use div_u64() Fixes cross-compile to mips. Signed-off-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_cdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index a52ce2db53f2..8c6fd3d5e40f 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -145,7 +145,7 @@ static void tcp_cdg_hystart_update(struct sock *sk) return; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now_us = local_clock() / NSEC_PER_USEC; + u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; -- cgit v1.2.3 From f09becc79f899f92557ce6d5562a8b80d6addb34 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jun 2015 13:58:52 +0200 Subject: netfilter: Kconfig: get rid of parens around depends on According to the reporter, they are not needed. Reported-by: Sergei Shtylyov Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 3 ++- net/ipv6/netfilter/Kconfig | 3 ++- net/netfilter/Kconfig | 18 +++++++++--------- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb20f363151f..2199a5db25e6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -195,7 +195,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ca6998345b42..b552cf0d6198 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -186,7 +186,8 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP6_NF_MANGLE || IP6_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index fbc8d15c7fda..6eae69a698ed 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -206,7 +206,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -723,7 +723,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -865,7 +865,7 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK ---help--- This option adds a "TEE" target with which a packet can be cloned and @@ -875,8 +875,8 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -915,7 +915,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -1127,7 +1127,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -1369,8 +1369,8 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help -- cgit v1.2.3 From 711bdde6a884354ddae8da2fcb495b2a9364cc90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2015 09:57:30 -0700 Subject: netfilter: x_tables: remove XT_TABLE_INFO_SZ and a dereference. After Florian patches, there is no need for XT_TABLE_INFO_SZ anymore : Only one copy of table is kept, instead of one copy per cpu. We also can avoid a dereference if we put table data right after xt_table_info. It reduces register pressure and helps compiler. Then, we attempt a kmalloc() if total size is under order-3 allocation, to reduce TLB pressure, as in many cases, rules fit in 32 KB. Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 +---- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- net/netfilter/x_tables.c | 32 ++++++++++++-------------------- 5 files changed, 19 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 9969d79dcde1..95693c4cebdd 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -225,12 +225,9 @@ struct xt_table_info { unsigned int __percpu *stackptr; void ***jumpstack; - /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - void *entries; + unsigned char entries[0] __aligned(8); }; -#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ - + nr_cpu_ids * sizeof(char *)) int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d75c139393fc..95c9b6eece25 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const struct arphdr *arp; struct arpt_entry *e, *back; const char *indev, *outdev; - void *table_base; + const void *table_base; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; @@ -868,7 +868,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6151500c2df8..6c72fbb7b49e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -938,7 +938,7 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1052,7 +1052,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 80a7f0d38aa4..3c35ced39b42 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -951,7 +951,7 @@ copy_entries_to_user(unsigned int total_size, struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1065,7 +1065,7 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 6062ce3e862c..d324fe71260c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -658,29 +658,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { - struct xt_table_info *newinfo; + struct xt_table_info *info = NULL; + size_t sz = sizeof(*info) + size; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); - if (!newinfo) - return NULL; - - newinfo->size = size; - - if (size <= PAGE_SIZE) - newinfo->entries = kmalloc(size, GFP_KERNEL); - else - newinfo->entries = vmalloc(size); - - if (newinfo->entries == NULL) { - xt_free_table_info(newinfo); - return NULL; + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!info) { + info = vmalloc(sz); + if (!info) + return NULL; } - - return newinfo; + memset(info, 0, sizeof(*info)); + info->size = size; + return info; } EXPORT_SYMBOL(xt_alloc_table_info); @@ -688,8 +682,6 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - kvfree(info->entries); - if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); @@ -698,7 +690,7 @@ void xt_free_table_info(struct xt_table_info *info) free_percpu(info->stackptr); - kfree(info); + kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); -- cgit v1.2.3 From 3fd22af808f4d7455ba91596d334438c7ee0f889 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:19 -0400 Subject: sock_diag: specify info_size per inet protocol Previously, there was no clear distinction between the inet protocols that used struct tcp_info to report information and those that didn't. This change adds a specific size attribute to the inet_diag_handler struct which defines these interfaces. This will make dispatching sock_diag get_info requests identical for all inet protocols in a following patch. Tested: ss -au Tested: ss -at Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 1 + net/dccp/diag.c | 1 + net/ipv4/inet_diag.c | 4 ++-- net/ipv4/tcp_diag.c | 1 + net/ipv4/udp_diag.c | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ac48b10c9395..0e707f0c1a3e 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,6 +24,7 @@ struct inet_diag_handler { struct inet_diag_msg *r, void *info); __u16 idiag_type; + __u16 idiag_info_size; }; struct inet_connection_sock; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 5a45f8de5d99..2d84303ea6bf 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -66,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = { .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, .idiag_type = IPPROTO_DCCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init dccp_diag_init(void) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4d32262c7502..b1f01174bf32 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -200,9 +200,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) { + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct tcp_info)); + handler->idiag_info_size); if (!attr) goto errout; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 79b34a0f4a4a..423e3881a40b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -50,6 +50,7 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init tcp_diag_init(void) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index b763c39ae1d7..6116604bf6e8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = { .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, + .idiag_info_size = 0, }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = { .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, + .idiag_info_size = 0, }; static int __init udp_diag_init(void) -- cgit v1.2.3 From 35ac838a9b96470f999db04320f53a2033642bfb Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:20 -0400 Subject: sock_diag: implement a get_info handler for inet This get_info handler will simply dispatch to the appropriate existing inet protocol handler. This patch also includes a new netlink attribute (INET_DIAG_PROTOCOL). This attribute is currently only used for multicast messages. Without this attribute, there is no way of knowing the IP protocol used by the socket information being broadcast. This attribute is not necessary in the 'dump' variant of this protocol (though it could easily be added) because dump requests are issued for specific family/protocol pairs. Tested: ss -E (note, the -E option has not yet been merged into the upstream version of ss). Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 3 ++- net/ipv4/inet_diag.c | 46 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_diag.c | 5 +++-- 4 files changed, 54 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c7093c75bdd6..b629fc53b109 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -111,9 +111,10 @@ enum { INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, INET_DIAG_DCTCPINFO, + INET_DIAG_PROTOCOL, /* response attribute only */ }; -#define INET_DIAG_MAX INET_DIAG_DCTCPINFO +#define INET_DIAG_MAX INET_DIAG_PROTOCOL /* INET_DIAG_MEM */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b1f01174bf32..21985d8d41e7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1078,14 +1078,60 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) return inet_diag_get_exact(skb, h, nlmsg_data(h)); } +static +int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_diag_handler *handler; + struct nlmsghdr *nlh; + struct nlattr *attr; + struct inet_diag_msg *r; + void *info = NULL; + int err = 0; + + nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); + if (!nlh) + return -ENOMEM; + + r = nlmsg_data(nlh); + memset(r, 0, sizeof(*r)); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = sk->sk_state; + + if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { + nlmsg_cancel(skb, nlh); + return err; + } + + handler = inet_diag_lock_handler(sk->sk_protocol); + if (IS_ERR(handler)) { + inet_diag_unlock_handler(handler); + nlmsg_cancel(skb, nlh); + return PTR_ERR(handler); + } + + attr = handler->idiag_info_size + ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + : NULL; + if (attr) + info = nla_data(attr); + + handler->idiag_get_info(sk, r, info); + inet_diag_unlock_handler(handler); + + nlmsg_end(skb, nlh); + return 0; +} + static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; int inet_diag_register(const struct inet_diag_handler *h) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 65f791f74845..697b86dd45b3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2624,13 +2624,15 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 423e3881a40b..479f34946177 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -19,13 +19,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; - } else { + } else if (sk->sk_type == SOCK_STREAM) { + const struct tcp_sock *tp = tcp_sk(sk); + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } -- cgit v1.2.3 From e0df02e0c2118892b302e8248dbbb21698bedfed Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 17 Jun 2015 10:59:10 -0400 Subject: sock_diag: fetch source port from inet_sock When an inet_sock is destroyed, its source port (sk_num) is set to zero as part of the unhash procedure. In order to supply a source port as part of the NETLINK_SOCK_DIAG socket destruction broadcasts, the source port number must be read from inet_sport instead. Tested: ss -E Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 21985d8d41e7..ba6a80f90e7d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1095,6 +1095,8 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) r = nlmsg_data(nlh); memset(r, 0, sizeof(*r)); inet_diag_msg_common_fill(r, sk); + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) + r->id.idiag_sport = inet_sk(sk)->inet_sport; r->idiag_state = sk->sk_state; if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { -- cgit v1.2.3 From a2bb6d7d6f4249691b6a554cde59969d55b0d9c3 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 17 Jun 2015 11:07:01 -0700 Subject: ipv4: include NLM_F_APPEND flag in append route notifications This patch adds NLM_F_APPEND flag to struct nlmsg_hdr->nlmsg_flags in newroute notifications if the route add was an append. (This is similar to how NLM_F_REPLACE is already part of new route replace notifications today) This helps userspace determine if the route add operation was an append. Signed-off-by: Roopa Prabhu Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3c699c4e90a4..6c666a9f1bd5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1082,6 +1082,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; + unsigned int nlflags = 0; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1201,7 +1202,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa_match) goto out; - if (!(cfg->fc_nlflags & NLM_F_APPEND)) + if (cfg->fc_nlflags & NLM_F_APPEND) + nlflags = NLM_F_APPEND; + else fa = fa_first; } err = -ENOENT; @@ -1238,7 +1241,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, - &cfg->fc_nlinfo, 0); + &cfg->fc_nlinfo, nlflags); succeeded: return 0; -- cgit v1.2.3 From 3b1884435aa61320ac12d3b91f69ab034946cc3d Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Thu, 18 Jun 2015 20:40:54 +0900 Subject: inet_diag: Remove _bh suffix in inet_diag_dump_reqs(). inet_diag_dump_reqs() is called from inet_diag_dump_icsk() with BH disabled. So no need to disable BH in inet_diag_dump_reqs(). Signed-off-by: Hiroaki Shimoda Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ba6a80f90e7d..516a1f6fbdd3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -746,7 +746,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !listen_sock_qlen(lopt)) @@ -794,7 +794,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return err; } -- cgit v1.2.3 From dfea2aa654243f70dc53b8648d0bbdeec55a7df1 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 18 Jun 2015 09:15:34 -0700 Subject: tcp: Do not call tcp_fastopen_reset_cipher from interrupt context tcp_fastopen_reset_cipher really cannot be called from interrupt context. It allocates the tcp_fastopen_context with GFP_KERNEL and calls crypto_alloc_cipher, which allocates all kind of stuff with GFP_KERNEL. Thus, we might sleep when the key-generation is triggered by an incoming TFO cookie-request which would then happen in interrupt- context, as shown by enabling CONFIG_DEBUG_ATOMIC_SLEEP: [ 36.001813] BUG: sleeping function called from invalid context at mm/slub.c:1266 [ 36.003624] in_atomic(): 1, irqs_disabled(): 0, pid: 1016, name: packetdrill [ 36.004859] CPU: 1 PID: 1016 Comm: packetdrill Not tainted 4.1.0-rc7 #14 [ 36.006085] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 36.008250] 00000000000004f2 ffff88007f8838a8 ffffffff8171d53a ffff880075a084a8 [ 36.009630] ffff880075a08000 ffff88007f8838c8 ffffffff810967d3 ffff88007f883928 [ 36.011076] 0000000000000000 ffff88007f8838f8 ffffffff81096892 ffff88007f89be00 [ 36.012494] Call Trace: [ 36.012953] [] dump_stack+0x4f/0x6d [ 36.014085] [] ___might_sleep+0x103/0x170 [ 36.015117] [] __might_sleep+0x52/0x90 [ 36.016117] [] kmem_cache_alloc_trace+0x47/0x190 [ 36.017266] [] ? tcp_fastopen_reset_cipher+0x42/0x130 [ 36.018485] [] tcp_fastopen_reset_cipher+0x42/0x130 [ 36.019679] [] tcp_fastopen_init_key_once+0x61/0x70 [ 36.020884] [] __tcp_fastopen_cookie_gen+0x1c/0x60 [ 36.022058] [] tcp_try_fastopen+0x58f/0x730 [ 36.023118] [] tcp_conn_request+0x3e8/0x7b0 [ 36.024185] [] ? __module_text_address+0x12/0x60 [ 36.025327] [] tcp_v4_conn_request+0x51/0x60 [ 36.026410] [] tcp_rcv_state_process+0x190/0xda0 [ 36.027556] [] ? __inet_lookup_established+0x47/0x170 [ 36.028784] [] tcp_v4_do_rcv+0x16d/0x3d0 [ 36.029832] [] ? security_sock_rcv_skb+0x16/0x20 [ 36.030936] [] tcp_v4_rcv+0x77a/0x7b0 [ 36.031875] [] ? iptable_filter_hook+0x33/0x70 [ 36.032953] [] ip_local_deliver_finish+0x92/0x1f0 [ 36.034065] [] ip_local_deliver+0x9a/0xb0 [ 36.035069] [] ? ip_rcv+0x3d0/0x3d0 [ 36.035963] [] ip_rcv_finish+0x119/0x330 [ 36.036950] [] ip_rcv+0x2e7/0x3d0 [ 36.037847] [] __netif_receive_skb_core+0x552/0x930 [ 36.038994] [] __netif_receive_skb+0x27/0x70 [ 36.040033] [] process_backlog+0xd2/0x1f0 [ 36.041025] [] net_rx_action+0x122/0x310 [ 36.042007] [] __do_softirq+0x103/0x2f0 [ 36.042978] [] do_softirq_own_stack+0x1c/0x30 This patch moves the call to tcp_fastopen_init_key_once to the places where a listener socket creates its TFO-state, which always happens in user-context (either from the setsockopt, or implicitly during the listen()-call) Cc: Eric Dumazet Cc: Hannes Frederic Sowa Fixes: 222e83d2e0ae ("tcp: switch tcp_fastopen key generation to net_get_random_once") Signed-off-by: Christoph Paasch Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp.c | 7 +++++-- net/ipv4/tcp_fastopen.c | 2 -- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b47a4d79d04..a5aa54ea6533 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog) err = 0; if (err) goto out; + + tcp_fastopen_init_key_once(true); } err = inet_csk_listen_start(sk, backlog); if (err) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1377f2a0472..bb2ce74f6004 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2545,10 +2545,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + err = fastopen_init_queue(sk, val); - else + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 46b087a27503..f9c0fb84e435 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -78,8 +78,6 @@ static bool __tcp_fastopen_cookie_gen(const void *path, struct tcp_fastopen_context *ctx; bool ok = false; - tcp_fastopen_init_key_once(true); - rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { -- cgit v1.2.3 From 34b99df4e6256ddafb663c6de0711dceceddfe0e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 23 Jun 2015 08:34:39 +0300 Subject: ip: report the original address of ICMP messages ICMP messages can trigger ICMP and local errors. In this case serr->port is 0 and starting from Linux 4.0 we do not return the original target address to the error queue readers. Add function to define which errors provide addr_offset. With this fix my ping command is not silent anymore. Fixes: c247f0534cc5 ("ip: fix error queue empty skb handling") Signed-off-by: Julian Anastasov Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 11 ++++++++++- net/ipv6/datagram.c | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7cfb0893f263..6ddde89996f4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,6 +432,15 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: @@ -498,7 +507,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 762a58c772b8..62d908e64eeb 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 @@ -389,7 +399,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; -- cgit v1.2.3 From 8a3d03166f19329b46c6f9e900f93a89f446077b Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:36 -0400 Subject: net: track link-status of ipv4 nexthops Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are reachable via an interface where carrier is off. No action is taken, but additional flags are passed to userspace to indicate carrier status. This also includes a cleanup to fib_disable_ip to more clearly indicate what event made the function call to replace the more cryptic force option previously used. v2: Split out kernel functionality into 2 patches, this patch simply sets and clears new nexthop flag RTNH_F_LINKDOWN. v3: Cleanups suggested by Alex as well as a bug noticed in fib_sync_down_dev and fib_sync_up when multipath was not enabled. v5: Whitespace and variable declaration fixups suggested by Dave. v6: Style fixups noticed by Dave; ran checkpatch to be sure I got them all. Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 +-- include/uapi/linux/rtnetlink.h | 3 +++ net/ipv4/fib_frontend.c | 23 ++++++++++------ net/ipv4/fib_semantics.c | 60 +++++++++++++++++++++++++++++++++--------- 4 files changed, 67 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 54271ed0ed45..f73d27c5575a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -305,9 +305,9 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); -int fib_sync_down_dev(struct net_device *dev, int force); +int fib_sync_down_dev(struct net_device *dev, unsigned long event); int fib_sync_down_addr(struct net *net, __be32 local); -int fib_sync_up(struct net_device *dev); +int fib_sync_up(struct net_device *dev, unsigned int nh_flags); void fib_select_multipath(struct fib_result *res); /* Exported by fib_trie.c */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 17fb02f488da..fdd8f07f1d34 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -338,6 +338,9 @@ struct rtnexthop { #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ #define RTNH_F_OFFLOAD 8 /* offloaded route */ +#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN) /* Macros to handle hexthops */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e6e6eb..534eb1485045 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1063,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1081,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1093,7 +1093,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event); } else { rt_cache_flush(dev_net(dev)); } @@ -1107,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1124,16 +1125,22 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event); + /* fall through */ + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28ec3c1823bf..b1b305b1e340 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -266,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -318,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -604,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -636,6 +638,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +658,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -920,11 +926,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc) { @@ -1103,7 +1115,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +int fib_sync_down_dev(struct net_device *dev, unsigned long event) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1112,7 +1124,8 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (force) + if (event == NETDEV_UNREGISTER || + event == NETDEV_DOWN) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1129,7 +1142,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); fi->fib_power -= nexthop_nh->nh_power; @@ -1139,14 +1160,23 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } } @@ -1210,13 +1240,11 @@ out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1243,7 +1271,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1254,14 +1282,18 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_flags &= ~nh_flags; spin_unlock_bh(&fib_multipath_lock); +#else + nexthop_nh->nh_flags &= ~nh_flags; +#endif } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } } @@ -1269,6 +1301,8 @@ int fib_sync_up(struct net_device *dev) return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution. -- cgit v1.2.3 From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:37 -0400 Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down This feature is only enabled with the new per-interface or ipv4 global sysctls called 'ignore_routes_with_linkdown'. net.ipv4.conf.all.ignore_routes_with_linkdown = 0 net.ipv4.conf.default.ignore_routes_with_linkdown = 0 net.ipv4.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, will report to userspace that a route is dead and will no longer resolve to this nexthop when performing a fib lookup. This will signal to userspace that the route will not be selected. The signalling of a RTNH_F_DEAD is only passed to userspace if the sysctl is enabled and link is down. This was done as without it the netlink listeners would have no idea whether or not a nexthop would be selected. The kernel only sets RTNH_F_DEAD internally if the interface has IFF_UP cleared. With the new sysctl set, the following behavior can be observed (interface p8p1 is link-down): default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15 cache While the route does remain in the table (so it can be modified if needed rather than being wiped away as it would be if IFF_UP was cleared), the proper next-hop is chosen automatically when the link is down. Now interface p8p1 is linked-up: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 dev p8p1 src 80.0.0.1 cache and the output changes to what one would expect. If the sysctl is not set, the following output would be expected when p8p1 is down: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 Since the dead flag does not appear, there should be no expectation that the kernel would skip using this route due to link being down. v2: Split kernel changes into 2 patches, this actually makes a behavioral change if the sysctl is set. Also took suggestion from Alex to simplify code by only checking sysctl during fib lookup and suggestion from Scott to add a per-interface sysctl. v3: Code clean-ups to make it more readable and efficient as well as a reverse path check fix. v4: Drop binary sysctl v5: Whitespace fixups from Dave v6: Style changes from Dave and checkpatch suggestions v7: One more checkpatch fixup Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 3 +++ include/net/fib_rules.h | 3 ++- include/net/ip_fib.h | 16 +++++++++------- include/uapi/linux/ip.h | 1 + net/ipv4/devinet.c | 2 ++ net/ipv4/fib_frontend.c | 6 +++--- net/ipv4/fib_rules.c | 5 +++-- net/ipv4/fib_semantics.c | 33 ++++++++++++++++++++++++++++----- net/ipv4/fib_trie.c | 6 ++++++ net/ipv4/netfilter/ipt_rpfilter.c | 2 +- net/ipv4/route.c | 10 +++++----- 11 files changed, 63 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 0a21fbefdfbe..a4328cea376a 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -120,6 +120,9 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) +#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ + IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) + #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_ORCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 6d67383a5114..903a55efbffe 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -36,7 +36,8 @@ struct fib_lookup_arg { void *result; struct fib_rule *rule; int flags; -#define FIB_LOOKUP_NOREF 1 +#define FIB_LOOKUP_NOREF 1 +#define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f73d27c5575a..49c142bdf01e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -226,7 +226,7 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id) } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, - struct fib_result *res) + struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; @@ -234,7 +234,7 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) err = 0; rcu_read_unlock(); @@ -249,16 +249,18 @@ void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res); +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, - struct fib_result *res) + struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err; + flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) - return __fib_lookup(net, flp, res); + return __fib_lookup(net, flp, res, flags); rcu_read_lock(); @@ -266,11 +268,11 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, for (err = 0; !err; err = -ENETUNREACH) { tb = rcu_dereference_rtnl(net->ipv4.fib_main); - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + if (tb && !fib_table_lookup(tb, flp, res, flags)) break; tb = rcu_dereference_rtnl(net->ipv4.fib_default); - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + if (tb && !fib_table_lookup(tb, flp, res, flags)) break; } diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 411959405ab6..08f894d2ddbd 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -164,6 +164,7 @@ enum IPV4_DEVCONF_ROUTE_LOCALNET, IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, __IPV4_DEVCONF_MAX }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 419d23c53ec7..7498716e8f54 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2169,6 +2169,8 @@ static struct devinet_sysctl_table { "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, + "ignore_routes_with_linkdown"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 534eb1485045..6bbc54940eb4 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - if (!fib_lookup(net, &fl4, &res)) + if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { scope = RT_SCOPE_LINK; @@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); - if (fib_lookup(net, &fl4, &res)) + if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) @@ -354,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl4, &res) == 0) { + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 56151982f74e..18123d50f576 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,11 +47,12 @@ struct fib4_rule { #endif }; -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, - .flags = FIB_LOOKUP_NOREF, + .flags = flags, }; int err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b1b305b1e340..3bfccd83551c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -623,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; @@ -1035,12 +1036,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1057,11 +1066,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; @@ -1310,16 +1327,22 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; + struct in_device *in_dev; int w; spin_lock_bh(&fib_multipath_lock); if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + if (nexthop_nh->nh_flags & RTNH_F_DEAD) + continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + continue; + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } endfor_nexthops(fi); fi->fib_power = power; if (power <= 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6c666a9f1bd5..15d32612e3c6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1412,9 +1412,15 @@ found: continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); if (nh->nh_flags & RTNH_F_DEAD) continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) + continue; if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 4bfaedf9b34e..8618fd150c96 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, struct net *net = dev_net(dev); int ret __maybe_unused; - if (fib_lookup(net, fl4, &res)) + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6055984c307..d0362a2de3d3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -747,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { - if (fib_lookup(net, fl4, &res) == 0) { + if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, @@ -975,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, @@ -1186,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) fl4.flowi4_mark = skb->mark; rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, @@ -1716,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; fl4.saddr = saddr; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; @@ -2123,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res)) { + if (fib_lookup(net, fl4, &res, 0)) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { -- cgit v1.2.3 From 204621551b2a0060a013b92f7add4d5c452fa7cb Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 24 Jun 2015 11:02:51 +0200 Subject: net: inet_diag: export IPV6_V6ONLY sockopt For AF_INET6 sockets, the value of struct ipv6_pinfo.ipv6only is exported to userspace. It indicates whether a socket bound to in6addr_any listens on IPv4 as well as IPv6. Since the socket is natively IPv6, it is not listed by e.g. 'ss -l -4'. This patch is accompanied by an appropriate one for iproute2 to enable the additional information in 'ss -e'. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 3 ++- net/ipv4/inet_diag.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b629fc53b109..68a1f71fde9f 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -112,9 +112,10 @@ enum { INET_DIAG_SHUTDOWN, INET_DIAG_DCTCPINFO, INET_DIAG_PROTOCOL, /* response attribute only */ + INET_DIAG_SKV6ONLY, }; -#define INET_DIAG_MAX INET_DIAG_PROTOCOL +#define INET_DIAG_MAX INET_DIAG_SKV6ONLY /* INET_DIAG_MEM */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 516a1f6fbdd3..9bc26677058e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; + + if (ipv6_only_sock(sk) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, 1)) + goto errout; } #endif -- cgit v1.2.3