From 278f2b3e2af5f32ea1afe34fa12a2518153e6e49 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:05:08 +0200 Subject: netfilter: ipt_ULOG: fix info leaks The ulog messages leak heap bytes by the means of padding bytes and incompletely filled string arrays. Fix those by memset(0)'ing the whole struct before filling it. Signed-off-by: Mathias Krause Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_ULOG.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index cbc22158af49..9cb993cd224b 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net, ub->qlen++; pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net, } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - else - *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 && skb->mac_header != skb->network_header && @@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net, if (in) strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - else - pm->indev_name[0] = '\0'; if (out) strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - else - pm->outdev_name[0] = '\0'; /* copy_len <= skb->len, so can't fail. */ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) -- cgit v1.2.3 From b416c144f46af1a30ddfa4e4319a8f077381ad63 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Oct 2013 13:14:53 +0100 Subject: netfilter: x_tables: fix ordering of jumpstack allocation and table update During kernel stability testing on an SMP ARMv7 system, Yalin Wang reported the following panic from the netfilter code: 1fe0: 0000001c 5e2d3b10 4007e779 4009e110 60000010 00000032 ff565656 ff545454 [] (ipt_do_table+0x448/0x584) from [] (nf_iterate+0x48/0x7c) [] (nf_iterate+0x48/0x7c) from [] (nf_hook_slow+0x58/0x104) [] (nf_hook_slow+0x58/0x104) from [] (ip_local_deliver+0x88/0xa8) [] (ip_local_deliver+0x88/0xa8) from [] (ip_rcv_finish+0x418/0x43c) [] (ip_rcv_finish+0x418/0x43c) from [] (__netif_receive_skb+0x4cc/0x598) [] (__netif_receive_skb+0x4cc/0x598) from [] (process_backlog+0x84/0x158) [] (process_backlog+0x84/0x158) from [] (net_rx_action+0x70/0x1dc) [] (net_rx_action+0x70/0x1dc) from [] (__do_softirq+0x11c/0x27c) [] (__do_softirq+0x11c/0x27c) from [] (do_softirq+0x44/0x50) [] (do_softirq+0x44/0x50) from [] (local_bh_enable_ip+0x8c/0xd0) [] (local_bh_enable_ip+0x8c/0xd0) from [] (inet_stream_connect+0x164/0x298) [] (inet_stream_connect+0x164/0x298) from [] (sys_connect+0x88/0xc8) [] (sys_connect+0x88/0xc8) from [] (ret_fast_syscall+0x0/0x30) Code: 2a000021 e59d2028 e59de01c e59f011c (e7824103) ---[ end trace da227214a82491bd ]--- Kernel panic - not syncing: Fatal exception in interrupt This comes about because CPU1 is executing xt_replace_table in response to a setsockopt syscall, resulting in: ret = xt_jumpstack_alloc(newinfo); --> newinfo->jumpstack = kzalloc(size, GFP_KERNEL); [...] table->private = newinfo; newinfo->initial_entries = private->initial_entries; Meanwhile, CPU0 is handling the network receive path and ends up in ipt_do_table, resulting in: private = table->private; [...] jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; On weakly ordered memory architectures, the writes to table->private and newinfo->jumpstack from CPU1 can be observed out of order by CPU0. Furthermore, on architectures which don't respect ordering of address dependencies (i.e. Alpha), the reads from CPU0 can also be re-ordered. This patch adds an smp_wmb() before the assignment to table->private (which is essentially publishing newinfo) to ensure that all writes to newinfo will be observed before plugging it into the table structure. A dependent-read barrier is also added on the consumer sides, to ensure the same ordering requirements are also respected there. Cc: Paul E. McKenney Reported-by: Wang, Yalin Tested-by: Wang, Yalin Signed-off-by: Will Deacon Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 5 +++++ net/ipv4/netfilter/ip_tables.c | 5 +++++ net/ipv6/netfilter/ip6_tables.c | 5 +++++ net/netfilter/x_tables.c | 7 ++++++- 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 85a4f21aac1a..59da7cde0724 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d23118d95ff9..718dfbd30cbe 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb, addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 44400c216dc6..710238f58aa9 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -349,6 +349,11 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); cpu = smp_processor_id(); table_base = private->entries[cpu]; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8b03028cca69..227aa11e8409 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table, return NULL; } - table->private = newinfo; newinfo->initial_entries = private->initial_entries; + /* + * Ensure contents of newinfo are visible before assigning to + * private. + */ + smp_wmb(); + table->private = newinfo; /* * Even though table entries have now been swapped, other CPU's -- cgit v1.2.3 From bc15afa39ecc16f01c3389d15d8f6015a427fe85 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:44:25 -0700 Subject: tcp: fix SYNACK RTT estimation in Fast Open tp->lsndtime may not always be the SYNACK timestamp if a passive Fast Open socket sends data before handshake completes. And if the remote acknowledges both the data and the SYNACK, the RTT sample is already taken in tcp_ack(), so no need to call tcp_update_ack_rtt() in tcp_synack_rtt_meas() aagain. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a16b01b537ba..305cd0526a78 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2871,14 +2871,19 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); s32 seq_rtt = -1; - if (tp->lsndtime && !tp->total_retrans) - seq_rtt = tcp_time_stamp - tp->lsndtime; - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (synack_stamp && !tp->total_retrans) + seq_rtt = tcp_time_stamp - synack_stamp; + + /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets + * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() + */ + if (!tp->srtt) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) @@ -5587,6 +5592,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct request_sock *req; int queued = 0; bool acceptable; + u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5669,9 +5675,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { + synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { + synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5694,7 +5702,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, req); + tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -- cgit v1.2.3 From 2909d874f34eae157aecab0af27c6dc4a1751f8f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:55:25 -0700 Subject: tcp: only take RTT from timestamps if new data is acked Patch ed08495c3 "tcp: use RTT from SACK for RTO" has a bug that it does not check if the ACK acknowledge new data before taking the RTT sample from TCP timestamps. This patch adds the check back as required by the RFC. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 305cd0526a78..6ffe41a82c00 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2856,7 +2856,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + flag & FLAG_ACKED) seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; if (seq_rtt < 0) -- cgit v1.2.3 From 2f715c1dde6e1760f3101358dc26f8c9489be0bf Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:59:27 -0700 Subject: tcp: do not rearm RTO when future data are sacked Patch ed08495c3 "tcp: use RTT from SACK for RTO" always re-arms RTO upon obtaining a RTT sample from newly sacked data. But technically RTO should only be re-armed when the data sent before the last (re)transmission of write queue head are (s)acked. Otherwise the RTO may continue to extend during loss recovery on data sent in the future. Note that RTTs from ACK or timestamps do not have this problem, as the RTT source must be from data sent before. The new RTO re-arm policy is 1) Always re-arm RTO if SND.UNA is advanced 2) Re-arm RTO if sack RTT is available, provided the sacked data was sent before the last time write_queue_head was sent. Signed-off-by: Larry Brakmo Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6ffe41a82c00..068c8fb0d158 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2987,6 +2987,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); + bool rtt_update; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3063,14 +3064,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || - (flag & FLAG_ACKED)) - tcp_rearm_rto(sk); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3109,6 +3109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_ops->pkts_acked(sk, pkts_acked, rtt_us); } + } else if (skb && rtt_update && sack_rtt >= 0 && + sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. + */ + tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 -- cgit v1.2.3 From eeb1b73378b560e00ff1da2ef09fed9254f4e128 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Oct 2013 10:21:32 +0200 Subject: xfrm: Increase the garbage collector threshold With the removal of the routing cache, we lost the option to tweak the garbage collector threshold along with the maximum routing cache size. So git commit 703fb94ec ("xfrm: Fix the gc threshold value for ipv4") moved back to a static threshold. It turned out that the current threshold before we start garbage collecting is much to small for some workloads, so increase it from 1024 to 32768. This means that we start the garbage collector if we have more than 32768 dst entries in the system and refuse new allocations if we are above 65536. Reported-by: Wolfgang Walter Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index ccde54248c8c..4764ee48447c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -236,7 +236,7 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 08ed2772b7aa..dd503a3535ff 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = { .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { -- cgit v1.2.3 From 0d08c42cf9a71530fef5ebcfe368f38f2dd0476f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Oct 2013 17:26:17 -0700 Subject: tcp: gso: fix truesize tracking commit 6ff50cd55545 ("tcp: gso: do not generate out of order packets") had an heuristic that can trigger a warning in skb_try_coalesce(), because skb->truesize of the gso segments were exactly set to mss. This breaks the requirement that skb->truesize >= skb->len + truesizeof(struct sk_buff); It can trivially be reproduced by : ifconfig lo mtu 1500 ethtool -K lo tso off netperf As the skbs are looped into the TCP networking stack, skb_try_coalesce() warns us of these skb under-estimating their truesize. Signed-off-by: Eric Dumazet Reported-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3a7525e6c086..533c58a5cfb7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -18,6 +18,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int sum_truesize = 0; struct tcphdr *th; unsigned int thlen; unsigned int seq; @@ -102,13 +103,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { skb->destructor = gso_skb->destructor; skb->sk = gso_skb->sk; - /* {tcp|sock}_wfree() use exact truesize accounting : - * sum(skb->truesize) MUST be exactly be gso_skb->truesize - * So we account mss bytes of 'true size' for each segment. - * The last segment will contain the remaining. - */ - skb->truesize = mss; - gso_skb->truesize -= mss; + sum_truesize += skb->truesize; } skb = skb->next; th = tcp_hdr(skb); @@ -125,7 +120,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); - swap(gso_skb->truesize, skb->truesize); + sum_truesize += skb->truesize; + atomic_add(sum_truesize - gso_skb->truesize, + &skb->sk->sk_wmem_alloc); } delta = htonl(oldlen + (skb_tail_pointer(skb) - -- cgit v1.2.3 From 84502b5ef9849a9694673b15c31bd3ac693010ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 30 Oct 2013 11:16:28 +0100 Subject: xfrm: Fix null pointer dereference when decoding sessions On some codepaths the skb does not have a dst entry when xfrm_decode_session() is called. So check for a valid skb_dst() before dereferencing the device interface index. We use 0 as the device index if there is no valid skb_dst(), or at reverse decoding we use skb_iif as device interface index. Bug was introduced with git commit bafd4bd4dc ("xfrm: Decode sessions with output interface."). Reported-by: Meelis Roos Tested-by: Meelis Roos Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 6 +++++- net/ipv6/xfrm6_policy.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4764ee48447c..e1a63930a967 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -104,10 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; - fl4->flowi4_oif = skb_dst(skb)->dev->ifindex; + fl4->flowi4_oif = reverse ? skb->skb_iif : oif; if (!ip_is_fragment(iph)) { switch (iph->protocol) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index dd503a3535ff..5f8e128c512d 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -135,10 +135,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u8 nexthdr = nh[IP6CB(skb)->nhoff]; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; - fl6->flowi6_oif = skb_dst(skb)->dev->ifindex; + fl6->flowi6_oif = reverse ? skb->skb_iif : oif; fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; -- cgit v1.2.3