From 64380a04deeed4720de76b086a3a4eab8dd41671 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Tue, 11 Feb 2014 11:38:26 +0100 Subject: tipc: fix message corruption bug for deferred packets If a packet received on a link is out-of-sequence, it will be placed on a deferred queue and later reinserted in the receive path once the preceding packets have been processed. The problem with this is that it will be subject to the buffer adjustment from link_recv_buf_validate twice. The second adjustment for 20 bytes header space will corrupt the packet. We solve this by tagging the deferred packets and bail out from receive buffer validation for packets that have already been subjected to this. Signed-off-by: Erik Hugne Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.h | 1 + net/tipc/link.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index 1ff477b0450d..5569d96b4da3 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -192,6 +192,7 @@ static inline void k_term_timer(struct timer_list *timer) struct tipc_skb_cb { void *handle; + bool deferred; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) diff --git a/net/tipc/link.c b/net/tipc/link.c index d4b5de41b682..da6018beb6eb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1391,6 +1391,12 @@ static int link_recv_buf_validate(struct sk_buff *buf) u32 hdr_size; u32 min_hdr_size; + /* If this packet comes from the defer queue, the skb has already + * been validated + */ + if (unlikely(TIPC_SKB_CB(buf)->deferred)) + return 1; + if (unlikely(buf->len < MIN_H_SIZE)) return 0; @@ -1703,6 +1709,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, &l_ptr->newest_deferred_in, buf)) { l_ptr->deferred_inqueue_sz++; l_ptr->stats.deferred_recv++; + TIPC_SKB_CB(buf)->deferred = true; if ((l_ptr->deferred_inqueue_sz % 16) == 1) tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } else -- cgit v1.2.3 From 0e0eee2465df77bcec2e8ff75432b8e57897b143 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 11 Feb 2014 15:51:30 -0800 Subject: net: correct error path in rtnl_newlink() I saw the following BUG when ->newlink() fails in rtnl_newlink(): [ 40.240058] kernel BUG at net/core/dev.c:6438! this is due to free_netdev() is not supposed to be called before netdev is completely unregistered, therefore it is not correct to call free_netdev() here, at least for ops->newlink!=NULL case, many drivers call it in ->destructor so that rtnl_unlock() will take care of it, we probably don't need to do anything here. Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 048dc8d183aa..1a0dac2ef9ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1963,16 +1963,21 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) + if (ops->newlink) { err = ops->newlink(net, dev, tb, data); - else + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure so that device could be + * finally freed in rtnl_unlock. + */ + if (err < 0) + goto out; + } else { err = register_netdevice(dev); - - if (err < 0) { - free_netdev(dev); - goto out; + if (err < 0) { + free_netdev(dev); + goto out; + } } - err = rtnl_configure_link(dev, ifm); if (err < 0) unregister_netdevice(dev); -- cgit v1.2.3 From 22a1f5140ed69baa5906ce9b2b9143478f5a4da0 Mon Sep 17 00:00:00 2001 From: wangweidong Date: Wed, 12 Feb 2014 09:44:43 +0800 Subject: sctp: fix a missed .data initialization As commit 3c68198e75111a90("sctp: Make hmac algorithm selection for cookie generation dynamic"), we miss the .data initialization. If we don't use the net_namespace, the problem that parts of the sysctl configuration won't be isolation and won't occur. In sctp_sysctl_net_register(), we register the sysctl for each net, in the for(), we use the 'table[i].data' as check condition, so when the 'i' is the index of sctp_hmac_alg, the data is NULL, then break. So add the .data initialization. Acked-by: Neil Horman Signed-off-by: Wang Weidong Signed-off-by: David S. Miller --- net/sctp/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7135e617ab0f..d354de5a6fe0 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -151,6 +151,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", + .data = &init_net.sctp.sctp_hmac_alg, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, -- cgit v1.2.3 From efb842c45e667332774196bd5a1d539d048159cc Mon Sep 17 00:00:00 2001 From: wangweidong Date: Wed, 12 Feb 2014 09:44:44 +0800 Subject: sctp: optimize the sctp_sysctl_net_register Here, when the net is init_net, we needn't to kmemdup the ctl_table again. So add a check for net. Also we can save some memory. Signed-off-by: Wang Weidong Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sysctl.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index d354de5a6fe0..35c8923b5554 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -402,15 +402,18 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, int sctp_sysctl_net_register(struct net *net) { - struct ctl_table *table; - int i; + struct ctl_table *table = sctp_net_table; + + if (!net_eq(net, &init_net)) { + int i; - table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL); - if (!table) - return -ENOMEM; + table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL); + if (!table) + return -ENOMEM; - for (i = 0; table[i].data; i++) - table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + for (i = 0; table[i].data; i++) + table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + } net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); return 0; -- cgit v1.2.3 From d206940319c41df4299db75ed56142177bb2e5f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Feb 2014 23:09:11 +0100 Subject: net: core: introduce netif_skb_dev_features Will be used by upcoming ipv4 forward path change that needs to determine feature mask using skb->dst->dev instead of skb->dev. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 440a02ee6f92..21d4e6be8949 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3068,7 +3068,12 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -netdev_features_t netif_skb_features(struct sk_buff *skb); +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev); +static inline netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + return netif_skb_dev_features(skb, skb->dev); +} static inline bool net_gso_ok(netdev_features_t features, int gso_type) { diff --git a/net/core/dev.c b/net/core/dev.c index 4ad1b78c9c77..b1b0c8d4d7df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2420,7 +2420,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2495,34 +2495,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - netdev_features_t features) + const struct net_device *dev, + netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(skb->dev, skb)) { + } else if (illegal_highdma(dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev) { __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; + netdev_features_t features = dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2530,9 +2532,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } -EXPORT_SYMBOL(netif_skb_features); +EXPORT_SYMBOL(netif_skb_dev_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) -- cgit v1.2.3 From fe6cc55f3a9a053482a76f5a6b2257cee51b4663 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Feb 2014 23:09:12 +0100 Subject: net: ip, ipv6: handle gso skbs in forwarding path Marcelo Ricardo Leitner reported problems when the forwarding link path has a lower mtu than the incoming one if the inbound interface supports GRO. Given: Host R1 R2 Host sends tcp stream which is routed via R1 and R2. R1 performs GRO. In this case, the kernel will fail to send ICMP fragmentation needed messages (or pkt too big for ipv6), as GSO packets currently bypass dstmtu checks in forward path. Instead, Linux tries to send out packets exceeding the mtu. When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does not fragment the packets when forwarding, and again tries to send out packets exceeding R1-R2 link mtu. This alters the forwarding dstmtu checks to take the individual gso segment lengths into account. For ipv6, we send out pkt too big error for gso if the individual segments are too big. For ipv4, we either send icmp fragmentation needed, or, if the DF bit is not set, perform software segmentation and let the output path create fragments when the packet is leaving the machine. It is not 100% correct as the error message will contain the headers of the GRO skb instead of the original/segmented one, but it seems to work fine in my (limited) tests. Eric Dumazet suggested to simply shrink mss via ->gso_size to avoid sofware segmentation. However it turns out that skb_segment() assumes skb nr_frags is related to mss size so we would BUG there. I don't want to mess with it considering Herbert and Eric disagree on what the correct behavior should be. Hannes Frederic Sowa notes that when we would shrink gso_size skb_segment would then also need to deal with the case where SKB_MAX_FRAGS would be exceeded. This uses sofware segmentation in the forward path when we hit ipv4 non-DF packets and the outgoing link mtu is too small. Its not perfect, but given the lack of bug reports wrt. GRO fwd being broken this is a rare case anyway. Also its not like this could not be improved later once the dust settles. Acked-by: Herbert Xu Reported-by: Marcelo Ricardo Leitner Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 ++++++++++++ net/ipv4/ip_forward.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv6/ip6_output.c | 17 ++++++++++-- 3 files changed, 101 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f589c9af8cbf..3ebbbe7b6d05 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2916,5 +2916,22 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e9f1217a8afd..f3869c186d97 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,71 @@ #include #include +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + !skb->local_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) +{ + unsigned int mtu; + + if (skb->local_df || !skb_is_gso(skb)) + return false; + + mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); + + /* if seglen > mtu, do software segmentation for IP fragmentation on + * output. DF bit cannot be set since ip_forward would have sent + * icmp error. + */ + return skb_gso_network_seglen(skb) > mtu; +} + +/* called if GSO skb needs to be fragmented on forward */ +static int ip_forward_finish_gso(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + features = netif_skb_dev_features(skb, dst->dev); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = dst_output(segs); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + if (ip_gso_exceeds_dst_mtu(skb)) + return ip_forward_finish_gso(skb); + return dst_output(skb); } @@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (unlikely(skb->len > mtu && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef02b26ccf81..070a2fae2375 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -342,6 +342,20 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -466,8 +480,7 @@ int ip6_forward(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || - (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -- cgit v1.2.3 From 219e288e8900fac65211e0a23e2a1037fd521af1 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Wed, 12 Feb 2014 18:58:21 -0800 Subject: net: sched: Cleanup PIE comments Fix incorrect comment reported by Norbert Kiesel. Edit another comment to add more details. Also add references to algorithm (IETF draft and paper) to top of file. Signed-off-by: Vijay Subramanian CC: Mythili Prabhu CC: Norbert Kiesel Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index a255d0200a59..fefeeb73f15f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -15,6 +15,11 @@ * * ECN support is added by Naeem Khademi * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" */ #include @@ -36,7 +41,7 @@ struct pie_params { psched_time_t target; /* user specified target delay in pschedtime */ u32 tupdate; /* timer frequency (in jiffies) */ u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between -4 and 4 */ + u32 alpha; /* alpha and beta are between 0 and 32 */ u32 beta; /* and are used for shift relative to 1 */ bool ecn; /* true if ecn is enabled */ bool bytemode; /* to scale drop early prob based on pkt size */ @@ -326,10 +331,16 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qlen != 0) update_prob = false; - /* Add ranges for alpha and beta, more aggressive for high dropping - * mode and gentle steps for light dropping mode - * In light dropping mode, take gentle steps; in medium dropping mode, - * take medium steps; in high dropping mode, take big steps. + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. */ if (q->vars.prob < MAX_PROB / 100) { alpha = -- cgit v1.2.3 From 357137a4222b1984b3d867923c7e388669744ceb Mon Sep 17 00:00:00 2001 From: FX Le Bail Date: Mon, 10 Feb 2014 16:46:54 +0100 Subject: ipv4: ipconfig.c: add parentheses in an if statement Even if the 'time_before' macro expand with parentheses, the look is bad. Signed-off-by: Francois-Xavier Le Bail Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa523..b3e86ea7b71b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void) msleep(1); - if time_before(jiffies, next_msg) + if (time_before(jiffies, next_msg)) continue; elapsed = jiffies_to_msecs(jiffies - start); -- cgit v1.2.3 From 09db30805300e9ed5ad43d4d339115cf1d9c84e1 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 13 Feb 2014 19:02:33 -0700 Subject: dccp: re-enable debug macro dccp tfrc: revert This reverts 6aee49c558de ("dccp: make local variable static") since the variable tfrc_debug is referenced by the tfrc_pr_debug(fmt, ...) macro when TFRC debugging is enabled. If it is enabled, use of the macro produces a compilation error. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/lib/tfrc.c | 2 +- net/dccp/ccids/lib/tfrc.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c index c073b81a1f3e..62b5828acde0 100644 --- a/net/dccp/ccids/lib/tfrc.c +++ b/net/dccp/ccids/lib/tfrc.c @@ -8,7 +8,7 @@ #include "tfrc.h" #ifdef CONFIG_IP_DCCP_TFRC_DEBUG -static bool tfrc_debug; +bool tfrc_debug; module_param(tfrc_debug, bool, 0644); MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); #endif diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index a3d8f7c76ae0..40ee7d62b652 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -21,6 +21,7 @@ #include "packet_history.h" #ifdef CONFIG_IP_DCCP_TFRC_DEBUG +extern bool tfrc_debug; #define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) #else #define tfrc_pr_debug(format, a...) -- cgit v1.2.3 From cd0f0b95fd2cd2b716caf5f15db73ab76992789b Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Fri, 14 Feb 2014 18:26:22 +0800 Subject: ipv4: distinguish EHOSTUNREACH from the ENETUNREACH since commit 251da413("ipv4: Cache ip_error() routes even when not forwarding."), the counter IPSTATS_MIB_INADDRERRORS can't work correctly, because the value of err was always set to ENETUNREACH. Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/route.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 25071b48921c..6f6dd85bdffc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1695,8 +1695,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + err = -EHOSTUNREACH; goto no_route; + } RT_CACHE_STAT_INC(in_slow_tot); @@ -1712,8 +1715,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; -- cgit v1.2.3 From ef2820a735f74ea60335f8ba3801b844f0cb184d Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic Date: Fri, 14 Feb 2014 14:51:18 +0100 Subject: net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer Implementation of (a)rwnd calculation might lead to severe performance issues and associations completely stalling. These problems are described and solution is proposed which improves lksctp's robustness in congestion state. 1) Sudden drop of a_rwnd and incomplete window recovery afterwards Data accounted in sctp_assoc_rwnd_decrease takes only payload size (sctp data), but size of sk_buff, which is blamed against receiver buffer, is not accounted in rwnd. Theoretically, this should not be the problem as actual size of buffer is double the amount requested on the socket (SO_RECVBUF). Problem here is that this will have bad scaling for data which is less then sizeof sk_buff. E.g. in 4G (LTE) networks, link interfacing radio side will have a large portion of traffic of this size (less then 100B). An example of sudden drop and incomplete window recovery is given below. Node B exhibits problematic behavior. Node A initiates association and B is configured to advertise rwnd of 10000. A sends messages of size 43B (size of typical sctp message in 4G (LTE) network). On B data is left in buffer by not reading socket in userspace. Lets examine when we will hit pressure state and declare rwnd to be 0 for scenario with above stated parameters (rwnd == 10000, chunk size == 43, each chunk is sent in separate sctp packet) Logic is implemented in sctp_assoc_rwnd_decrease: socket_buffer (see below) is maximum size which can be held in socket buffer (sk_rcvbuf). current_alloced is amount of data currently allocated (rx_count) A simple expression is given for which it will be examined after how many packets for above stated parameters we enter pressure state: We start by condition which has to be met in order to enter pressure state: socket_buffer < currently_alloced; currently_alloced is represented as size of sctp packets received so far and not yet delivered to userspace. x is the number of chunks/packets (since there is no bundling, and each chunk is delivered in separate packet, we can observe each chunk also as sctp packet, and what is important here, having its own sk_buff): socket_buffer < x*each_sctp_packet; each_sctp_packet is sctp chunk size + sizeof(struct sk_buff). socket_buffer is twice the amount of initially requested size of socket buffer, which is in case of sctp, twice the a_rwnd requested: 2*rwnd < x*(payload+sizeof(struc sk_buff)); sizeof(struct sk_buff) is 190 (3.13.0-rc4+). Above is stated that rwnd is 10000 and each payload size is 43 20000 < x(43+190); x > 20000/233; x ~> 84; After ~84 messages, pressure state is entered and 0 rwnd is advertised while received 84*43B ~= 3612B sctp data. This is why external observer notices sudden drop from 6474 to 0, as it will be now shown in example: IP A.34340 > B.12345: sctp (1) [INIT] [init tag: 1875509148] [rwnd: 81920] [OS: 10] [MIS: 65535] [init TSN: 1096057017] IP B.12345 > A.34340: sctp (1) [INIT ACK] [init tag: 3198966556] [rwnd: 10000] [OS: 10] [MIS: 10] [init TSN: 902132839] IP A.34340 > B.12345: sctp (1) [COOKIE ECHO] IP B.12345 > A.34340: sctp (1) [COOKIE ACK] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057017] [SID: 0] [SSEQ 0] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057017] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057018] [SID: 0] [SSEQ 1] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057018] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057019] [SID: 0] [SSEQ 2] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057019] [a_rwnd 9914] [#gap acks 0] [#dup tsns 0] <...> IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057098] [SID: 0] [SSEQ 81] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057098] [a_rwnd 6517] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057099] [SID: 0] [SSEQ 82] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057099] [a_rwnd 6474] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057100] [SID: 0] [SSEQ 83] [PPID 0x18] --> Sudden drop IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] At this point, rwnd_press stores current rwnd value so it can be later restored in sctp_assoc_rwnd_increase. This however doesn't happen as condition to start slowly increasing rwnd until rwnd_press is returned to rwnd is never met. This condition is not met since rwnd, after it hit 0, must first reach rwnd_press by adding amount which is read from userspace. Let us observe values in above example. Initial a_rwnd is 10000, pressure was hit when rwnd was ~6500 and the amount of actual sctp data currently waiting to be delivered to userspace is ~3500. When userspace starts to read, sctp_assoc_rwnd_increase will be blamed only for sctp data, which is ~3500. Condition is never met, and when userspace reads all data, rwnd stays on 3569. IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 1505] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 3010] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057101] [SID: 0] [SSEQ 84] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057101] [a_rwnd 3569] [#gap acks 0] [#dup tsns 0] --> At this point userspace read everything, rwnd recovered only to 3569 IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057102] [SID: 0] [SSEQ 85] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057102] [a_rwnd 3569] [#gap acks 0] [#dup tsns 0] Reproduction is straight forward, it is enough for sender to send packets of size less then sizeof(struct sk_buff) and receiver keeping them in its buffers. 2) Minute size window for associations sharing the same socket buffer In case multiple associations share the same socket, and same socket buffer (sctp.rcvbuf_policy == 0), different scenarios exist in which congestion on one of the associations can permanently drop rwnd of other association(s). Situation will be typically observed as one association suddenly having rwnd dropped to size of last packet received and never recovering beyond that point. Different scenarios will lead to it, but all have in common that one of the associations (let it be association from 1)) nearly depleted socket buffer, and the other association blames socket buffer just for the amount enough to start the pressure. This association will enter pressure state, set rwnd_press and announce 0 rwnd. When data is read by userspace, similar situation as in 1) will occur, rwnd will increase just for the size read by userspace but rwnd_press will be high enough so that association doesn't have enough credit to reach rwnd_press and restore to previous state. This case is special case of 1), being worse as there is, in the worst case, only one packet in buffer for which size rwnd will be increased. Consequence is association which has very low maximum rwnd ('minute size', in our case down to 43B - size of packet which caused pressure) and as such unusable. Scenario happened in the field and labs frequently after congestion state (link breaks, different probabilities of packet drop, packet reordering) and with scenario 1) preceding. Here is given a deterministic scenario for reproduction: >From node A establish two associations on the same socket, with rcvbuf_policy being set to share one common buffer (sctp.rcvbuf_policy == 0). On association 1 repeat scenario from 1), that is, bring it down to 0 and restore up. Observe scenario 1). Use small payload size (here we use 43). Once rwnd is 'recovered', bring it down close to 0, as in just one more packet would close it. This has as a consequence that association number 2 is able to receive (at least) one more packet which will bring it in pressure state. E.g. if association 2 had rwnd of 10000, packet received was 43, and we enter at this point into pressure, rwnd_press will have 9957. Once payload is delivered to userspace, rwnd will increase for 43, but conditions to restore rwnd to original state, just as in 1), will never be satisfied. --> Association 1, between A.y and B.12345 IP A.55915 > B.12345: sctp (1) [INIT] [init tag: 836880897] [rwnd: 10000] [OS: 10] [MIS: 65535] [init TSN: 4032536569] IP B.12345 > A.55915: sctp (1) [INIT ACK] [init tag: 2873310749] [rwnd: 81920] [OS: 10] [MIS: 10] [init TSN: 3799315613] IP A.55915 > B.12345: sctp (1) [COOKIE ECHO] IP B.12345 > A.55915: sctp (1) [COOKIE ACK] --> Association 2, between A.z and B.12346 IP A.55915 > B.12346: sctp (1) [INIT] [init tag: 534798321] [rwnd: 10000] [OS: 10] [MIS: 65535] [init TSN: 2099285173] IP B.12346 > A.55915: sctp (1) [INIT ACK] [init tag: 516668823] [rwnd: 81920] [OS: 10] [MIS: 10] [init TSN: 3676403240] IP A.55915 > B.12346: sctp (1) [COOKIE ECHO] IP B.12346 > A.55915: sctp (1) [COOKIE ACK] --> Deplete socket buffer by sending messages of size 43B over association 1 IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315613] [SID: 0] [SSEQ 0] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315613] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] <...> IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315696] [a_rwnd 6388] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315697] [SID: 0] [SSEQ 84] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315697] [a_rwnd 6345] [#gap acks 0] [#dup tsns 0] --> Sudden drop on 1 IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315698] [SID: 0] [SSEQ 85] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315698] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Here userspace read, rwnd 'recovered' to 3698, now deplete again using association 1 so there is place in buffer for only one more packet IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315799] [SID: 0] [SSEQ 186] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315799] [a_rwnd 86] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315800] [SID: 0] [SSEQ 187] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 43] [#gap acks 0] [#dup tsns 0] --> Socket buffer is almost depleted, but there is space for one more packet, send them over association 2, size 43B IP B.12346 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3676403240] [SID: 0] [SSEQ 0] [PPID 0x18] IP A.55915 > B.12346: sctp (1) [SACK] [cum ack 3676403240] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Immediate drop IP A.60995 > B.12346: sctp (1) [SACK] [cum ack 387491510] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Read everything from the socket, both association recover up to maximum rwnd they are capable of reaching, note that association 1 recovered up to 3698, and association 2 recovered only to 43 IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 1548] [#gap acks 0] [#dup tsns 0] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 3053] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315801] [SID: 0] [SSEQ 188] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315801] [a_rwnd 3698] [#gap acks 0] [#dup tsns 0] IP B.12346 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3676403241] [SID: 0] [SSEQ 1] [PPID 0x18] IP A.55915 > B.12346: sctp (1) [SACK] [cum ack 3676403241] [a_rwnd 43] [#gap acks 0] [#dup tsns 0] A careful reader might wonder why it is necessary to reproduce 1) prior reproduction of 2). It is simply easier to observe when to send packet over association 2 which will push association into the pressure state. Proposed solution: Both problems share the same root cause, and that is improper scaling of socket buffer with rwnd. Solution in which sizeof(sk_buff) is taken into concern while calculating rwnd is not possible due to fact that there is no linear relationship between amount of data blamed in increase/decrease with IP packet in which payload arrived. Even in case such solution would be followed, complexity of the code would increase. Due to nature of current rwnd handling, slow increase (in sctp_assoc_rwnd_increase) of rwnd after pressure state is entered is rationale, but it gives false representation to the sender of current buffer space. Furthermore, it implements additional congestion control mechanism which is defined on implementation, and not on standard basis. Proposed solution simplifies whole algorithm having on mind definition from rfc: o Receiver Window (rwnd): This gives the sender an indication of the space available in the receiver's inbound buffer. Core of the proposed solution is given with these lines: sctp_assoc_rwnd_update: if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; else asoc->rwnd = 0; We advertise to sender (half of) actual space we have. Half is in the braces depending whether you would like to observe size of socket buffer as SO_RECVBUF or twice the amount, i.e. size is the one visible from userspace, that is, from kernelspace. In this way sender is given with good approximation of our buffer space, regardless of the buffer policy - we always advertise what we have. Proposed solution fixes described problems and removes necessity for rwnd restoration algorithm. Finally, as proposed solution is simplification, some lines of code, along with some bytes in struct sctp_association are saved. Version 2 of the patch addressed comments from Vlad. Name of the function is set to be more descriptive, and two parts of code are changed, in one removing the superfluous call to sctp_assoc_rwnd_update since call would not result in update of rwnd, and the other being reordering of the code in a way that call to sctp_assoc_rwnd_update updates rwnd. Version 3 corrected change introduced in v2 in a way that existing function is not reordered/copied in line, but it is correctly called. Thanks Vlad for suggesting. Signed-off-by: Matija Glavinic Pecotic Reviewed-by: Alexander Sverdlin Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 14 +------- net/sctp/associola.c | 82 ++++++++++------------------------------------ net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 6 ---- net/sctp/ulpevent.c | 8 +++-- 5 files changed, 25 insertions(+), 87 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d992ca3145fe..6ee76c804893 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1653,17 +1653,6 @@ struct sctp_association { /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; - /* Number of bytes by which the rwnd has slopped. The rwnd is allowed - * to slop over a maximum of the association's frag_point. - */ - __u32 rwnd_over; - - /* Keeps treack of rwnd pressure. This happens when we have - * a window, but not recevie buffer (i.e small packets). This one - * is releases slowly (1 PMTU at a time ). - */ - __u32 rwnd_press; - /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. @@ -1892,8 +1881,7 @@ void sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *); -void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); -void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); +void sctp_assoc_rwnd_update(struct sctp_association *, bool); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5ae609200674..f558433537b8 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1367,44 +1367,35 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) return false; } -/* Increase asoc's rwnd by len and send any window update SACK if needed. */ -void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) +/* Update asoc's rwnd for the approximated state in the buffer, + * and check whether SACK needs to be sent. + */ +void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) { + int rx_count; struct sctp_chunk *sack; struct timer_list *timer; - if (asoc->rwnd_over) { - if (asoc->rwnd_over >= len) { - asoc->rwnd_over -= len; - } else { - asoc->rwnd += (len - asoc->rwnd_over); - asoc->rwnd_over = 0; - } - } else { - asoc->rwnd += len; - } + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); - /* If we had window pressure, start recovering it - * once our rwnd had reached the accumulated pressure - * threshold. The idea is to recover slowly, but up - * to the initial advertised window. - */ - if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { - int change = min(asoc->pathmtu, asoc->rwnd_press); - asoc->rwnd += change; - asoc->rwnd_press -= change; - } + if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) + asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; + else + asoc->rwnd = 0; - pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", - __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, - asoc->a_rwnd); + pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", + __func__, asoc, asoc->rwnd, rx_count, + asoc->base.sk->sk_rcvbuf); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. * The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ - if (sctp_peer_needs_update(asoc)) { + if (update_peer && sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " @@ -1426,45 +1417,6 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) } } -/* Decrease asoc's rwnd by len. */ -void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) -{ - int rx_count; - int over = 0; - - if (unlikely(!asoc->rwnd || asoc->rwnd_over)) - pr_debug("%s: association:%p has asoc->rwnd:%u, " - "asoc->rwnd_over:%u!\n", __func__, asoc, - asoc->rwnd, asoc->rwnd_over); - - if (asoc->ep->rcvbuf_policy) - rx_count = atomic_read(&asoc->rmem_alloc); - else - rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); - - /* If we've reached or overflowed our receive buffer, announce - * a 0 rwnd if rwnd would still be positive. Store the - * the potential pressure overflow so that the window can be restored - * back to original value. - */ - if (rx_count >= asoc->base.sk->sk_rcvbuf) - over = 1; - - if (asoc->rwnd >= len) { - asoc->rwnd -= len; - if (over) { - asoc->rwnd_press += asoc->rwnd; - asoc->rwnd = 0; - } - } else { - asoc->rwnd_over = len - asoc->rwnd; - asoc->rwnd = 0; - } - - pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", - __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, - asoc->rwnd_press); -} /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 483dcd71b3c5..591b44d3b7de 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6176,7 +6176,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * PMTU. In cases, such as loopback, this might be a rather * large spill over. */ - if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || + if ((!chunk->data_accepted) && (!asoc->rwnd || (datalen > asoc->rwnd + asoc->frag_point))) { /* If this is the next TSN, consider reneging to make diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9e91d6e5df63..7075ac847fde 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2092,12 +2092,6 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); - /* When only partial message is copied to the user, increase - * rwnd by that amount. If all the data in the skb is read, - * rwnd is updated when the event is freed. - */ - if (!sctp_ulpevent_is_notification(event)) - sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 85c64658bd0b..8d198ae03606 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); - sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); + sctp_assoc_rwnd_update(asoc, false); if (!skb->data_len) return; @@ -1011,6 +1011,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; + struct sctp_association *asoc; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as @@ -1035,8 +1036,11 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) } done: - sctp_assoc_rwnd_increase(event->asoc, len); + asoc = event->asoc; + sctp_association_hold(asoc); sctp_ulpevent_release_owner(event); + sctp_assoc_rwnd_update(asoc, true); + sctp_association_put(asoc); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) -- cgit v1.2.3 From 99932d4fc03a13bb3e94938fe25458fabc8f2fc3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Feb 2014 15:55:20 +0100 Subject: netdevice: add queue selection fallback handler for ndo_select_queue Add a new argument for ndo_select_queue() callback that passes a fallback handler. This gets invoked through netdev_pick_tx(); fallback handler is currently __netdev_pick_tx() as most drivers invoke this function within their customized implementation in case for skbs that don't need any special handling. This fallback handler can then be replaced on other call-sites with different queue selection methods (e.g. in packet sockets, pktgen etc). This also has the nice side-effect that __netdev_pick_tx() is then only invoked from netdev_pick_tx() and export of that function to modules can be undone. Suggested-by: David S. Miller Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +++--- drivers/net/ethernet/lantiq_etop.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 2 +- drivers/net/ethernet/tile/tilegx.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/wireless/mwifiex/main.c | 2 +- drivers/staging/bcm/Bcmnet.c | 2 +- drivers/staging/netlogic/xlr_net.c | 2 +- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 2 +- include/linux/netdevice.h | 9 ++++++--- net/core/flow_dissector.c | 7 +++---- net/mac80211/iface.c | 6 ++++-- 17 files changed, 31 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 867664918715..1c6104d3501d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3707,7 +3707,7 @@ static inline int bond_slave_override(struct bonding *bond, static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 9d7419e0390b..66c0df78c3ff 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1873,7 +1873,7 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) } u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct bnx2x *bp = netdev_priv(dev); @@ -1895,7 +1895,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return __netdev_pick_tx(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); + return fallback(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); } void bnx2x_set_num_queues(struct bnx2x *bp) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index bfc58d488bb5..a89a40f88c25 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -496,7 +496,7 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6d4ada72dfd0..18076c4178b4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6881,7 +6881,7 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size) } static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; #ifdef IXGBE_FCOE @@ -6907,7 +6907,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) break; default: - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); } f = &adapter->ring_feature[RING_F_FCOE]; @@ -6920,7 +6920,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, return txq + f->offset; #else - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); #endif } diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 8f9266c64c75..fd4b6aecf6ee 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -619,7 +619,7 @@ ltq_etop_set_multicast_list(struct net_device *dev) static u16 ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* we are currently only using the first queue */ return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8e8a7eb43a2c..13457032d15f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -629,7 +629,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk } u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; @@ -641,7 +641,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, if (vlan_tx_tag_present(skb)) up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; - return __netdev_pick_tx(dev, skb) % rings_p_up + up * rings_p_up; + return fallback(dev, skb) % rings_p_up + up * rings_p_up; } static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 3af04c3f42ea..9ca223bc90fc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -723,7 +723,7 @@ int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 023237a65720..17503da9f7a5 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2071,7 +2071,7 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) /* Return subqueue id on this core (one per core). */ static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return smp_processor_id(); } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 28407426fd6f..c8624a8235ab 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1648,7 +1648,7 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44c4db8450f0..8fe9cb7d0f72 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -366,7 +366,7 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * hope the rxq no. may help here. */ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c index 4d79761b9c87..9d3d2758ec35 100644 --- a/drivers/net/wireless/mwifiex/main.c +++ b/drivers/net/wireless/mwifiex/main.c @@ -748,7 +748,7 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) static u16 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { skb->priority = cfg80211_classify8021d(skb, NULL); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c index 8dfdd2732bdc..95a2358267ba 100644 --- a/drivers/staging/bcm/Bcmnet.c +++ b/drivers/staging/bcm/Bcmnet.c @@ -40,7 +40,7 @@ static INT bcm_close(struct net_device *dev) } static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return ClassifyPacket(netdev_priv(dev), skb); } diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index eedffed17e39..6f9ac27730af 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -307,7 +307,7 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, } static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return (u16)smp_processor_id(); } diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index 68f98fa114d2..7c9ee58f47bb 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -653,7 +653,7 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 21d4e6be8949..1de9c136b066 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -752,6 +752,9 @@ struct netdev_phys_port_id { unsigned char id_len; }; +typedef u16 (*select_queue_fallback_t)(struct net_device *dev, + struct sk_buff *skb); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -783,7 +786,7 @@ struct netdev_phys_port_id { * Required can not be NULL. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv); + * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple * transmit queues. * @@ -1005,7 +1008,8 @@ struct net_device_ops { struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, + select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); @@ -1551,7 +1555,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb); /* * Net namespace inlines diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 87577d447554..75fe83f590ea 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -372,7 +372,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -392,7 +392,6 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } -EXPORT_SYMBOL(__netdev_pick_tx); struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, @@ -403,8 +402,8 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, - accel_priv); + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); else queue_index = __netdev_pick_tx(dev, skb); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d6d1f1df9119..ce1c44370610 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1057,7 +1057,8 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1075,7 +1076,8 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; -- cgit v1.2.3 From b9507bdaf40e91fea2b1c0c1ee7dc627c8ee6fd6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Feb 2014 15:55:21 +0100 Subject: netdevice: move netdev_cap_txqueue for shared usage to header In order to allow users to invoke netdev_cap_txqueue, it needs to be moved into netdevice.h header file. While at it, also add kernel doc header to document the API. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/core/flow_dissector.c | 13 +------------ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1de9c136b066..e8eeebd49a98 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2278,6 +2278,26 @@ static inline void netdev_reset_queue(struct net_device *dev_queue) netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); } +/** + * netdev_cap_txqueue - check if selected tx queue exceeds device queues + * @dev: network device + * @queue_index: given tx queue index + * + * Returns 0 if given tx queue index >= number of device tx queues, + * otherwise returns the originally passed tx queue index. + */ +static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + return 0; + } + + return queue_index; +} + /** * netif_running - test if up * @dev: network device diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 75fe83f590ea..e29e810663d7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -408,7 +397,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, queue_index = __netdev_pick_tx(dev, skb); if (!accel_priv) - queue_index = dev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); -- cgit v1.2.3 From 0fd5d57ba3456c4d0b77d1ae64be4818b47d7545 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Feb 2014 15:55:22 +0100 Subject: packet: check for ndo_select_queue during queue selection Mathias reported that on an AMD Geode LX embedded board (ALiX) with ath9k driver PACKET_QDISC_BYPASS, introduced in commit d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option"), triggers a WARN_ON() coming from the driver itself via 066dae93bdf ("ath9k: rework tx queue selection and fix queue stopping/waking"). The reason why this happened is that ndo_select_queue() call is not invoked from direct xmit path i.e. for ieee80211 subsystem that sets queue and TID (similar to 802.1d tag) which is being put into the frame through 802.11e (WMM, QoS). If that is not set, pending frame counter for e.g. ath9k can get messed up. So the WARN_ON() in ath9k is absolutely legitimate. Generally, the hw queue selection in ieee80211 depends on the type of traffic, and priorities are set according to ieee80211_ac_numbers mapping; working in a similar way as DiffServ only on a lower layer, so that the AP can favour frames that have "real-time" requirements like voice or video data frames. Therefore, check for presence of ndo_select_queue() in netdev ops and, if available, invoke it with a fallback handler to __packet_pick_tx_queue(), so that driver such as bnx2x, ixgbe, or mlx4 can still select a hw queue for transmission in relation to the current CPU while e.g. ieee80211 subsystem can make their own choices. Reported-by: Mathias Kretschmer Signed-off-by: Daniel Borkmann Cc: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/packet/af_packet.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6a2bb37506c5..b5dc1168f98a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -308,11 +308,27 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 packet_pick_tx_queue(struct net_device *dev) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) { return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; } +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + u16 queue_index; + + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb, NULL, + __packet_pick_tx_queue); + queue_index = netdev_cap_txqueue(dev, queue_index); + } else { + queue_index = __packet_pick_tx_queue(dev, skb); + } + + skb_set_queue_mapping(skb, queue_index); +} + /* register_prot_hook must be invoked with the po->bind_lock held, * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). @@ -2285,7 +2301,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + packet_pick_tx_queue(dev, skb); + skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); @@ -2499,7 +2516,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + + packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { -- cgit v1.2.3 From 930cd6e46eadce8b8ed2a232ee536e5fd286c152 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 21 Jan 2014 11:22:05 +0100 Subject: batman-adv: fix soft-interface MTU computation The current MTU computation always returns a value smaller than 1500bytes even if the real interfaces have an MTU large enough to compensate the batman-adv overhead. Fix the computation by properly returning the highest admitted value. Introduced by a19d3d85e1b854e4a483a55d740a42458085560d ("batman-adv: limit local translation table max size") Reported-by: Russell Senior Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/hard-interface.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3d417d3641c6..b851cc580853 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -241,7 +241,7 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; - int min_mtu = ETH_DATA_LEN; + int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -256,8 +256,6 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) } rcu_read_unlock(); - atomic_set(&bat_priv->packet_size_max, min_mtu); - if (atomic_read(&bat_priv->fragmentation) == 0) goto out; @@ -268,13 +266,21 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; - atomic_set(&bat_priv->packet_size_max, min_mtu); - - /* with fragmentation enabled we can fragment external packets easily */ - min_mtu = min_t(int, min_mtu, ETH_DATA_LEN); out: - return min_mtu - batadv_max_header_len(); + /* report to the other components the maximum amount of bytes that + * batman-adv can send over the wire (without considering the payload + * overhead). For example, this value is used by TT to compute the + * maximum local table table size + */ + atomic_set(&bat_priv->packet_size_max, min_mtu); + + /* the real soft-interface MTU is computed by removing the payload + * overhead from the maximum amount of bytes that was just computed. + * + * However batman-adv does not support MTUs bigger than ETH_DATA_LEN + */ + return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /* adjusts the MTU if a new interface with a smaller MTU appeared. */ -- cgit v1.2.3 From e889241f45f9cecbc84a6ffed577083ab52e62ee Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 27 Jan 2014 12:23:28 +0100 Subject: batman-adv: fix TT-TVLV parsing on OGM reception When accessing a TT-TVLV container in the OGM RX path the variable pointing to the list of changes to apply is altered by mistake. This makes the TT component read data at the wrong position in the OGM packet buffer. Fix it by removing the bogus pointer alteration. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/translation-table.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b6071f675a3e..beba13fbd10a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3218,7 +3218,6 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->tt_lock); - tt_change = (struct batadv_tvlv_tt_change *)tt_buff; batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); -- cgit v1.2.3 From 91c2b1a9f680ff105369d49abc7e19ca7efb33e1 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 28 Jan 2014 02:06:47 +0100 Subject: batman-adv: release vlan object after checking the CRC There is a refcounter unbalance in the CRC checking routine invoked on OGM reception. A vlan object is retrieved (thus its refcounter is increased by one) but it is never properly released. This leads to a memleak because the vlan object will never be free'd. Fix this by releasing the vlan object after having read the CRC. Reported-by: Russell Senior Reported-by: Daniel Reported-by: cmsv Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/translation-table.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index beba13fbd10a..c21c5572c860 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2262,6 +2262,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; + uint32_t crc; int i; /* check if each received CRC matches the locally stored one */ @@ -2281,7 +2282,10 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, if (!vlan) return false; - if (vlan->tt.crc != ntohl(tt_vlan_tmp->crc)) + crc = vlan->tt.crc; + batadv_orig_node_vlan_free_ref(vlan); + + if (crc != ntohl(tt_vlan_tmp->crc)) return false; } -- cgit v1.2.3 From f1791425cf0bcda43ab9a9a37df1ad3ccb1f6654 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Thu, 30 Jan 2014 00:12:24 +0100 Subject: batman-adv: properly check pskb_may_pull return value pskb_may_pull() returns 1 on success and 0 in case of failure, therefore checking for the return value being negative does not make sense at all. This way if the function fails we will probably read beyond the current skb data buffer. Fix this by doing the proper check. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/routing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 1ed9f7c9ecea..c26f07368849 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -688,7 +688,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, int is_old_ttvn; /* check if there is enough data before accessing it */ - if (pskb_may_pull(skb, hdr_len + ETH_HLEN) < 0) + if (!pskb_may_pull(skb, hdr_len + ETH_HLEN)) return 0; /* create a copy of the skb (in case of for re-routing) to modify it. */ -- cgit v1.2.3 From 08bf0ed29c7ded45c477d08618220dd200c3524a Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 29 Jan 2014 11:25:12 +0100 Subject: batman-adv: avoid potential race condition when adding a new neighbour When adding a new neighbour it is important to atomically perform the following: - check if the neighbour already exists - append the neighbour to the proper list If the two operations are not performed in an atomic context it is possible that two concurrent insertions add the same neighbour twice. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 22 ++++++++++++++++------ net/batman-adv/originator.c | 36 ++++++++++++++++++++++++++++++++++++ net/batman-adv/originator.h | 4 ++++ 3 files changed, 56 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 512159bf607f..094ae7ca50a0 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -266,7 +266,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, struct batadv_orig_node *orig_neigh) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node; + struct batadv_neigh_node *neigh_node, *tmp_neigh_node; neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); if (!neigh_node) @@ -281,14 +281,24 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, neigh_node->orig_node = orig_neigh; neigh_node->if_incoming = hard_iface; - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, hard_iface->net_dev->name); - spin_lock_bh(&orig_node->neigh_list_lock); - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, + neigh_addr); + if (!tmp_neigh_node) { + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + } else { + kfree(neigh_node); + batadv_hardif_free_ref(hard_iface); + neigh_node = tmp_neigh_node; + } spin_unlock_bh(&orig_node->neigh_list_lock); + if (!tmp_neigh_node) + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, + hard_iface->net_dev->name); + out: return neigh_node; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 6df12a2e3605..853941629dc1 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -457,6 +457,42 @@ out: return neigh_node; } +/** + * batadv_neigh_node_get - retrieve a neighbour from the list + * @orig_node: originator which the neighbour belongs to + * @hard_iface: the interface where this neighbour is connected to + * @addr: the address of the neighbour + * + * Looks for and possibly returns a neighbour belonging to this originator list + * which is connected through the provided hard interface. + * Returns NULL if the neighbour is not found. + */ +struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr) +{ + struct batadv_neigh_node *tmp_neigh_node, *res = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { + if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) + continue; + + if (tmp_neigh_node->if_incoming != hard_iface) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + res = tmp_neigh_node; + break; + } + rcu_read_unlock(); + + return res; +} + /** * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object * @rcu: rcu pointer of the orig_ifinfo object diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 37be290f63f6..db3a9ed734cb 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -29,6 +29,10 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr); +struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, const uint8_t *neigh_addr, struct batadv_orig_node *orig_node); -- cgit v1.2.3 From b2262df7fcf2c395eca564df83238e931d88d7bf Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Sat, 8 Feb 2014 16:45:06 +0100 Subject: batman-adv: fix potential orig_node reference leak Since batadv_orig_node_new() sets the refcount to two, assuming that the calling function will use a reference for putting the orig_node into a hash or similar, both references must be freed if initialization of the orig_node fails. Otherwise that object may be leaked in that error case. Reported-by: Antonio Quartulli Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 094ae7ca50a0..42cbc0a68941 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -254,6 +254,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) free_bcast_own: kfree(orig_node->bat_iv.bcast_own); free_orig_node: + /* free twice, as batadv_orig_node_new sets refcount to 2 */ + batadv_orig_node_free_ref(orig_node); batadv_orig_node_free_ref(orig_node); return NULL; -- cgit v1.2.3 From a30e22ca8464c2dc573e0144a972221c2f06c2cd Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 11 Feb 2014 17:05:06 +0100 Subject: batman-adv: fix TT CRC computation by ensuring byte order When computing the CRC on a 2byte variable the order of the bytes obviously alters the final result. This means that computing the CRC over the same value on two archs having different endianess leads to different numbers. The global and local translation table CRC computation routine makes this mistake while processing the clients VIDs. The result is a continuous CRC mismatching between nodes having different endianess. Fix this by converting the VID to Network Order before processing it. This guarantees that every node uses the same byte order. Introduced by 7ea7b4a142758deaf46c1af0ca9ceca6dd55138b ("batman-adv: make the TT CRC logic VLAN specific") Reported-by: Russel Senior Signed-off-by: Antonio Quartulli Tested-by: Russell Senior Signed-off-by: Marek Lindner --- net/batman-adv/translation-table.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index c21c5572c860..959dde721c46 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1975,6 +1975,7 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2011,8 +2012,11 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, orig_node)) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. + */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes @@ -2046,6 +2050,7 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2064,8 +2069,11 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. + */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes -- cgit v1.2.3 From 05c3c8a636aa9ee35ce13f65afc5b665615cc786 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 11 Feb 2014 17:05:07 +0100 Subject: batman-adv: free skb on TVLV parsing success When the TVLV parsing routine succeed the skb is left untouched thus leading to a memory leak. Fix this by consuming the skb in case of success. Introduced by ef26157747d42254453f6b3ac2bd8bd3c53339c3 ("batman-adv: tvlv - basic infrastructure") Reported-by: Russel Senior Signed-off-by: Antonio Quartulli Tested-by: Russell Senior Signed-off-by: Marek Lindner --- net/batman-adv/routing.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index c26f07368849..a953d5b196a3 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -918,6 +918,8 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, if (ret != NET_RX_SUCCESS) ret = batadv_route_unicast_packet(skb, recv_if); + else + consume_skb(skb); return ret; } -- cgit v1.2.3 From a5a5cb8cab526af2f6cbe9715f8ca843192f0d81 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sat, 15 Feb 2014 02:17:20 +0100 Subject: batman-adv: avoid double free when orig_node initialization fails In the failure path of the orig_node initialization routine the orig_node->bat_iv.bcast_own field is free'd twice: first in batadv_iv_ogm_orig_get() and then later in batadv_orig_node_free_rcu(). Fix it by removing the kfree in batadv_iv_ogm_orig_get(). Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/bat_iv_ogm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 42cbc0a68941..8323bced8e5b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -241,18 +241,16 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) size = bat_priv->num_ifaces * sizeof(uint8_t); orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); if (!orig_node->bat_iv.bcast_own_sum) - goto free_bcast_own; + goto free_orig_node; hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) - goto free_bcast_own; + goto free_orig_node; return orig_node; -free_bcast_own: - kfree(orig_node->bat_iv.bcast_own); free_orig_node: /* free twice, as batadv_orig_node_new sets refcount to 2 */ batadv_orig_node_free_ref(orig_node); -- cgit v1.2.3 From 70b271a78beba787155d6696aacd7c4d4a251c50 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sat, 15 Feb 2014 21:50:37 +0100 Subject: batman-adv: fix potential kernel paging error for unicast transmissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batadv_send_skb_prepare_unicast(_4addr) might reallocate the skb's data. If it does then our ethhdr pointer is not valid anymore in batadv_send_skb_unicast(), resulting in a kernel paging error. Fixing this by refetching the ethhdr pointer after the potential reallocation. Signed-off-by: Linus Lüssing Signed-off-by: Antonio Quartulli --- net/batman-adv/send.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 579f5f00a385..843febd1e519 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -254,9 +254,9 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { - struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; - int ret = NET_XMIT_DROP; + int ret = NET_XMIT_DROP, hdr_size; if (!orig_node) goto out; @@ -265,12 +265,16 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, case BATADV_UNICAST: if (!batadv_send_skb_prepare_unicast(skb, orig_node)) goto out; + + hdr_size = sizeof(*unicast_packet); break; case BATADV_UNICAST_4ADDR: if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb, orig_node, packet_subtype)) goto out; + + hdr_size = sizeof(struct batadv_unicast_4addr_packet); break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It @@ -279,6 +283,7 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, goto out; } + ethhdr = (struct ethhdr *)(skb->data + hdr_size); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route -- cgit v1.2.3 From 08b44656c08c8c2f73cdac2a058be2880e3361f2 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 17 Feb 2014 14:22:21 +0100 Subject: gre: add link local route when local addr is any This bug was reported by Steinar H. Gunderson and was introduced by commit f7cb8886335d ("sit/gre6: don't try to add the same route two times"). root@morgental:~# ip tunnel add foo mode gre remote 1.2.3.4 ttl 64 root@morgental:~# ip link set foo up mtu 1468 root@morgental:~# ip -6 route show dev foo fe80::/64 proto kernel metric 256 but after the above commit, no such route shows up. There is no link local route because dev->dev_addr is 0 (because local ipv4 address is 0), hence no link local address is configured. In this scenario, the link local address is added manually: 'ip -6 addr add fe80::1 dev foo' and because prefix is /128, no link local route is added by the kernel. Even if the right things to do is to add the link local address with a /64 prefix, we need to restore the previous behavior to avoid breaking userpace. Reported-by: Steinar H. Gunderson Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad235690684c..fdbfeca36d63 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2783,6 +2783,8 @@ static void addrconf_gre_config(struct net_device *dev) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) addrconf_add_linklocal(idev, &addr); + else + addrconf_prefix_route(&addr, 64, dev, 0, 0); } #endif -- cgit v1.2.3 From a4b1b5877b514b276f0f31efe02388a9c2836728 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Thu, 19 Dec 2013 12:09:32 +0100 Subject: HID: Bluetooth: hidp: make sure input buffers are big enough HID core expects the input buffers to be at least of size 4096 (HID_MAX_BUFFER_SIZE). Other sizes will result in buffer-overflows if an input-report is smaller than advertised. We could, like i2c, compute the biggest report-size instead of using HID_MAX_BUFFER_SIZE, but this will blow up if report-descriptors are changed after ->start() has been called. So lets be safe and just use the biggest buffer we have. Note that this adds an additional copy to the HIDP input path. If there is a way to make sure the skb-buf is big enough, we should use that instead. The best way would be to make hid-core honor the @size argument, though, that sounds easier than it is. So lets just fix the buffer-overflows for now and afterwards look for a faster way for all transport drivers. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- net/bluetooth/hidp/core.c | 16 ++++++++++++++-- net/bluetooth/hidp/hidp.h | 4 ++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 292e619db896..d9fb93451442 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -430,6 +430,16 @@ static void hidp_del_timer(struct hidp_session *session) del_timer(&session->timer); } +static void hidp_process_report(struct hidp_session *session, + int type, const u8 *data, int len, int intr) +{ + if (len > HID_MAX_BUFFER_SIZE) + len = HID_MAX_BUFFER_SIZE; + + memcpy(session->input_buf, data, len); + hid_input_report(session->hid, type, session->input_buf, len, intr); +} + static void hidp_process_handshake(struct hidp_session *session, unsigned char param) { @@ -502,7 +512,8 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, hidp_input_report(session, skb); if (session->hid) - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 0); break; case HIDP_DATA_RTYPE_OTHER: @@ -584,7 +595,8 @@ static void hidp_recv_intr_frame(struct hidp_session *session, hidp_input_report(session, skb); if (session->hid) { - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 1); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 1); BT_DBG("report len %d", skb->len); } } else { diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index ab5241400cf7..8798492a6e99 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -24,6 +24,7 @@ #define __HIDP_H #include +#include #include #include #include @@ -179,6 +180,9 @@ struct hidp_session { /* Used in hidp_output_raw_report() */ int output_report_success; /* boolean */ + + /* temporary input buffer */ + u8 input_buf[HID_MAX_BUFFER_SIZE]; }; /* HIDP init defines */ -- cgit v1.2.3 From a6254864c08109c66a194612585afc0439005286 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Mon, 17 Feb 2014 15:23:43 +0800 Subject: ipv4: fix counter in_slow_tot since commit 89aef8921bf("ipv4: Delete routing cache."), the counter in_slow_tot can't work correctly. The counter in_slow_tot increase by one when fib_lookup() return successfully in ip_route_input_slow(), but actually the dst struct maybe not be created and cached, so we can increase in_slow_tot after the dst struct is created. Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6f6dd85bdffc..4c011ec69ed4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1597,6 +1597,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1701,8 +1702,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_BROADCAST) goto brd_input; @@ -1773,6 +1772,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; -- cgit v1.2.3 From ffd5939381c609056b33b7585fb05a77b4c695f3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 17 Feb 2014 12:11:11 +0100 Subject: net: sctp: fix sctp_connectx abi for ia32 emulation/compat mode SCTP's sctp_connectx() abi breaks for 64bit kernels compiled with 32bit emulation (e.g. ia32 emulation or x86_x32). Due to internal usage of 'struct sctp_getaddrs_old' which includes a struct sockaddr pointer, sizeof(param) check will always fail in kernel as the structure in 64bit kernel space is 4bytes larger than for user binaries compiled in 32bit mode. Thus, applications making use of sctp_connectx() won't be able to run under such circumstances. Introduce a compat interface in the kernel to deal with such situations by using a 'struct compat_sctp_getaddrs_old' structure where user data is copied into it, and then sucessively transformed into a 'struct sctp_getaddrs_old' structure with the help of compat_ptr(). That fixes sctp_connectx() abi without any changes needed in user space, and lets the SCTP test suite pass when compiled in 32bit and run on 64bit kernels. Fixes: f9c67811ebc0 ("sctp: Fix regression introduced by new sctp_connectx api") Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/socket.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7075ac847fde..981aaf8b6ace 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1368,11 +1369,19 @@ static int sctp_setsockopt_connectx(struct sock *sk, /* * New (hopefully final) interface for the API. * We use the sctp_getaddrs_old structure so that use-space library - * can avoid any unnecessary allocations. The only defferent part + * can avoid any unnecessary allocations. The only different part * is that we store the actual length of the address buffer into the - * addrs_num structure member. That way we can re-use the existing + * addrs_num structure member. That way we can re-use the existing * code. */ +#ifdef CONFIG_COMPAT +struct compat_sctp_getaddrs_old { + sctp_assoc_t assoc_id; + s32 addr_num; + compat_uptr_t addrs; /* struct sockaddr * */ +}; +#endif + static int sctp_getsockopt_connectx3(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -1381,16 +1390,30 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, sctp_assoc_t assoc_id = 0; int err = 0; - if (len < sizeof(param)) - return -EINVAL; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_sctp_getaddrs_old param32; - if (copy_from_user(¶m, optval, sizeof(param))) - return -EFAULT; + if (len < sizeof(param32)) + return -EINVAL; + if (copy_from_user(¶m32, optval, sizeof(param32))) + return -EFAULT; - err = __sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)param.addrs, - param.addr_num, &assoc_id); + param.assoc_id = param32.assoc_id; + param.addr_num = param32.addr_num; + param.addrs = compat_ptr(param32.addrs); + } else +#endif + { + if (len < sizeof(param)) + return -EINVAL; + if (copy_from_user(¶m, optval, sizeof(param))) + return -EFAULT; + } + err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *) + param.addrs, param.addr_num, + &assoc_id); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; -- cgit v1.2.3 From d7cf0c34af067555737193b6c1aa7abaa677f29c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Feb 2014 15:20:51 +0300 Subject: af_packet: remove a stray tab in packet_set_ring() At first glance it looks like there is a missing curly brace but actually the code works the same either way. I have adjusted the indenting but left the code the same. Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5dc1168f98a..48a6a93db296 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3804,7 +3804,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, */ if (!tx_ring) init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); - break; + break; default: break; } -- cgit v1.2.3