diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 09:41:05 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 09:41:05 -0800 |
commit | b0f85fa11aefc4f3e03306b4cd47f113bd57dcba (patch) | |
tree | 1333d36d99fde3f97210795941fc246f0ad08a75 /net/ipv4/ip_output.c | |
parent | ccc9d4a6d640cbde05d519edeb727881646cf71b (diff) | |
parent | f32bfb9a8ca083f8d148ea90ae5ba66f4831836e (diff) | |
download | linux-b0f85fa11aefc4f3e03306b4cd47f113bd57dcba.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
Changes of note:
1) Allow to schedule ICMP packets in IPVS, from Alex Gartrell.
2) Provide FIB table ID in ipv4 route dumps just as ipv6 does, from
David Ahern.
3) Allow the user to ask for the statistics to be filtered out of
ipv4/ipv6 address netlink dumps. From Sowmini Varadhan.
4) More work to pass the network namespace context around deep into
various packet path APIs, starting with the netfilter hooks. From
Eric W Biederman.
5) Add layer 2 TX/RX checksum offloading to qeth driver, from Thomas
Richter.
6) Use usec resolution for SYN/ACK RTTs in TCP, from Yuchung Cheng.
7) Support Very High Throughput in wireless MESH code, from Bob
Copeland.
8) Allow setting the ageing_time in switchdev/rocker. From Scott
Feldman.
9) Properly autoload L2TP type modules, from Stephen Hemminger.
10) Fix and enable offload features by default in 8139cp driver, from
David Woodhouse.
11) Support both ipv4 and ipv6 sockets in a single vxlan device, from
Jiri Benc.
12) Fix CWND limiting of thin streams in TCP, from Bendik Rønning
Opstad.
13) Fix IPSEC flowcache overflows on large systems, from Steffen
Klassert.
14) Convert bridging to track VLANs using rhashtable entries rather than
a bitmap. From Nikolay Aleksandrov.
15) Make TCP listener handling completely lockless, this is a major
accomplishment. Incoming request sockets now live in the
established hash table just like any other socket too.
From Eric Dumazet.
15) Provide more bridging attributes to netlink, from Nikolay
Aleksandrov.
16) Use hash based algorithm for ipv4 multipath routing, this was very
long overdue. From Peter Nørlund.
17) Several y2038 cures, mostly avoiding timespec. From Arnd Bergmann.
18) Allow non-root execution of EBPF programs, from Alexei Starovoitov.
19) Support SO_INCOMING_CPU as setsockopt, from Eric Dumazet. This
influences the port binding selection logic used by SO_REUSEPORT.
20) Add ipv6 support to VRF, from David Ahern.
21) Add support for Mellanox Spectrum switch ASIC, from Jiri Pirko.
22) Add rtl8xxxu Realtek wireless driver, from Jes Sorensen.
23) Implement RACK loss recovery in TCP, from Yuchung Cheng.
24) Support multipath routes in MPLS, from Roopa Prabhu.
25) Fix POLLOUT notification for listening sockets in AF_UNIX, from Eric
Dumazet.
26) Add new QED Qlogic river, from Yuval Mintz, Manish Chopra, and
Sudarsana Kalluru.
27) Don't fetch timestamps on AF_UNIX sockets, from Hannes Frederic
Sowa.
28) Support ipv6 geneve tunnels, from John W Linville.
29) Add flood control support to switchdev layer, from Ido Schimmel.
30) Fix CHECKSUM_PARTIAL handling of potentially fragmented frames, from
Hannes Frederic Sowa.
31) Support persistent maps and progs in bpf, from Daniel Borkmann.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1790 commits)
sh_eth: use DMA barriers
switchdev: respect SKIP_EOPNOTSUPP flag in case there is no recursion
net: sched: kill dead code in sch_choke.c
irda: Delete an unnecessary check before the function call "irlmp_unregister_service"
net: dsa: mv88e6xxx: include DSA ports in VLANs
net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports
net/core: fix for_each_netdev_feature
vlan: Invoke driver vlan hooks only if device is present
arcnet/com20020: add LEDS_CLASS dependency
bpf, verifier: annotate verbose printer with __printf
dp83640: Only wait for timestamps for packets with timestamping enabled.
ptp: Change ptp_class to a proper bitmask
dp83640: Prune rx timestamp list before reading from it
dp83640: Delay scheduled work.
dp83640: Include hash in timestamp/packet matching
ipv6: fix tunnel error handling
net/mlx5e: Fix LSO vlan insertion
net/mlx5e: Re-eanble client vlan TX acceleration
net/mlx5e: Return error in case mlx5e_set_features() fails
net/mlx5e: Don't allow more than max supported channels
...
Diffstat (limited to 'net/ipv4/ip_output.c')
-rw-r--r-- | net/ipv4/ip_output.c | 148 |
1 files changed, 75 insertions, 73 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0138fada0951..4233cbe47052 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,9 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); -static int ip_fragment(struct sock *sk, struct sk_buff *skb, - unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) @@ -95,32 +96,28 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, - skb_dst(skb)->dev, dst_output_sk); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); -} - -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -135,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * Add an ip header to a skbuff and send it out. * */ -int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ @@ -149,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->dst)) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(sock_net(sk), skb, sk); + if (ip_dont_fragment(sk, &rt->dst)) { + iph->frag_off = htons(IP_DF); + iph->id = 0; + } else { + iph->frag_off = 0; + __ip_select_ident(net, iph, 1); + } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -168,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -182,9 +182,9 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -220,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, - unsigned int mtu) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -230,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || skb_gso_network_seglen(skb) <= mtu) - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. * @@ -253,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, int err; segs->next = NULL; - err = ip_fragment(sk, segs, mtu, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, return ret; } -static int ip_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; @@ -271,20 +271,20 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb, mtu); + return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) - return ip_fragment(sk, skb, mtu, ip_finish_output2); + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -292,7 +292,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -320,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -335,26 +335,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, - NULL, newskb->dev, dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, newskb, NULL, newskb->dev, + dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, - skb->dev, ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb->dev, + ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -377,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -407,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -444,20 +447,20 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -486,29 +489,26 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static int ip_fragment(struct sock *sk, struct sk_buff *skb, +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } /* @@ -518,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, * single device frame, and queue such a frame for sending. */ -int ip_do_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -533,6 +533,11 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, dev = rt->dst.dev; + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto fail; + /* * Point into the IP datagram header. */ @@ -621,10 +626,10 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -634,7 +639,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, } if (err == 0) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } @@ -643,7 +648,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, kfree_skb(frag); frag = skb; } - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: @@ -657,9 +662,6 @@ slow_path_clean: } slow_path: - /* for offloaded checksums cleanup checksum before fragmentation */ - if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) - goto fail; iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ @@ -761,19 +763,19 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); @@ -911,6 +913,7 @@ static int __ip_append_data(struct sock *sk, if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & NETIF_F_V4_CSUM && + !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -1434,7 +1437,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); @@ -1561,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, @@ -1596,7 +1599,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: |