From f02db315b8d888570cb0d4496cfbb7e4acb047cb Mon Sep 17 00:00:00 2001 From: Francesco Fusco Date: Tue, 24 Sep 2013 15:43:08 +0200 Subject: ipv4: IP_TOS and IP_TTL can be specified as ancillary data This patch enables the IP_TTL and IP_TOS values passed from userspace to be stored in the ipcm_cookie struct. Three fields are added to the struct: - the TTL, expressed as __u8. The allowed values are in the [1-255]. A value of 0 means that the TTL is not specified. - the TOS, expressed as __s16. The allowed values are in the range [0,255]. A value of -1 means that the TOS is not specified. - the priority, expressed as a char and computed when handling the ancillary data. Signed-off-by: Francesco Fusco Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ip_sockglue.c') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d9c4f113d709..56e34457ac07 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -189,7 +189,7 @@ EXPORT_SYMBOL(ip_cmsg_recv); int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) { - int err; + int err, val; struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -215,6 +215,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) ipc->addr = info->ipi_spec_dst.s_addr; break; } + case IP_TTL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->ttl = val; + break; + case IP_TOS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 0 || val > 255) + return -EINVAL; + ipc->tos = val; + ipc->priority = rt_tos2priority(ipc->tos); + break; + default: return -EINVAL; } -- cgit v1.2.3 From fbf8866d65d5de84f75563eb0edd7fc27dbe9a90 Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Mon, 7 Oct 2013 11:01:40 -0500 Subject: net: ipv4 only populate IP_PKTINFO when needed The since the removal of the routing cache computing fib_compute_spec_dst() does a fib_table lookup for each UDP multicast packet received. This has introduced a performance regression for some UDP workloads. This change skips populating the packet info for sockets that do not have IP_PKTINFO set. Benchmark results from a netperf UDP_RR test: Before 89789.68 transactions/s After 90587.62 transactions/s Benchmark results from a fio 1 byte UDP multicast pingpong test (Multicast one way unicast response): Before 12.63us RTT After 12.48us RTT Signed-off-by: Shawn Bohrer Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/ip_sockglue.c | 5 +++-- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4/ip_sockglue.c') diff --git a/include/net/ip.h b/include/net/ip.h index 16078f422397..b39ebe5339ac 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -459,7 +459,7 @@ int ip_options_rcv_srr(struct sk_buff *skb); * Functions provided by ip_sockglue.c */ -void ipv4_pktinfo_prepare(struct sk_buff *skb); +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb); int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc); int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 56e34457ac07..0626f2cb192e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1052,11 +1052,12 @@ e_inval: * destination in skb->cb[] before dst drop. * This way, receiver doesnt make cache line misses to read rtable. */ -void ipv4_pktinfo_prepare(struct sk_buff *skb) +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - if (skb_rtable(skb)) { + if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && + skb_rtable(skb)) { pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index b2fa14c1a6f1..41e1d2845c8f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -299,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { /* Charge it to the socket. */ - ipv4_pktinfo_prepare(skb); + ipv4_pktinfo_prepare(sk, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 262ea3929da6..4226c53daaed 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1544,7 +1544,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = 0; - ipv4_pktinfo_prepare(skb); + ipv4_pktinfo_prepare(sk, skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); -- cgit v1.2.3 From 482fc6094afad572a4ea1fd722e7b11ca72022a0 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 5 Nov 2013 02:24:17 +0100 Subject: ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery, their sockets won't accept and install new path mtu information and they will always use the interface mtu for outgoing packets. It is guaranteed that the packet is not fragmented locally. But we won't set the DF-Flag on the outgoing frames. Florian Weimer had the idea to use this flag to ensure DNS servers are never generating outgoing fragments. They may well be fragmented on the path, but the server never stores or usees path mtu values, which could well be forged in an attack. (The root of the problem with path MTU discovery is that there is no reliable way to authenticate ICMP Fragmentation Needed But DF Set messages because they are sent from intermediate routers with their source addresses, and the IMCP payload will not always contain sufficient information to identify a flow.) Recent research in the DNS community showed that it is possible to implement an attack where DNS cache poisoning is feasible by spoofing fragments. This work was done by Amir Herzberg and Haya Shulman: This issue was previously discussed among the DNS community, e.g. , without leading to fixes. This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode regarding local fragmentation with UFO/CORK" for the enforcement of the non-fragmentable checks. If other users than ip_append_page/data should use this semantic too, we have to add a new flag to IPCB(skb)->flags to suppress local fragmentation and check for this in ip_finish_output. Many thanks to Florian Weimer for the idea and feedback while implementing this patch. Cc: David S. Miller Suggested-by: Florian Weimer Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/route.h | 16 ++++++++++++---- include/uapi/linux/in.h | 5 +++++ net/dccp/ipv4.c | 1 + net/ipv4/ip_output.c | 8 ++++---- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/route.c | 4 ++++ net/ipv4/tcp_ipv4.c | 1 + 7 files changed, 28 insertions(+), 9 deletions(-) (limited to 'net/ipv4/ip_sockglue.c') diff --git a/include/net/route.h b/include/net/route.h index dd4ae0029fd8..f68c167280a7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -313,12 +313,20 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) +static inline bool ip_sk_accept_pmtu(const struct sock *sk) { - struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE; +} - return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +static inline bool ip_sk_use_pmtu(const struct sock *sk) +{ + return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; +} + +static inline int ip_skb_dst_mtu(const struct sk_buff *skb) +{ + return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ? + dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu; } #endif /* _ROUTE_H */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index f9e8e496ae5d..393c5de09d42 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -115,6 +115,11 @@ struct in_addr { #define IP_PMTUDISC_WANT 1 /* Use per route hints */ #define IP_PMTUDISC_DO 2 /* Always DF */ #define IP_PMTUDISC_PROBE 3 /* Ignore dst pmtu */ +/* Always use interface mtu (ignores dst pmtu) but don't set DF flag. + * Also incoming ICMP frag_needed notifications will be ignored on + * this socket to prevent accepting spoofed ones. + */ +#define IP_PMTUDISC_INTERFACE 4 #define IP_MULTICAST_IF 32 #define IP_MULTICAST_TTL 33 diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 720c36225ed9..d9f65fc66db5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -174,6 +174,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && + ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 51be64e18e32..912402752f2f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1037,7 +1037,6 @@ error: static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *opt; struct rtable *rt; @@ -1063,8 +1062,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * We steal reference to this route, caller should not release it */ *rtp = NULL; - cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + cork->fragsize = ip_sk_use_pmtu(sk) ? + dst_mtu(&rt->dst) : rt->dst.dev->mtu; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; @@ -1315,7 +1314,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk, /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc >= IP_PMTUDISC_DO || + if (inet->pmtudisc == IP_PMTUDISC_DO || + inet->pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0626f2cb192e..3f858266fa7e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -627,7 +627,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) goto e_inval; inet->pmtudisc = val; break; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d2d325382b13..f428935c50db 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1036,6 +1036,10 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) bool new = false; bh_lock_sock(sk); + + if (!ip_sk_accept_pmtu(sk)) + goto out; + rt = (struct rtable *) __sk_dst_get(sk); if (sock_owned_by_user(sk) || !rt) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 300ab2c93f29..14bba8a1c5a7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -288,6 +288,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk) mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && + ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); -- cgit v1.2.3