From f333ee0cdb27ba201e6cc0c99c76b1364aa29b86 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 11 Jul 2018 17:33:32 -0700 Subject: bpf: Add BPF_SOCK_OPS_TCP_LISTEN_CB Add a new TCP-BPF callback that is called on listen(2) right after the socket transitions to the TCP_LISTEN state. It fills the gap for listening sockets in TCP-BPF. For example, a BPF program can set BPF_SOCK_OPS_STATE_CB_FLAG when a socket becomes listening and track the later transition from TCP_LISTEN to TCP_CLOSE with the BPF_SOCK_OPS_STATE_CB callback. Previously there was no way to do this with TCP-BPF, and the other options were much harder to work with. E.g. socket state tracking can be done with tracepoints (either raw or regular), but they can't be attached to a cgroup and their lifetime has to be managed separately. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/ipv4/af_inet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c716be13d58c..f2a0a3bab6b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; -- cgit v1.2.3 From 83ba4645152d1177c161750e1064e3a8e7cee19b Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Jul 2018 21:18:11 +0200 Subject: net: add helpers checking if socket can be bound to nonlocal address The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent" is present three times and its IPv6 counterpart is also present three times. We introduce two small helpers to characterize these tests uniformly. Signed-off-by: Vincent Bernat Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 8 ++++++++ include/net/ipv6.h | 7 +++++++ net/ipv4/af_inet.c | 3 +-- net/ipv4/ping.c | 6 ++---- net/ipv6/af_inet6.c | 6 ++---- net/ipv6/datagram.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 314be484c696..e03b93360f33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk) return !!inet_sk(sk)->convert_csum; } + +static inline bool inet_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv4.sysctl_ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a44509f4e985..82deb684ba73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2a0a3bab6b5..ee707b91d1a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -486,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b54c964ad925..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ 
b/net/ipv4/ping.c @@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c9535354149f..020f6e14a7af 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!net->ipv6.sysctl.ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index f0264dfd38de..1ede7a16a0be 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -803,8 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int 
strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && -- cgit v1.2.3 From 432e05d328921c68c35bfdeff7d7b7400b8e3d1a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:03 +0200 Subject: net: ipv4: Control SKB reprioritization after forwarding After IPv4 packets are forwarded, the priority of the corresponding SKB is updated according to the TOS field of IPv4 header. This overrides any prioritization done earlier by e.g. an skbedit action or ingress-qos-map defined at a vlan device. Such overriding may not always be desirable. Even if the packet ends up being routed, which implies this is an L3 network node, an administrator may wish to preserve whatever prioritization was done earlier on in the pipeline. Therefore introduce a sysctl that controls this behavior. Keep the default value at 1 to maintain backward-compatible behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ip_forward.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net/ipv4/af_inet.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 77c37fb0b6a6..e74515ecaa9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -81,6 +81,15 @@ fib_multipath_hash_policy - INTEGER 0 - Layer 3 1 - Layer 4 +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. 
The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). + Default: 1 (Update priority.) + Possible values: + 0 - Do not update priority. + 1 - Update priority. + route/max_size - INTEGER Maximum number of routes allowed in the kernel. Increase this when using large numbers of interfaces and/or routes. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 661348f23ea5..e47503b4e4d1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,7 @@ struct netns_ipv4 { int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_fwd_update_priority; int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ee707b91d1a7..20fda8fb8ffd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net) * We set them here, in case sysctl is not compiled. 
*/ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index b54b948b0596..32662e9e5d21 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - skb->priority = rt_tos2priority(iph->tos); + if (net->ipv4.sysctl_ip_fwd_update_priority) + skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5fa335fd3852..e21dda015513 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -663,6 +663,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_forward_update_priority", + .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, -- cgit v1.2.3