author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-12 07:54:15 -0800
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-12 07:54:15 -0800
commit | ce38aa9cbed3d109355b0169b520362c409c0541 (patch) |
tree | 621511c34edd22ac30ca12f78f0d478245b4ccd7 /net/ipv4 |
parent | 69973b830859bc6529a7a0468ba0d80ee5117826 (diff) |
parent | d84701ecbcd6ad63faa7a9c18ad670d1c4d561c0 (diff) |
download | linux-ce38aa9cbed3d109355b0169b520362c409c0541.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Platform regulatory domain support for ath10k, from Bartosz
Markowski.
2) Centralize min/max MTU checking, thus removing tons of duplicated
code from all of the various drivers. From Jarod Wilson.
3) Support ingress actions in act_mirred, from Shmulik Ladkani.
4) Improve device adjacency tracking, from David Ahern.
5) Add support for LED triggers on PHY link state changes, from Zach
Brown.
6) Improve UDP socket memory accounting, from Paolo Abeni.
7) Set SK_MEM_QUANTUM to a fixed size of 4096, instead of PAGE_SIZE.
From Eric Dumazet.
8) Collapse TCP SKBs at retransmit time even if the right side SKB has
frags. Also from Eric Dumazet.
9) Add IP_RECVFRAGSIZE and IPV6_RECVFRAGSIZE cmsgs, from Willem de
Bruijn (a userspace usage sketch follows this list).
10) Support routing by UID, from Lorenzo Colitti.
11) Handle L3 domain binding (i.e. VRF) for RAW sockets, from David
Ahern.
12) tcp_get_info() can run lockless, from Eric Dumazet.
13) 4-tuple UDP hashing in SFC driver, from Edward Cree.
14) Avoid reorders in GRO code, from Eric Dumazet.
15) IPV6 Segment Routing support, from David Lebrun.
16) Support MPLS push and pop for L3 packets in openvswitch, from Jiri
Benc.
17) Add LRU data structure support for BPF, from Martin KaFai Lau.
18) VF support in liquidio driver, from Raghu Vatsavayi.
19) Multiqueue support in alx driver, from Tobias Regnery.
20) Networking cgroup BPF support, from Daniel Mack.
21) TCP chronograph measurements, from Francis Yan.
22) XDP support for qed driver, from Yuval Mintz.
23) BPF based lwtunnels, from Thomas Graf.
24) Consistent FIB dumping to offloading drivers, from Ido Schimmel.
25) Many optimizations for UDP under high load, from Eric Dumazet.
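Item 9's IP_RECVFRAGSIZE cmsg (wired up in the ip_sockglue.c hunk further down) reports the largest fragment a reassembled datagram arrived in. A minimal userspace sketch, assuming uapi headers new enough to define IP_RECVFRAGSIZE (kernel >= 4.10); creating and binding the UDP or raw socket `fd` is elided and the buffer sizes are arbitrary:

```c
/* Sketch only: read the IP_RECVFRAGSIZE ancillary message.
 * Assumes headers that define IP_RECVFRAGSIZE (>= 4.10 uapi). */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

static int print_frag_size(int fd)
{
	int one = 1;
	char data[2048], cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	/* ask for the largest fragment size of reassembled datagrams */
	if (setsockopt(fd, SOL_IP, IP_RECVFRAGSIZE, &one, sizeof(one)) < 0)
		return -1;

	if (recvmsg(fd, &msg, 0) < 0)
		return -1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_IP &&
		    cm->cmsg_type == IP_RECVFRAGSIZE) {
			int frag;

			memcpy(&frag, CMSG_DATA(cm), sizeof(frag));
			printf("largest fragment: %d bytes\n", frag);
		}
	}
	return 0;
}
```

Per the ip_sockglue.c hunk, the kernel only emits the cmsg when the datagram was actually reassembled (frag_max_size != 0), so its absence means the packet arrived unfragmented.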
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1522 commits)
netfilter: nft_counter: rework atomic dump and reset
e1000: use disable_hardirq() for e1000_netpoll()
i40e: don't truncate match_method assignment
net: ethernet: ti: netcp: add support of cpts
net: phy: phy drivers should not set SUPPORTED_[Asym_]Pause
net: l2tp: ppp: change PPPOL2TP_MSG_* => L2TP_MSG_*
net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*
net: l2tp: export debug flags to UAPI
net: ethernet: stmmac: remove private tx queue lock
net: ethernet: sxgbe: remove private tx queue lock
net: bridge: shorten ageing time on topology change
net: bridge: add helper to set topology change
net: bridge: add helper to offload ageing time
net: nicvf: use new api ethtool_{get|set}_link_ksettings
net: ethernet: ti: cpsw: sync rates for channels in dual emac mode
net: ethernet: ti: cpsw: re-split res only when speed is changed
net: ethernet: ti: cpsw: combine budget and weight split and check
net: ethernet: ti: cpsw: don't start queue twice
net: ethernet: ti: cpsw: use same macros to get active slave
net: mvneta: select GENERIC_ALLOCATOR
...
Diffstat (limited to 'net/ipv4')
60 files changed, 1897 insertions, 426 deletions
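The af_inet.c and ip_output.c hunks below add the BPF_CGROUP_RUN_PROG_INET_SOCK()/BPF_CGROUP_RUN_PROG_INET_EGRESS() hooks for item 20 (networking cgroup BPF). A hedged sketch of the userspace side, attaching an already-loaded program to a cgroup's egress hook via raw bpf(2); loading/verifying the program (prog_fd) and the cgroup-v2 mount path are assumptions, not part of this diff:

```c
/* Sketch: attach a loaded BPF program to a cgroup's INET egress hook. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_egress_prog(int prog_fd, const char *cgroup_path)
{
	union bpf_attr attr;
	int cg_fd, ret;

	cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
	if (cg_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cg_fd;			/* cgroup to attach to */
	attr.attach_bpf_fd = prog_fd;		/* program from BPF_PROG_LOAD */
	attr.attach_type = BPF_CGROUP_INET_EGRESS;

	/* sockets in this cgroup now run the program on every egress skb */
	ret = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	close(cg_fd);
	return ret;
}
```

If the program rejects a packet, ip_finish_output() (and ip_mc_finish_output()) free the skb and return the error, as the ip_output.c hunk below shows.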
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index b54b3ca939db..6e7baaf814c6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,6 +430,14 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config INET_RAW_DIAG + tristate "RAW: socket monitoring interface" + depends on INET_DIAG && (IPV6 || IPV6=n) + default n + ---help--- + Support for RAW socket monitoring interface used by the ss tool. + If unsure, say Y. + config INET_DIAG_DESTROY bool "INET: allow privileged process to administratively close sockets" depends on INET_DIAG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bc6a6c8b9bcd..48af58a5686e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o +obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 215143246e4b..1830e6f0e9cc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -374,8 +374,18 @@ lookup_protocol: if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 161fc0f0d752..dbad5a1c161a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -620,6 +620,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -1218,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; + net->ipv4.fib_seq = 0; + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 388d3e21629b..c1bc1e92de0e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi) #endif call_rcu(&fi->rcu, free_fib_info_rcu); } +EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e3665bf7a7f3..1b0e7d1f5217 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,25 +84,114 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static BLOCKING_NOTIFIER_HEAD(fib_chain); +static unsigned int fib_seq_sum(void) +{ + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) + fib_seq += net->ipv4.fib_seq; + rtnl_unlock(); + + return fib_seq; +} + +static ATOMIC_NOTIFIER_HEAD(fib_chain); -int register_fib_notifier(struct notifier_block *nb) +static int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) { - return blocking_notifier_chain_register(&fib_chain, nb); + info->net = net; + return nb->notifier_call(nb, event_type, info); +} + +static void fib_rules_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type 
event_type) +{ +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_notifier_info info; + + if (net->ipv4.fib_has_custom_rules) + call_fib_notifier(nb, net, event_type, &info); +#endif +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type); + +static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, u32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id, u32 nlflags) +{ + struct fib_entry_notifier_info info = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .tb_id = tb_id, + .nlflags = nlflags, + }; + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + /* Mutex semantics guarantee that every change done to + * FIB tries before we read the change sequence counter + * is now visible to us. + */ + rcu_read_lock(); + for_each_net_rcu(net) { + fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD); + fib_notify(net, nb, FIB_EVENT_ENTRY_ADD); + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; } EXPORT_SYMBOL(register_fib_notifier); int unregister_fib_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&fib_chain, nb); + return atomic_notifier_chain_unregister(&fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + net->ipv4.fib_seq++; info->net = net; - return blocking_notifier_call_chain(&fib_chain, event_type, info); + return atomic_notifier_call_chain(&fib_chain, event_type, info); } static int call_fib_entry_notifiers(struct net *net, @@ -1903,6 +1992,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb) return found; } +static void fib_leaf_notify(struct net *net, struct key_vector *l, + struct fib_table *tb, struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi) + continue; + + /* local and main table can share the same trie, + * so don't notify twice for the same entry. 
+ */ + if (tb->tb_id != fa->tb_id) + continue; + + call_fib_entry_notifier(nb, net, event_type, l->key, + KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, + fa->fa_type, fa->tb_id, 0); + } +} + +static void fib_table_notify(struct net *net, struct fib_table *tb, + struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; + t_key key = 0; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + fib_leaf_notify(net, l, tb, nb, event_type); + + key = l->key + 1; + /* stop in case of wrap around */ + if (key < l->key) + break; + } +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + fib_table_notify(net, tb, nb, event_type); + } +} + static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 030d1531e897..805f6607f8d9 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -622,14 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) return err; } -static struct genl_family fou_nl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = FOU_GENL_NAME, - .version = FOU_GENL_VERSION, - .maxattr = FOU_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family fou_nl_family; static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, @@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +static struct genl_family fou_nl_family __ro_after_init = { + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = fou_nl_ops, + .n_ops = ARRAY_SIZE(fou_nl_ops), +}; + size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); @@ -1086,8 +1090,7 @@ static int __init fou_init(void) if (ret) goto exit; - ret = genl_register_family_with_ops(&fou_nl_family, - fou_nl_ops); + ret = genl_register_family(&fou_nl_family); if (ret < 0) goto unregister; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 48734ee6293f..f79d7a8ab1c6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_mark = mark; + fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); @@ -473,6 +474,7 @@ static struct rtable *icmp_route_lookup(struct net *net, param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; + fl4->flowi4_uid = sock_net_uid(net, NULL); fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; @@ -1045,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb) if (success) { consume_skb(skb); - return 0; + return NET_RX_SUCCESS; } drop: kfree_skb(skb); - return 0; + return NET_RX_DROP; csum_error: __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 61a9deec2993..d5d3ead0a6c3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -415,7 +415,7 @@ struct dst_entry 
*inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -452,7 +452,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e4d16fc5bbb3..4dea33e5f295 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -200,6 +200,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; + /* + * RAW sockets might have user-defined protocols assigned, + * so report the one supplied on socket creation. + */ + if (sk->sk_type == SOCK_RAW) { + if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol)) + goto errout; + } + if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; @@ -852,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); - int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + int i, num, s_i, s_num; + struct sock *sk; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -863,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN)) + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; - spin_lock_bh(&ilb->lock); + spin_lock(&ilb->lock); sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); @@ -892,26 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (r->id.idiag_dport || - cb->args[3] > 0) - goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { - spin_unlock_bh(&ilb->lock); + spin_unlock(&ilb->lock); goto done; } next_listen: - cb->args[3] = 0; - cb->args[4] = 0; ++num; } - spin_unlock_bh(&ilb->lock); + spin_unlock(&ilb->lock); s_num = 0; - cb->args[3] = 0; - cb->args[4] = 0; } skip_listen_ht: cb->args[0] = 1; @@ -921,13 +922,14 @@ skip_listen_ht: if (!(idiag_states & ~TCPF_LISTEN)) goto out; +#define SKARR_SZ 16 for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); struct hlist_nulls_node *node; - struct sock *sk; - - num = 0; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; if (hlist_nulls_empty(&head->chain)) continue; @@ -935,9 +937,12 @@ skip_listen_ht: if (i > s_i) s_num = 0; +next_chunk: + num = 0; + accum = 0; spin_lock_bh(lock); sk_nulls_for_each(sk, node, 
&head->chain) { - int state, res; + int state; if (!net_eq(sock_net(sk), net)) continue; @@ -961,21 +966,35 @@ skip_listen_ht: if (!inet_diag_bc_sk(bc, sk)) goto next_normal; - res = sk_diag_fill(sk, skb, r, + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); - if (res < 0) { - spin_unlock_bh(lock); - goto done; + if (res < 0) + num = num_arr[idx]; } -next_normal: - ++num; + sock_gen_put(sk_arr[idx]); } - - spin_unlock_bh(lock); + if (res < 0) + break; cond_resched(); + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } } done: diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 576f705d8180..78fd62048335 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); -static int ipgre_net_id __read_mostly; -static int gre_tap_net_id __read_mostly; +static unsigned int ipgre_net_id __read_mostly; +static unsigned int gre_tap_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 877bdb02e887..9ffc2625cddd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -74,6 +74,7 @@ #include <net/checksum.h> #include <net/inetpeer.h> #include <net/lwtunnel.h> +#include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> @@ -287,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -305,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_mc_finish_output(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return dev_loopback_xmit(net, sk, skb); +} + int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); @@ -342,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -358,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, @@ -583,7 +605,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, */ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; - int first_len = skb_pagelen(skb); + 
unsigned int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -1594,7 +1616,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, - tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b8a2d63d1fb8..8b13881ed064 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -97,6 +97,17 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); } +static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb) +{ + int val; + + if (IPCB(skb)->frag_max_size == 0) + return; + + val = IPCB(skb)->frag_max_size; + put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val); +} + static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset) { @@ -153,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); } -void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, - int tlen, int offset) +void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, int tlen, int offset) { - struct inet_sock *inet = inet_sk(skb->sk); + struct inet_sock *inet = inet_sk(sk); unsigned int flags = inet->cmsg_flags; /* Ordered by supposed usage frequency */ @@ -218,6 +229,9 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, if (flags & IP_CMSG_CHECKSUM) ip_cmsg_recv_checksum(msg, skb, tlen, offset); + + if (flags & IP_CMSG_RECVFRAGSIZE) + ip_cmsg_recv_fragsize(msg, skb); } EXPORT_SYMBOL(ip_cmsg_recv_offset); @@ -614,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_MULTICAST_LOOP: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: + case IP_RECVFRAGSIZE: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -726,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, } } break; + case IP_RECVFRAGSIZE: + if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) + goto e_inval; + if (val) + inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE; + else + inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; + break; case IP_TOS: /* This sets both TOS and Precedence */ if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; @@ -1357,6 +1380,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_CHECKSUM: val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; break; + case IP_RECVFRAGSIZE: + val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0; + break; case IP_TOS: val = inet->tos; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5719d6ba0824..823abaef006b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -358,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, { struct ip_tunnel *nt; struct net_device *dev; + int t_hlen; BUG_ON(!itn->fb_tunnel_dev); dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); @@ -367,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, dev->mtu = ip_tunnel_bind_dev(dev); nt = netdev_priv(dev); + t_hlen = nt->hlen + sizeof(struct iphdr); + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = 0xFFF8 - dev->hard_header_len - 
t_hlen; ip_tunnel_add(itn, nt); return nt; } @@ -929,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - if (new_mtu < 68) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; if (new_mtu > max_mtu) { @@ -990,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(ip_tunnel_get_iflink); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); @@ -1192,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev) EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ -void ip_tunnel_setup(struct net_device *dev, int net_id) +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5d7944f394d9..8b14f1404c8f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -46,7 +46,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; -static int vti_net_id __read_mostly; +static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c9392589c415..79489f017854 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -121,7 +121,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static int ipip_net_id __read_mostly; +static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 27089f5ebbb1..665505d86b12 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi4_to_flowi(flp4)); + err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); if (err < 0) @@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ipmr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ipmr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1809,6 +1814,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, /* Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { + struct net_device *mdev; + + mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev); + if (mdev == skb->dev) + goto forward; + if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... 
@@ -2053,7 +2064,7 @@ static int pim_rcv(struct sk_buff *skb) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); - if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || + if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) || (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d613309e3e5d..c11eb1744ab1 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV4 To compile it as a module, choose M here. If unsure, say N. +config NF_SOCKET_IPV4 + tristate "IPv4 socket lookup support" + help + This option enables the IPv4 socket lookup infrastructure. This is + is required by the iptables socket match. + if NF_TABLES config NF_TABLES_IPV4 @@ -54,6 +60,14 @@ config NFT_DUP_IPV4 help This module enables IPv4 packet duplication support for nf_tables. +config NFT_FIB_IPV4 + select NFT_FIB + tristate "nf_tables fib / ip route lookup support" + help + This module enables IPv4 FIB lookups, e.g. for reverse path filtering. + It also allows query of the FIB for the route type, e.g. local, unicast, + multicast or blackhole. + endif # NF_TABLES_IPV4 config NF_TABLES_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 853328f8fd05..f462fee66ac8 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -14,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o +obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o + # logging obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o @@ -34,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 697538464e6e..1258a9ab62ef 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, */ e = get_entry(table_base, private->hook_entry[hook]); - acpar.net = state->net; - acpar.in = state->in; - acpar.out = state->out; - acpar.hooknum = hook; - acpar.family = NFPROTO_ARP; + acpar.state = state; acpar.hotdrop = false; arp = arp_hdr(skb); @@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name) } static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, + struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; - unsigned long pcnt; int ret; - pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(pcnt)) + if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; - e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, @@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: - xt_percpu_counter_free(e->counters.pcnt); + 
xt_percpu_counter_free(&e->counters); return ret; } @@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in @@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e) static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { + struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct arpt_entry *iter; unsigned int *offsets; unsigned int i; @@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, repl->name, repl->size); + ret = find_check_entry(iter, repl->name, repl->size, + &alloc_state); if (ret != 0) break; ++i; @@ -809,7 +805,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -838,7 +834,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = t ? PTR_ERR(t) : -ENOENT; + ret = -ENOENT; #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(NFPROTO_ARP); @@ -863,7 +859,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { const struct xt_table_info *private = t->private; if (get.size == private->size) @@ -875,7 +871,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = t ? PTR_ERR(t) : -ENOENT; + ret = -ENOENT; return ret; } @@ -902,8 +898,8 @@ static int __do_replace(struct net *net, const char *name, t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (IS_ERR_OR_NULL(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; + if (!t) { + ret = -ENOENT; goto free_newinfo_counters_untrans; } @@ -1018,8 +1014,8 @@ static int do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); - if (IS_ERR_OR_NULL(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; + if (!t) { + ret = -ENOENT; goto free; } @@ -1408,7 +1404,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1423,7 +1419,7 @@ static int compat_get_entries(struct net *net, module_put(t->me); xt_table_unlock(t); } else - ret = t ? 
PTR_ERR(t) : -ENOENT; + ret = -ENOENT; xt_compat_unlock(NFPROTO_ARP); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 7c00ce90adb8..308b456723f0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; - acpar.net = state->net; - acpar.in = state->in; - acpar.out = state->out; - acpar.family = NFPROTO_IPV4; - acpar.hooknum = hook; + acpar.state = state; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); local_bh_disable(); @@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, - unsigned int size) + unsigned int size, + struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; @@ -543,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - unsigned long pcnt; - pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(pcnt)) + if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; - e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, cleanup_match(ematch, net); } - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); return ret; } @@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in @@ -683,6 +677,7 @@ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { + struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ipt_entry *iter; unsigned int *offsets; unsigned int i; @@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, net, repl->name, repl->size); + ret = find_check_entry(iter, net, repl->name, repl->size, + &alloc_state); if (ret != 0) break; ++i; @@ -977,7 +973,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1007,7 +1003,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = t ? PTR_ERR(t) : -ENOENT; + ret = -ENOENT; #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET); @@ -1032,7 +1028,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1043,7 +1039,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = t ? 
PTR_ERR(t) : -ENOENT; + ret = -ENOENT; return ret; } @@ -1068,8 +1064,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (IS_ERR_OR_NULL(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; + if (!t) { + ret = -ENOENT; goto free_newinfo_counters_untrans; } @@ -1184,8 +1180,8 @@ do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); - if (IS_ERR_OR_NULL(t)) { - ret = t ? PTR_ERR(t) : -ENOENT; + if (!t) { + ret = -ENOENT; goto free; } @@ -1630,7 +1626,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (!IS_ERR_OR_NULL(t)) { + if (t) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1644,7 +1640,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = t ? PTR_ERR(t) : -ENOENT; + ret = -ENOENT; xt_compat_unlock(AF_INET); return ret; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4a9e6db9df8d..21db00d0362b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -62,7 +62,7 @@ struct clusterip_config { static const struct file_operations clusterip_proc_fops; #endif -static int clusterip_net_id __read_mostly; +static unsigned int clusterip_net_id __read_mostly; struct clusterip_net { struct list_head configs; @@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) } cipinfo->config = config; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_get(par->net, par->family); } #ifdef CONFIG_COMPAT diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index da7f02a0b868..a03e4e7ef5f9 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) pr_debug("bad rangesize %u\n", mr->rangesize); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); } static unsigned int @@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; - return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); + return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, + xt_out(par)); +} + +static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg_reg __read_mostly = { @@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = { .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, + .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 1d16c0f28df0..8bd0d7b26632 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -34,7 +34,7 @@ static 
unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; - int hook = par->hooknum; + int hook = xt_hooknum(par); switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(par->net, skb, hook); + nf_send_reset(xt_net(par), skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index db5b87509446..30c0de53e254 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -263,12 +263,12 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct net *net = par->net; + struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; - if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); @@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) e->ip.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_l3proto_try_module_get(par->family); + return nf_ct_netns_get(par->net, par->family); } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg4_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 78cc64eddfc1..f273098e48fd 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -83,10 +83,12 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) return true ^ invert; iph = ip_hdr(skb); - if (ipv4_is_multicast(iph->daddr)) { - if (ipv4_is_zeronet(iph->saddr)) - return ipv4_is_local_multicast(iph->daddr) ^ invert; + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) + return true ^ invert; } + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); @@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 713c09a74b90..fcfd071f4705 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -31,6 +31,13 @@ #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #include <net/netfilter/nf_log.h> +static int conntrack4_net_id __read_mostly; +static DEFINE_MUTEX(register_ipv4_hooks); + +struct conntrack4_net { + unsigned int users; +}; + static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) { @@ -307,9 +314,42 @@ static struct nf_sockopt_ops so_getorigdst = { .owner = 
THIS_MODULE, }; -static int ipv4_init_net(struct net *net) +static int ipv4_hooks_register(struct net *net) { - return 0; + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + int err = 0; + + mutex_lock(®ister_ipv4_hooks); + + cnet->users++; + if (cnet->users > 1) + goto out_unlock; + + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + + if (err) + cnet->users = 0; + out_unlock: + mutex_unlock(®ister_ipv4_hooks); + return err; +} + +static void ipv4_hooks_unregister(struct net *net) +{ + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + + mutex_lock(®ister_ipv4_hooks); + if (cnet->users && (--cnet->users == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + mutex_unlock(®ister_ipv4_hooks); } struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { @@ -325,7 +365,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, #endif - .init_net = ipv4_init_net, + .net_ns_get = ipv4_hooks_register, + .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, }; @@ -336,52 +377,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); +static struct nf_conntrack_l4proto *builtin_l4proto4[] = { + &nf_conntrack_l4proto_tcp4, + &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif +}; + static int ipv4_net_init(struct net *net) { int ret = 0; - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); - if (ret < 0) { - pr_err("nf_conntrack_tcp4: pernet registration failed\n"); - goto out_tcp; - } - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4); - if (ret < 0) { - pr_err("nf_conntrack_udp4: pernet registration failed\n"); - goto out_udp; - } - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp); - if (ret < 0) { - pr_err("nf_conntrack_icmp4: pernet registration failed\n"); - goto out_icmp; - } + ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); + if (ret < 0) + return ret; ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); if (ret < 0) { pr_err("nf_conntrack_ipv4: pernet registration failed\n"); - goto out_ipv4; + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } - return 0; -out_ipv4: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); -out_icmp: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); -out_udp: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); -out_tcp: return ret; } static void ipv4_net_exit(struct net *net) { nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } static struct pernet_operations ipv4_net_ops = { .init = ipv4_net_init, .exit = ipv4_net_exit, + .id = &conntrack4_net_id, + .size = sizeof(struct 
conntrack4_net), }; static int __init nf_conntrack_l3proto_ipv4_init(void) @@ -389,7 +428,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) int ret = 0; need_conntrack(); - nf_defrag_ipv4_enable(); ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { @@ -403,46 +441,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) goto cleanup_sockopt; } - ret = nf_register_hooks(ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register hooks.\n"); + ret = nf_ct_l4proto_register(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); + if (ret < 0) goto cleanup_pernet; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n"); - goto cleanup_hooks; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n"); - goto cleanup_tcp4; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n"); - goto cleanup_udp4; - } ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); if (ret < 0) { pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_icmpv4; + goto cleanup_l4proto; } return ret; - cleanup_icmpv4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); - cleanup_hooks: - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +cleanup_l4proto: + nf_ct_l4proto_unregister(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); cleanup_pernet: unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: @@ -454,10 +467,8 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) { synchronize_net(); nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_ct_l4proto_unregister(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index d88da36b383c..49bd6a54404f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -11,6 +11,7 @@ #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> @@ -22,6 +23,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> +static DEFINE_MUTEX(defrag4_mutex); + static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { }, }; +static void __net_exit defrag4_net_exit(struct net *net) +{ + if (net->nf.defrag_ipv4) { + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + net->nf.defrag_ipv4 = false; + } +} + +static struct pernet_operations defrag4_net_ops = { + .exit = defrag4_net_exit, +}; + static int __init nf_defrag_init(void) { - return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + return register_pernet_subsys(&defrag4_net_ops); } static void __exit nf_defrag_fini(void) { - 
nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + unregister_pernet_subsys(&defrag4_net_ops); } -void nf_defrag_ipv4_enable(void) +int nf_defrag_ipv4_enable(struct net *net) { + int err = 0; + + might_sleep(); + + if (net->nf.defrag_ipv4) + return 0; + + mutex_lock(&defrag4_mutex); + if (net->nf.defrag_ipv4) + goto out_unlock; + + err = nf_register_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + if (err == 0) + net->nf.defrag_ipv4 = true; + + out_unlock: + mutex_unlock(&defrag4_mutex); + return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c new file mode 100644 index 000000000000..a83d558e1aae --- /dev/null +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/sock.h> +#include <net/inet_sock.h> +#include <net/netfilter/nf_socket.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + +static int +extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, + __be32 *raddr, __be32 *laddr, + __be16 *rport, __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +static struct sock * +nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet_lookup(net, &tcp_hashinfo, skb, doff, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + +struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, + const struct net_device *indev) +{ + __be32 uninitialized_var(daddr), uninitialized_var(saddr); + __be16 uninitialized_var(dport), uninitialized_var(sport); + const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + u8 uninitialized_var(protocol); 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif + int doff = 0; + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + struct udphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? + ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, + daddr, sport, dport, indev); +} +EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure"); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 0c01a270bf9f..0af3d8df70dd 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, }; int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; - nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); + nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c new file mode 100644 index 000000000000..965b1a161369 --- /dev/null +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -0,0 +1,241 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_fib.h> + +#include <net/ip_fib.h> +#include <net/route.h> + +/* don't try to find route from mcast/bcast/zeronet */ +static __be32 get_saddr(__be32 addr) +{ + if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || + ipv4_is_zeronet(addr)) + return 0; + return addr; +} + +static bool fib4_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + + return rt && (rt->rt_flags & RTCF_LOCAL); +} + +#define DSCP_BITS 0xfc + +void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dst = &regs->data[priv->dreg]; + const struct net_device *dev = NULL; + const struct iphdr *iph; + __be32 addr; + + if (priv->flags & NFTA_FIB_F_IIF) + dev = nft_in(pkt); + else if (priv->flags & NFTA_FIB_F_OIF) + dev = nft_out(pkt); + + iph = ip_hdr(pkt->skb); + if (priv->flags & NFTA_FIB_F_DADDR) + addr = iph->daddr; + else + addr = iph->saddr; + + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); +} +EXPORT_SYMBOL_GPL(nft_fib4_eval_type); + +static int get_ifindex(const struct net_device *dev) +{ + return dev ? dev->ifindex : 0; +} + +void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + const struct iphdr *iph; + struct fib_result res; + struct flowi4 fl4 = { + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + const struct net_device *oif; + struct net_device *found; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int i; +#endif + + /* + * Do not set flowi4_oif, it restricts results (for example, asking + * for oif 3 will get RTN_UNICAST result even if the daddr exists + * on another interface. + * + * Search results for the desired outinterface instead.
+ */ + if (priv->flags & NFTA_FIB_F_OIF) + oif = nft_out(pkt); + else if (priv->flags & NFTA_FIB_F_IIF) + oif = nft_in(pkt); + else + oif = NULL; + + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) { + nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + return; + } + + iph = ip_hdr(pkt->skb); + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) { + nft_fib_store_result(dest, priv->result, pkt, + get_ifindex(pkt->skb->dev)); + return; + } + } + + if (priv->flags & NFTA_FIB_F_MARK) + fl4.flowi4_mark = pkt->skb->mark; + + fl4.flowi4_tos = iph->tos & DSCP_BITS; + + if (priv->flags & NFTA_FIB_F_DADDR) { + fl4.daddr = iph->daddr; + fl4.saddr = get_saddr(iph->saddr); + } else { + fl4.daddr = iph->saddr; + fl4.saddr = get_saddr(iph->daddr); + } + + *dest = 0; + + if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) + return; + + switch (res.type) { + case RTN_UNICAST: + break; + case RTN_LOCAL: /* should not appear here, see fib4_is_local() above */ + return; + default: + break; + } + + if (!oif) { + found = FIB_RES_DEV(res); + goto ok; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + for (i = 0; i < res.fi->fib_nhs; i++) { + struct fib_nh *nh = &res.fi->fib_nh[i]; + + if (nh->nh_dev == oif) { + found = nh->nh_dev; + goto ok; + } + } + return; +#else + found = FIB_RES_DEV(res); + if (found != oif) + return; +#endif +ok: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + *dest = found->ifindex; + break; + case NFT_FIB_RESULT_OIFNAME: + strncpy((char *)dest, found->name, IFNAMSIZ); + break; + default: + WARN_ON_ONCE(1); + break; + } +} +EXPORT_SYMBOL_GPL(nft_fib4_eval); + +static struct nft_expr_type nft_fib4_type; + +static const struct nft_expr_ops nft_fib4_type_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval_type, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops nft_fib4_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops * +nft_fib4_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_fib_result result; + + if (!tb[NFTA_FIB_RESULT]) + return ERR_PTR(-EINVAL); + + result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); + + switch (result) { + case NFT_FIB_RESULT_OIF: + return &nft_fib4_ops; + case NFT_FIB_RESULT_OIFNAME: + return &nft_fib4_ops; + case NFT_FIB_RESULT_ADDRTYPE: + return &nft_fib4_type_ops; + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static struct nft_expr_type nft_fib4_type __read_mostly = { + .name = "fib", + .select_ops = &nft_fib4_select_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, +}; + +static int __init nft_fib4_module_init(void) +{ + return nft_register_expr(&nft_fib4_type); +} + +static void __exit nft_fib4_module_exit(void) +{ + nft_unregister_expr(&nft_fib4_type); +} + +module_init(nft_fib4_module_init); +module_exit(nft_fib4_module_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_ALIAS_NFT_AF_EXPR(2, "fib"); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 51ced81b616c..a0ea8aad1bf1 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,8 +31,14 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, range.max_proto.all = *(__be16 *)&regs->data[priv->sreg_proto_max]; } - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, - &range, pkt->out); + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), + &range, nft_out(pkt)); +} + +static void +nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); } static struct nft_expr_type nft_masq_ipv4_type; @@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv4_eval, .init = nft_masq_init, + .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, }; @@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init); module_exit(nft_masq_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index c09d4381427e..1650ed23c15d 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -35,8 +35,13 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->hook); + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); +} + +static void +nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); } static struct nft_expr_type nft_redir_ipv4_type; @@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_ipv4_eval, .init = nft_redir_init, + .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, }; @@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init); module_exit(nft_redir_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index 2c2553b9026c..517ce93699de 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); + nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->net, pkt->skb, pkt->hook); + nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); break;
default: break; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 96b8e2b95731..5b2635e69a92 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -793,7 +793,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ecbe5a7c2d6d..2300fae11b22 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -89,9 +89,10 @@ struct raw_frag_vec { int hlen; }; -static struct raw_hashinfo raw_v4_hashinfo = { +struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; +EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { @@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(raw_unhash_sk); -static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, +struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { sk_for_each_from(sk) { @@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, found: return sk; } +EXPORT_SYMBOL_GPL(__raw_v4_lookup); /* * 0 - deliver @@ -604,7 +606,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0); + daddr, saddr, 0, 0, sk->sk_uid); if (!inet->hdrincl) { rfv.msg = msg; @@ -693,12 +695,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + + if (sk->sk_bound_dev_if) + tb_id = l3mdev_fib_table_by_index(sock_net(sk), + sk->sk_bound_dev_if) ? 
: tb_id; + + chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr, + tb_id); + ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -912,6 +922,20 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg } #endif +int raw_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + __udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(raw_abort); + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -937,6 +961,7 @@ struct proto raw_prot = { .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif + .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c new file mode 100644 index 000000000000..e1a51ca68d23 --- /dev/null +++ b/net/ipv4/raw_diag.c @@ -0,0 +1,266 @@ +#include <linux/module.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +#include <net/inet_sock.h> +#include <net/raw.h> +#include <net/rawv6.h> + +#ifdef pr_fmt +# undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static struct raw_hashinfo * +raw_get_hashinfo(const struct inet_diag_req_v2 *r) +{ + if (r->sdiag_family == AF_INET) { + return &raw_v4_hashinfo; +#if IS_ENABLED(CONFIG_IPV6) + } else if (r->sdiag_family == AF_INET6) { + return &raw_v6_hashinfo; +#endif + } else { + pr_warn_once("Unexpected inet family %d\n", + r->sdiag_family); + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); + } +} + +/* + * Due to requirement of not breaking user API we can't simply + * rename @pad field in inet_diag_req_v2 structure, instead + * use helper to figure it out. + */ + +static struct sock *raw_lookup(struct net *net, struct sock *from, + const struct inet_diag_req_v2 *req) +{ + struct inet_diag_req_raw *r = (void *)req; + struct sock *sk = NULL; + + if (r->sdiag_family == AF_INET) + sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, + r->id.idiag_dst[0], + r->id.idiag_src[0], + r->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + else + sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, + (const struct in6_addr *)r->id.idiag_src, + (const struct in6_addr *)r->id.idiag_dst, + r->id.idiag_if); +#endif + return sk; +} + +static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) +{ + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct sock *sk = NULL, *s; + int slot; + + if (IS_ERR(hashinfo)) + return ERR_CAST(hashinfo); + + read_lock(&hashinfo->lock); + for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { + sk_for_each(s, &hashinfo->ht[slot]) { + sk = raw_lookup(net, s, r); + if (sk) { + /* + * Grab it and keep until we fill + * diag meaage to be reported, so + * caller should call sock_put then. + * We can do that because we're keeping + * hashinfo->lock here. + */ + sock_hold(sk); + goto out_unlock; + } + } + } +out_unlock: + read_unlock(&hashinfo->lock); + + return sk ? 
sk : ERR_PTR(-ENOENT); +} + +static int raw_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + 64, + GFP_KERNEL); + if (!rep) { + sock_put(sk); + return -ENOMEM; + } + + err = inet_sk_diag_fill(sk, NULL, rep, r, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + sock_put(sk); + + if (err < 0) { + kfree_skb(rep); + return err; + } + + err = netlink_unicast(net->diag_nlsk, rep, + NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + return err; +} + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh, net_admin); +} + +static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; + struct sock *sk = NULL; + + if (IS_ERR(hashinfo)) + return; + + s_slot = cb->args[0]; + num = s_num = cb->args[1]; + + read_lock(&hashinfo->lock); + for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { + num = 0; + + sk_for_each(sk, &hashinfo->ht[slot]) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next; + if (sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + goto out_unlock; +next: + num++; + } + } + +out_unlock: + read_unlock(&hashinfo->lock); + + cb->args[0] = slot; + cb->args[1] = num; +} + +static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + +#ifdef CONFIG_INET_DIAG_DESTROY +static int raw_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + err = sock_diag_destroy(sk, ECONNABORTED); + sock_put(sk); + return err; +} +#endif + +static const struct inet_diag_handler raw_diag_handler = { + .dump = raw_diag_dump, + .dump_one = raw_diag_dump_one, + .idiag_get_info = raw_diag_get_info, + .idiag_type = IPPROTO_RAW, + .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = raw_diag_destroy, +#endif +}; + +static void __always_unused __check_inet_diag_req_raw(void) +{ + /* + * Make sure the two structures are identical, + * except the @pad field. 
+ */ +#define __offset_mismatch(m1, m2) \ + (offsetof(struct inet_diag_req_v2, m1) != \ + offsetof(struct inet_diag_req_raw, m2)) + + BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != + sizeof(struct inet_diag_req_raw)); + BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); + BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); + BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); + BUILD_BUG_ON(__offset_mismatch(id, id)); +#undef __offset_mismatch +} + +static int __init raw_diag_init(void) +{ + return inet_diag_register(&raw_diag_handler); +} + +static void __exit raw_diag_exit(void) +{ + inet_diag_unregister(&raw_diag_handler); +} + +module_init(raw_diag_init); +module_exit(raw_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2a57566e6e91..fa5c037227cb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -507,7 +507,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +static void __build_flow_key(const struct net *net, struct flowi4 *fl4, + const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -523,19 +524,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0); + iph->daddr, iph->saddr, 0, 0, + sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { + const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) @@ -552,7 +555,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0); + daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } @@ -802,7 +805,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } @@ -1020,7 +1023,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, if (!mark) mark = IP4_REPLY_MARK(net, skb->mark); - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1036,7 +1039,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); @@ -1055,6 +1058,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; + struct net *net = sock_net(sk); bh_lock_sock(sk); @@ -1068,7 +1072,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) goto out; } - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; if (odst->obsolete && !odst->ops->check(odst, 0)) { @@ -1108,7 +1112,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1123,9 +1127,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; + struct net *net = sock_net(sk); - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -1598,6 +1603,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } +static void set_lwt_redirect(struct rtable *rth) +{ + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; + rth->dst.output = lwtunnel_output; + } + + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1687,14 +1705,7 @@ rt_cache: rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } + set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1921,8 +1932,18 @@ local_input: rth->dst.error= -err; 
rth->rt_flags &= ~RTCF_LOCAL; } + if (do_cache) { - if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + struct fib_nh *nh = &FIB_RES_NH(res); + + rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + WARN_ON(rth->dst.input == lwtunnel_input); + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } + + if (unlikely(!rt_cache_route(nh, rth))) { rth->dst.flags |= DST_NOCACHE; rt_add_uncached_list(rth); } @@ -1982,25 +2003,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); + int our = 0; + + if (in_dev) + our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); + + /* check l3 master if no match yet */ + if ((!in_dev || !our) && netif_is_l3_slave(dev)) { + struct in_device *l3_in_dev; - if (in_dev) { - int our = ip_check_mc_rcu(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); - if (our + l3_in_dev = __in_dev_get_rcu(skb->dev); + if (l3_in_dev) + our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, + ip_hdr(skb)->protocol); + } + + res = -EINVAL; + if (our #ifdef CONFIG_IP_MROUTE - || - (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { - int res = ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); - rcu_read_unlock(); - return res; - } + ) { + res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); } rcu_read_unlock(); - return -EINVAL; + return res; } res = ip_route_input_slow(skb, daddr, saddr, tos, dev); rcu_read_unlock(); @@ -2140,8 +2171,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) - rth->dst.output = lwtunnel_output; + set_lwt_redirect(rth); return rth; } @@ -2268,7 +2298,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif && - !netif_index_is_l3_master(net, fl4->flowi4_oif)) { + (ipv4_is_multicast(fl4->daddr) || + !netif_index_is_l3_master(net, fl4->flowi4_oif))) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2495,6 +2526,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) + goto nla_put_failure; + error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2547,6 +2583,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; + kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2574,6 +2611,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + if (tb[RTA_UID]) + uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); + else + uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2581,6 +2622,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + fl4.flowi4_uid = uid; if (iif) { struct net_device *dev; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e3c4043c27de..3e88467d70ee 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->ts_off = 0; req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; @@ -372,7 +373,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest); + ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 814af89c1bd3..1ef3165114ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,7 +279,6 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> -#include <asm/unaligned.h> #include <net/busy_poll.h> int sysctl_tcp_min_tso_segs __read_mostly = 2; @@ -405,7 +404,6 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - u64_stats_init(&tp->syncp); tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -665,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ - if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); - set_bit(TSQ_THROTTLED, &tp->tsq_flags); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. 
@@ -998,8 +996,11 @@ do_error: goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } return sk_stream_error(sk, flags, err); } @@ -1333,8 +1334,11 @@ do_error: out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } release_sock(sk); return err; } @@ -2302,7 +2306,7 @@ EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && - ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); + (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) @@ -2704,15 +2708,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_setsockopt); #endif +static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + struct tcp_info *info) +{ + u64 stats[__TCP_CHRONO_MAX], total = 0; + enum tcp_chrono i; + + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) + stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } + + info->tcpi_busy_time = total; + info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; + info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; +} + /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp, intv; - unsigned int start; - int notsent_bytes; u64 rate64; + bool slow; u32 rate; memset(info, 0, sizeof(*info)); @@ -2721,6 +2743,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_state = sk_state_load(sk); + /* Report meaningful fields for all TCP states, including listeners */ + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + info->tcpi_pacing_rate = rate64; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + rate64 = rate != ~0U ? 
rate : ~0ULL; + info->tcpi_max_pacing_rate = rate64; + + info->tcpi_reordering = tp->reordering; + info->tcpi_snd_cwnd = tp->snd_cwnd; + + if (info->tcpi_state == TCP_LISTEN) { + /* listeners aliased fields : + * tcpi_unacked -> Number of children ready for accept() + * tcpi_sacked -> max backlog + */ + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + return; + } info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2748,13 +2791,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (info->tcpi_state == TCP_LISTEN) { - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; - } else { - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; - } + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; @@ -2768,34 +2807,25 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; - info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; - info->tcpi_reordering = tp->reordering; info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; - rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; - put_unaligned(rate64, &info->tcpi_pacing_rate); + slow = lock_sock_fast(sk); - rate = READ_ONCE(sk->sk_max_pacing_rate); - rate64 = rate != ~0U ? 
rate : ~0ULL; - put_unaligned(rate64, &info->tcpi_max_pacing_rate); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); + tcp_get_info_chrono_stats(tp, info); + + unlock_sock_fast(sk, slow); - do { - start = u64_stats_fetch_begin_irq(&tp->syncp); - put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); - put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); - } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; - notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); - info->tcpi_notsent_bytes = max(0, notsent_bytes); - info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; @@ -2806,11 +2836,31 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); - put_unaligned(rate64, &info->tcpi_delivery_rate); + info->tcpi_delivery_rate = rate64; } } EXPORT_SYMBOL_GPL(tcp_get_info); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *stats; + struct tcp_info info; + + stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + if (!stats) + return NULL; + + tcp_get_info_chrono_stats(tp, &info); + nla_put_u64_64bit(stats, TCP_NLA_BUSY, + info.tcpi_busy_time, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, + info.tcpi_rwnd_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, + info.tcpi_sndbuf_limited, TCP_NLA_PAD); + return stats; +} + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0ea66c2c9344..b89bce4c721e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -14,6 +14,36 @@ * observed, or adjust the sending rate if it estimates there is a * traffic policer, in order to keep the drop rate reasonable. * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * * BBR is described in detail in: * "BBR: Congestion-Based Congestion Control", * Neal Cardwell, Yuchung Cheng, C. 
Stephen Gunn, Soheil Hassas Yeganeh, @@ -51,7 +81,7 @@ enum bbr_mode { BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ BBR_DRAIN, /* drain any queue created during startup */ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ - BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ }; /* BBR congestion control block */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index f9038d6b109e..79c4817abc94 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret = 0; - /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { + /* all algorithms must implement these */ + if (!ca->ssthresh || !ca->undo_cwnd || + !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } @@ -443,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); +u32 tcp_reno_undo_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd, tp->snd_ssthresh << 1); +} +EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); + struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, }; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index ab37c6775630..5f5e5936760e 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -335,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = { static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = THIS_MODULE, .name = "dctcp-reno", diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index db7842495a64..6d9879e93648 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -94,6 +94,7 @@ static const struct hstcp_aimd_val { struct hstcp { u32 ai; + u32 loss_cwnd; }; static void hstcp_init(struct sock *sk) @@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 hstcp_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - const struct hstcp *ca = inet_csk_ca(sk); + struct hstcp *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } +static u32 hstcp_cwnd_undo(struct sock *sk) +{ + const struct hstcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, + .undo_cwnd = hstcp_cwnd_undo, .cong_avoid = hstcp_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 083831e359df..0f7175c3338e 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index c8e6d86be114..60352ff4f5a8 100644 
--- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -48,6 +48,7 @@ struct illinois { u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Muliplicative decrease */ + u32 loss_cwnd; /* cwnd on loss */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ @@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } +static u32 tcp_illinois_cwnd_undo(struct sock *sk) +{ + const struct illinois *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) @@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, + .undo_cwnd = tcp_illinois_cwnd_undo, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c71d49ce0c93..6c790754ae3e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,6 +85,7 @@ int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +EXPORT_SYMBOL(sysctl_tcp_timestamps); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -2414,10 +2415,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->undo_cwnd) - tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); - else - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -3201,6 +3199,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->lost_skb_hint = NULL; } + if (!skb) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; @@ -3371,9 +3372,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) u32 delta = ack - tp->snd_una; sock_owned_by_me((struct sock *)tp); - u64_stats_update_begin_raw(&tp->syncp); tp->bytes_acked += delta; - u64_stats_update_end_raw(&tp->syncp); tp->snd_una = ack; } @@ -3383,9 +3382,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) u32 delta = seq - tp->rcv_nxt; sock_owned_by_me((struct sock *)tp); - u64_stats_update_begin_raw(&tp->syncp); tp->bytes_received += delta; - u64_stats_update_end_raw(&tp->syncp); tp->rcv_nxt = seq; } @@ -5083,8 +5080,11 @@ static void tcp_check_space(struct sock *sk) /* pairs with tcp_poll() */ smp_mb__after_atomic(); if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } } } @@ -6318,13 +6318,7 @@ int tcp_conn_request(struct 
request_sock_ops *rsk_ops, goto drop; } - - /* Accept backlog is full. If we have already queued enough - * of warm entries in syn queue, drop request. It is better than - * clogging syn queue with openreqs with exponentially increasing - * timeout. - */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } @@ -6334,6 +6328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; tcp_rsk(req)->af_specific = af_ops; + tcp_rsk(req)->ts_off = 0; tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; @@ -6355,6 +6350,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; + if (isn && tmp_opt.tstamp_ok) + af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + if (!want_cookie && !isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering @@ -6395,7 +6393,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); } if (!dst) { dst = af_ops->route_req(sk, &fl, req, NULL); @@ -6407,6 +6405,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + tcp_rsk(req)->ts_off = 0; req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2259114c7242..30d81f533ada 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -95,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); + tcp_hdr(skb)->source, tsoff); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -237,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, - usin->sin_port); + usin->sin_port, + &tp->tsoffset); inet->inet_id = tp->write_seq ^ jiffies; @@ -442,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { - if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } goto out; @@ -691,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) offsetof(struct inet_timewait_sock, tw_bound_dev_if)); arg.tos = ip_hdr(skb)->tos; + arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -711,7 +713,7 @@ out: outside socket context is ugly, certainly. What can I do? 
*/ -static void tcp_v4_send_ack(struct net *net, +static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, @@ -726,6 +728,7 @@ static void tcp_v4_send_ack(struct net *net, #endif ]; } rep; + struct net *net = sock_net(sk); struct ip_reply_arg arg; memset(&rep.th, 0, sizeof(struct tcphdr)); @@ -775,6 +778,7 @@ static void tcp_v4_send_ack(struct net *net, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -790,7 +794,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(sock_net(sk), skb, + tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, @@ -818,10 +822,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ - tcp_v4_send_ack(sock_net(sk), skb, seq, + tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp, + tcp_time_stamp + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -1908,7 +1912,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); + spin_lock(&ilb->lock); sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; @@ -1925,7 +1929,7 @@ get_sk: if (sk->sk_family == st->family) return sk; } - spin_unlock_bh(&ilb->lock); + spin_unlock(&ilb->lock); st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) goto get_head; @@ -2133,7 +2137,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index c67ece1390c2..046fd3910873 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) static struct tcp_congestion_ops tcp_lp __read_mostly = { .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_lp_cong_avoid, .pkts_acked = tcp_lp_pkts_acked, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..d46f4d5b1c62 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -742,14 +742,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, rcu_read_unlock(); } -static struct genl_family tcp_metrics_nl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = TCP_METRICS_GENL_NAME, - .version = TCP_METRICS_GENL_VERSION, - .maxattr = TCP_METRICS_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, @@ -1116,6 +1109,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; +static struct genl_family tcp_metrics_nl_family 
__ro_after_init = { + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tcp_metrics_nl_ops, + .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), +}; + static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { @@ -1179,8 +1183,7 @@ void __init tcp_metrics_init(void) if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); - ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops); + ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6234ebaa7db1..28ce5ee831f5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } - newtp->tsoffset = 0; + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) + tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 896e9dfbdb5c..b45101f3d2bd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb); + opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data) list_del(&tp->tsq_node); sk = (struct sock *)tp; - bh_lock_sock(sk); - - if (!sock_owned_by_user(sk)) { - tcp_tsq_handler(sk); - } else { - /* defer the work to tcp_release_cb() */ - set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); + + if (!sk->sk_lock.owned && + test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); + tcp_tsq_handler(sk); + } + bh_unlock_sock(sk); } - bh_unlock_sock(sk); - clear_bit(TSQ_QUEUED, &tp->tsq_flags); sk_free(sk); } } -#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ - (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED) | \ - (1UL << TCP_MTU_REDUCED_DEFERRED)) +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data) */ void tcp_release_cb(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nflags; /* perform an atomic operation only if at least one flag is set */ do { - flags = tp->tsq_flags; + flags = sk->sk_tsq_flags; if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; - } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - if (flags & 
(1UL << TCP_TSQ_DEFERRED)) + if (flags & TCPF_TSQ_DEFERRED) tcp_tsq_handler(sk); /* Here begins the tricky part : @@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk) */ sock_release_ownership(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { + if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { + if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } @@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nval, oval; int wmem; /* Keep one reference on sk_wmem_alloc. @@ -877,16 +878,25 @@ void tcp_wfree(struct sk_buff *skb) if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; - if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && - !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { - unsigned long flags; + for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { struct tsq_tasklet *tsq; + bool empty; + + if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) + goto out; + + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); + if (nval != oval) + continue; /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); + empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); - tasklet_schedule(&tsq->tasklet); + if (empty) + tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; } @@ -1514,6 +1524,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (sysctl_tcp_slow_start_after_idle && (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); + + /* The following conditions together indicate the starvation + * is caused by insufficient sender buffer: + * 1) just sent some data (see tcp_write_xmit) + * 2) not cwnd limited (this else condition) + * 3) no more data to send (null tcp_send_head ) + * 4) application is hitting buffer limit (SOCK_NOSPACE) + */ + if (!tcp_send_head(sk) && sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && + (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -1910,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) */ static int tcp_mtu_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); - int len; int probe_size; int size_needed; - int copy; + int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and - * not SACKing (the variable headers throw things off) */ - if (!icsk->icsk_mtup.enabled || - icsk->icsk_mtup.probe_size || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open || - tp->snd_cwnd < 11 || - tp->rx_opt.num_sacks || tp->rx_opt.dsack) + * not SACKing (the variable headers throw things off) + */ + if (likely(!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, @@ -2069,7 
+2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* Always send the 1st or 2nd skb in write queue. + * No need to wait for TX completion to call us back, + * after softirq/tasklet schedule. + * This helps when TX completions are delayed too much. + */ + if (skb == sk->sk_write_queue.next || + skb->prev == sk->sk_write_queue.next) + return false; + + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. @@ -2081,6 +2112,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, return false; } +static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) +{ + const u32 now = tcp_time_stamp; + + if (tp->chrono_type > TCP_CHRONO_UNSPEC) + tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + tp->chrono_start = now; + tp->chrono_type = new; +} + +void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If there are multiple conditions worthy of tracking in a + * chronograph then the highest priority enum takes precedence + * over the other conditions. So that if something "more interesting" + * starts happening, stop the previous chrono and start a new one. + */ + if (type > tp->chrono_type) + tcp_chrono_set(tp, type); +} + +void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + + /* There are multiple conditions worthy of tracking in a + * chronograph, so that the highest priority enum takes + * precedence over the other conditions (see tcp_chrono_start). + * If a condition stops, we only stop chrono tracking if + * it's the "most interesting" or current chrono we are + * tracking and starts busy chrono if we have pending data. + */ + if (tcp_write_queue_empty(sk)) + tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); + else if (type == tp->chrono_type) + tcp_chrono_set(tp, TCP_CHRONO_BUSY); +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. 
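The tcp_chrono_* helpers added in the hunk above are pure bookkeeping: elapsed time is charged to whichever state is currently active, tcp_chrono_start() only switches to a "more interesting" (higher-valued) state, and tcp_chrono_stop() falls back to BUSY while data is still queued or to UNSPEC once the write queue drains. The following is a minimal user-space sketch of that accounting, not the kernel code: the names (chrono_sample, chrono_set/start/stop, the explicit now and write_queue_empty parameters) are illustrative stand-ins for tcp_sock, tcp_time_stamp and tcp_write_queue_empty().

/*
 * Sketch of the chronograph bookkeeping: time is charged to the state
 * being left, start() may only upgrade priority, stop() falls back to
 * BUSY (data still queued) or UNSPEC (queue empty).
 */
#include <stdio.h>

enum chrono {
	CHRONO_UNSPEC,
	CHRONO_BUSY,		/* actively sending */
	CHRONO_RWND_LIMITED,	/* stalled by receiver window */
	CHRONO_SNDBUF_LIMITED,	/* stalled by sender buffer */
	__CHRONO_MAX,
};

struct chrono_sample {
	unsigned int stat[__CHRONO_MAX - 1];	/* accumulated time per state */
	unsigned int start;			/* timestamp of current state */
	enum chrono type;			/* currently active state */
};

static void chrono_set(struct chrono_sample *c, enum chrono new, unsigned int now)
{
	/* charge the elapsed time to the state we are leaving */
	if (c->type > CHRONO_UNSPEC)
		c->stat[c->type - 1] += now - c->start;
	c->start = now;
	c->type = new;
}

static void chrono_start(struct chrono_sample *c, enum chrono type, unsigned int now)
{
	/* a higher-priority ("more interesting") state preempts the current one */
	if (type > c->type)
		chrono_set(c, type, now);
}

static void chrono_stop(struct chrono_sample *c, enum chrono type,
			unsigned int now, int write_queue_empty)
{
	if (write_queue_empty)
		chrono_set(c, CHRONO_UNSPEC, now);
	else if (type == c->type)
		chrono_set(c, CHRONO_BUSY, now);
}

int main(void)
{
	struct chrono_sample c = { .type = CHRONO_BUSY, .start = 0 };

	chrono_start(&c, CHRONO_RWND_LIMITED, 10);	/* receiver window closed */
	chrono_stop(&c, CHRONO_RWND_LIMITED, 25, 0);	/* window opened, still busy */
	chrono_stop(&c, CHRONO_BUSY, 40, 1);		/* write queue drained */

	printf("busy=%u rwnd_limited=%u sndbuf_limited=%u\n",
	       c.stat[CHRONO_BUSY - 1],
	       c.stat[CHRONO_RWND_LIMITED - 1],
	       c.stat[CHRONO_SNDBUF_LIMITED - 1]);
	return 0;
}

Run against the timeline in main(), this prints busy=25 rwnd_limited=15 sndbuf_limited=0, matching how tcp_write_xmit() in the next hunk starts or stops TCP_CHRONO_RWND_LIMITED depending on whether the send window test failed.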
@@ -2103,7 +2175,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; - bool is_cwnd_limited = false; + bool is_cwnd_limited = false, is_rwnd_limited = false; u32 max_segs; sent_pkts = 0; @@ -2140,8 +2212,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + is_rwnd_limited = true; break; + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, @@ -2167,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; + if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -2186,6 +2262,11 @@ repair: break; } + if (is_rwnd_limited) + tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); + else + tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2514,7 +2595,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, } /* Collapses two adjacent SKB's during retransmission. */ -static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) +static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); @@ -2525,13 +2606,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); + if (next_skb_size) { + if (next_skb_size <= skb_availroom(skb)) + skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), + next_skb_size); + else if (!skb_shift(skb, next_skb, next_skb_size)) + return false; + } tcp_highest_sack_combine(sk, next_skb, skb); tcp_unlink_write_queue(next_skb, sk); - skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), - next_skb_size); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2560,6 +2645,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); sk_wmem_free_skb(sk, next_skb); + return true; } /* Check if coalescing SKBs is legal. 
*/ @@ -2567,14 +2653,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; - /* TODO: SACK collapsing could be used to remove this condition */ - if (skb_shinfo(skb)->nr_frags != 0) - return false; if (skb_cloned(skb)) return false; if (skb == tcp_send_head(sk)) return false; - /* Some heurestics for collapsing over SACK'd could be invented */ + /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2612,16 +2695,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (space < 0) break; - /* Punt if not enough space exists in the first SKB for - * the data in the second - */ - if (skb->len > skb_availroom(to)) - break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; - tcp_collapse_retrans(sk, to); + if (!tcp_collapse_retrans(sk, to)) + break; } } @@ -3300,6 +3379,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) fo->copied = space; tcp_connect_queue_skb(sk, syn_data); + if (syn_data->len) + tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); @@ -3464,8 +3545,6 @@ void tcp_send_ack(struct sock *sk) /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 - * We also avoid tcp_wfree() overhead (cache line miss accessing - * tp->tsq_flags) by using regular sock_wfree() */ skb_set_tcp_pure_ack(buff); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index bf5ea9e9bbc1..f2123075ce6e 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,6 +15,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 +struct scalable { + u32 loss_cwnd; +}; + static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + struct scalable *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } +static u32 tcp_scalable_cwnd_undo(struct sock *sk) +{ + const struct scalable *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, + .undo_cwnd = tcp_scalable_cwnd_undo, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3ea1cf804748..3705075f42c3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); @@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); diff --git 
a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4c4bac1b5eab..218cfcc77650 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_vegas_cong_avoid, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 40171e163cff..76005d4b8dfc 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -30,6 +30,7 @@ struct veno { u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ + u32 loss_cwnd; /* cwnd when loss occured */ }; /* There are several situations when we must "re-start" Veno: @@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); + veno->loss_cwnd = tp->snd_cwnd; if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); @@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } +static u32 tcp_veno_cwnd_undo(struct sock *sk) +{ + const struct veno *veno = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd); +} + static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, + .undo_cwnd = tcp_veno_cwnd_undo, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 4b03a2e2a050..fed66dc0e0f5 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 9c5fc973267f..e6ff99c4bd3b 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -37,6 +37,7 @@ struct yeah { u32 fast_count; u32 pkts_acked; + u32 loss_cwnd; }; static void tcp_yeah_init(struct sock *sk) @@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); + yeah->loss_cwnd = tp->snd_cwnd; return max_t(int, tp->snd_cwnd - reduction, 2); } +static u32 tcp_yeah_cwnd_undo(struct sock *sk) +{ + const struct yeah *yeah = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd); +} + static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, + .undo_cwnd = tcp_yeah_cwnd_undo, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5bab6c3f7a2f..9ca279b130d5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -580,7 +580,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); * Does increment socket refcount. 
*/ #if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ - IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \ + IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { @@ -1019,7 +1020,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport); + faddr, saddr, dport, inet->inet_sport, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); @@ -1172,6 +1174,181 @@ out: return ret; } +/* fully reclaim rmem/fwd memory allocated for skb */ +static void udp_rmem_release(struct sock *sk, int size, int partial) +{ + struct udp_sock *up = udp_sk(sk); + int amt; + + if (likely(partial)) { + up->forward_deficit += size; + size = up->forward_deficit; + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&sk->sk_receive_queue)) + return; + } else { + size += up->forward_deficit; + } + up->forward_deficit = 0; + + sk->sk_forward_alloc += size; + amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); + sk->sk_forward_alloc -= amt; + + if (amt) + __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); + + atomic_sub(size, &sk->sk_rmem_alloc); +} + +/* Note: called with sk_receive_queue.lock held. + * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch + * This avoids a cache line miss while receive_queue lock is held. + * Look at __udp_enqueue_schedule_skb() to find where this copy is done. + */ +void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) +{ + udp_rmem_release(sk, skb->dev_scratch, 1); +} +EXPORT_SYMBOL(udp_skb_destructor); + +/* Idea of busylocks is to let producers grab an extra spinlock + * to relieve pressure on the receive_queue spinlock shared by consumer. + * Under flood, this means that only one producer can be in line + * trying to acquire the receive_queue spinlock. 
+ * These busylock can be allocated on a per cpu manner, instead of a + * per socket one (that would consume a cache line per socket) + */ +static int udp_busylocks_log __read_mostly; +static spinlock_t *udp_busylocks __read_mostly; + +static spinlock_t *busylock_acquire(void *ptr) +{ + spinlock_t *busy; + + busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); + spin_lock(busy); + return busy; +} + +static void busylock_release(spinlock_t *busy) +{ + if (busy) + spin_unlock(busy); +} + +int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff_head *list = &sk->sk_receive_queue; + int rmem, delta, amt, err = -ENOMEM; + spinlock_t *busy = NULL; + int size; + + /* try to avoid the costly atomic add/sub pair when the receive + * queue is full; always allow at least a packet + */ + rmem = atomic_read(&sk->sk_rmem_alloc); + if (rmem > sk->sk_rcvbuf) + goto drop; + + /* Under mem pressure, it might be helpful to help udp_recvmsg() + * having linear skbs : + * - Reduce memory overhead and thus increase receive queue capacity + * - Less cache line misses at copyout() time + * - Less work at consume_skb() (less alien page frag freeing) + */ + if (rmem > (sk->sk_rcvbuf >> 1)) { + skb_condense(skb); + + busy = busylock_acquire(sk); + } + size = skb->truesize; + /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss + * in udp_skb_destructor() + */ + skb->dev_scratch = size; + + /* we drop only if the receive buf is full and the receive + * queue contains some other skb + */ + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); + if (rmem > (size + sk->sk_rcvbuf)) + goto uncharge_drop; + + spin_lock(&list->lock); + if (size >= sk->sk_forward_alloc) { + amt = sk_mem_pages(size); + delta = amt << SK_MEM_QUANTUM_SHIFT; + if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) { + err = -ENOBUFS; + spin_unlock(&list->lock); + goto uncharge_drop; + } + + sk->sk_forward_alloc += delta; + } + + sk->sk_forward_alloc -= size; + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + sock_skb_set_dropcount(sk, skb); + + __skb_queue_tail(list, skb); + spin_unlock(&list->lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + + busylock_release(busy); + return 0; + +uncharge_drop: + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + +drop: + atomic_inc(&sk->sk_drops); + busylock_release(busy); + return err; +} +EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); + +void udp_destruct_sock(struct sock *sk) +{ + /* reclaim completely the forward allocated memory */ + unsigned int total = 0; + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + total += skb->truesize; + kfree_skb(skb); + } + udp_rmem_release(sk, total, 0); + + inet_sock_destruct(sk); +} +EXPORT_SYMBOL_GPL(udp_destruct_sock); + +int udp_init_sock(struct sock *sk) +{ + sk->sk_destruct = udp_destruct_sock; + return 0; +} +EXPORT_SYMBOL_GPL(udp_init_sock); + +void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) +{ + if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) { + bool slow = lock_sock_fast(sk); + + sk_peek_offset_bwd(sk, len); + unlock_sock_fast(sk, slow); + } + consume_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_consume_udp); + /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1181,12 +1358,11 @@ out: */ static int first_packet_length(struct sock *sk) { - struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue; + struct sk_buff_head *rcvq = 
&sk->sk_receive_queue; struct sk_buff *skb; + int total = 0; int res; - __skb_queue_head_init(&list_kill); - spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { @@ -1196,18 +1372,13 @@ static int first_packet_length(struct sock *sk) IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); - __skb_queue_tail(&list_kill, skb); + total += skb->truesize; + kfree_skb(skb); } res = skb ? skb->len : -1; + if (total) + udp_rmem_release(sk, total, 1); spin_unlock_bh(&rcvq->lock); - - if (!skb_queue_empty(&list_kill)) { - bool slow = lock_sock_fast(sk); - - __skb_queue_purge(&list_kill); - sk_mem_reclaim_partial(sk); - unlock_sock_fast(sk, slow); - } return res; } @@ -1256,15 +1427,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; - bool slow; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len, addr_len); try_again: peeking = off = sk_peek_offset(sk, flags); - skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; @@ -1281,7 +1450,8 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { + if (copied < ulen || peeking || + (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; @@ -1297,13 +1467,12 @@ try_again: } if (unlikely(err)) { - trace_kfree_skb(skb, udp_recvmsg); if (!peeked) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - skb_free_datagram_locked(sk, skb); + kfree_skb(skb); return err; } @@ -1322,22 +1491,21 @@ try_again: *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); + ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; - __skb_free_datagram_locked(sk, skb, peeking ? -err : err); + skb_consume_udp(sk, skb, peeking ? 
-err : err); return err; csum_copy_err: - slow = lock_sock_fast(sk); - if (!skb_kill_datagram(sk, skb, flags)) { + if (!__sk_queue_drop_skb(sk, skb, flags)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - unlock_sock_fast(sk, slow); + kfree_skb(skb); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -1463,9 +1631,11 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); + } else { + sk_mark_napi_id_once(sk, skb); } - rc = __sock_queue_rcv_skb(sk, skb); + rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1480,7 +1650,6 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } return 0; - } static struct static_key udp_encap_needed __read_mostly; @@ -1502,7 +1671,6 @@ EXPORT_SYMBOL(udp_encap_enable); int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); - int rc; int is_udplite = IS_UDPLITE(sk); /* @@ -1589,25 +1757,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; udp_csum_pull_header(skb); - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); - goto drop; - } - - rc = 0; ipv4_pktinfo_prepare(sk, skb); - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - rc = __udp_queue_rcv_skb(sk, skb); - else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { - bh_unlock_sock(sk); - goto drop; - } - bh_unlock_sock(sk); - - return rc; + return __udp_queue_rcv_skb(sk, skb); csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); @@ -2217,13 +2369,13 @@ struct proto udp_prot = { .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, + .init = udp_init_sock, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = __udp_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, @@ -2512,6 +2664,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd); void __init udp_init(void) { unsigned long limit; + unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -2522,4 +2675,13 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; + + /* 16 spinlocks per cpu */ + udp_busylocks_log = ilog2(nr_cpu_ids) + 4; + udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, + GFP_KERNEL); + if (!udp_busylocks) + panic("UDP: failed to alloc udp_busylocks\n"); + for (i = 0; i < (1U << udp_busylocks_log); i++) + spin_lock_init(udp_busylocks + i); } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index ff450c2aad9b..59f10fe9782e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -50,10 +50,11 @@ struct proto udplite_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT |
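The busylock scheme wired up in udp_init() above (16 spinlocks per possible CPU, selected by hashing the socket pointer) exists so that, under flood, only one producer per socket waits in line for the heavily contended sk_receive_queue lock, and the pool is shared rather than per-socket to avoid burning a cache line in every struct sock. Below is a rough user-space sketch of the idea using POSIX spinlocks; the pool size, the pointer hash, and the names busylock_pool/busylock_acquire/busylock_release are simplified stand-ins for the kernel's ilog2(nr_cpu_ids) + 4 sizing, hash_ptr() and the static helpers shown earlier in this diff.

/*
 * Sketch: a small shared pool of spinlocks indexed by a pointer hash,
 * taken by producers before they contend on the per-socket queue lock.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define POOL_LOG  6			/* 64 locks; the kernel derives this from nr_cpu_ids */
#define POOL_SIZE (1U << POOL_LOG)

static pthread_spinlock_t busylock_pool[POOL_SIZE];

/* cheap pointer hash, standing in for the kernel's hash_ptr() */
static unsigned int ptr_hash(const void *ptr, unsigned int bits)
{
	uint64_t v = (uintptr_t)ptr;

	v *= 0x9e3779b97f4a7c15ULL;	/* golden-ratio multiplier */
	return (unsigned int)(v >> (64 - bits));
}

static pthread_spinlock_t *busylock_acquire(const void *sk)
{
	pthread_spinlock_t *busy = &busylock_pool[ptr_hash(sk, POOL_LOG)];

	pthread_spin_lock(busy);
	return busy;
}

static void busylock_release(pthread_spinlock_t *busy)
{
	if (busy)
		pthread_spin_unlock(busy);
}

int main(void)
{
	int dummy_socket;		/* stands in for a struct sock */
	pthread_spinlock_t *busy;
	unsigned int i;

	for (i = 0; i < POOL_SIZE; i++)
		pthread_spin_init(&busylock_pool[i], PTHREAD_PROCESS_PRIVATE);

	/* a producer only takes this once the receive queue is already half full */
	busy = busylock_acquire(&dummy_socket);
	/* ... enqueue, serialized against other producers targeting this socket ... */
	busylock_release(busy);

	printf("socket %p hashes to busylock %u\n",
	       (void *)&dummy_socket, ptr_hash(&dummy_socket, POOL_LOG));
	return 0;
}

In __udp_enqueue_schedule_skb() the lock is deliberately optional: it is only acquired when sk_rmem_alloc already exceeds half of sk_rcvbuf, so the fast path of a lightly loaded socket never touches the shared pool at all.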