diff options
Diffstat (limited to 'net/ipv4')
46 files changed, 1225 insertions, 1472 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f48fe6fc7e8c..80dad301361d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -212,9 +212,14 @@ config NET_IPGRE_BROADCAST Network), but can be distributed all over the Internet. If you want to do that, say Y here and to "IP multicast routing" below. +config IP_MROUTE_COMMON + bool + depends on IP_MROUTE || IPV6_MROUTE + config IP_MROUTE bool "IP: multicast routing" depends on IP_MULTICAST + select IP_MROUTE_COMMON help This is used if you want your machine to act as a router for IP packets that have several destination addresses. It is needed on the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 47a0a6649a9d..a07b7dd06def 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e4329e161943..eaed0367e669 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -432,23 +432,37 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - unsigned short snum; - int chk_addr_ret; - u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; + return sk->sk_prot->bind(sk, uaddr, addr_len); } - err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) - goto out; + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + return err; + + return __inet_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet_bind); + +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + unsigned short snum; + int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; + int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) @@ -492,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; @@ -504,11 +519,18 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) @@ -521,22 +543,29 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); err = 0; out_release_sock: - release_sock(sk); + if (with_lock) + release_sock(sk); out: return err; } -EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -617,6 +646,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; @@ -723,7 +758,7 @@ EXPORT_SYMBOL(inet_accept); * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -745,8 +780,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_addr.s_addr = addr; } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } EXPORT_SYMBOL(inet_getname); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f28f06c91ead..be4c595edccb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1434,7 +1434,7 @@ static const struct file_operations arp_seq_fops = { static int __net_init arp_net_init(struct net *net) { - if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops)) + if (!proc_create("arp", 0444, net->proc_net, &arp_seq_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 296d0b956bfe..97689012b357 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -654,7 +654,7 @@ static void esp_input_restore_header(struct sk_buff *skb) static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) { struct xfrm_state *x = xfrm_input_state(skb); - struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + struct ip_esp_hdr *esph; /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index da5635fc52c2..7cf755ef9efb 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -138,6 +138,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 35d646a62ad4..737d11bc8838 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -182,6 +182,17 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; + if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) + return 0; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + return 0; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + return 0; + return 1; } @@ -244,6 +255,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } #endif + if (fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect++; + rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; @@ -272,6 +286,10 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + + if (net->ipv4.fib_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect--; errout: return err; } @@ -389,6 +407,7 @@ int __net_init fib4_rules_init(struct net *net) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; + net->ipv4.fib_rules_require_fldissect = 0; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7d36a950d961..c27122f01b87 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -171,7 +171,7 @@ static void free_nh_exceptions(struct fib_nh *nh) fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; - + next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); @@ -1746,18 +1746,20 @@ void fib_select_multipath(struct fib_result *res, int hash) bool first = false; for_nexthops(fi) { + if (net->ipv4.sysctl_fib_multipath_use_neigh) { + if (!fib_good_nh(nh)) + continue; + if (!first) { + res->nh_sel = nhsel; + first = true; + } + } + if (hash > atomic_read(&nh->nh_upper_bound)) continue; - if (!net->ipv4.sysctl_fib_multipath_use_neigh || - fib_good_nh(nh)) { - res->nh_sel = nhsel; - return; - } - if (!first) { - res->nh_sel = nhsel; - first = true; - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); } #endif @@ -1765,14 +1767,12 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { - bool oif_check; - - oif_check = (fl4->flowi4_oif == 0 || - fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) + goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && oif_check) { - int h = fib_multipath_hash(res->fi, fl4, skb); + if (res->fi->fib_nhs > 1) { + int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } @@ -1780,10 +1780,10 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && oif_check) + res->type == RTN_UNICAST) fib_select_default(fl4, res); +check_saddr: if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, *res); } -EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5530cd6fdbc7..3dcffd3ce98c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -50,6 +50,7 @@ #define VERSION "0.409" +#include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> @@ -191,8 +192,8 @@ static size_t tnode_free_size; */ static const int sync_pages = 128; -static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct kmem_cache *trie_leaf_kmem __read_mostly; +static struct kmem_cache *fn_alias_kmem __ro_after_init; +static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { @@ -1064,6 +1065,9 @@ noleaf: return -ENOMEM; } +/* fib notifier for ADD is sent before calling fib_insert_alias with + * the expectation that the only possible failure ENOMEM + */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1215,8 +1219,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, extack); + err = call_fib_entry_notifiers(net, + FIB_EVENT_ENTRY_REPLACE, + key, plen, new_fa, + extack); + if (err) + goto out_free_new_fa; + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1262,21 +1271,32 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); + if (err) + goto out_free_new_fa; + /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_free_new_fa; + goto out_fib_notif; if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; +out_fib_notif: + /* notifier was sent that entry would be added to trie, but + * the add failed and need to recover. Only failure for + * fib_insert_alias is ENOMEM. + */ + NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, + plen, new_fa, NULL); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -2721,14 +2741,14 @@ static const struct file_operations fib_route_fops = { int __net_init fib_proc_init(struct net *net) { - if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) + if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops)) goto out1; - if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, + if (!proc_create("fib_triestat", 0444, net->proc_net, &fib_triestat_fops)) goto out2; - if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) + if (!proc_create("route", 0444, net->proc_net, &fib_route_fops)) goto out3; return 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f2402581fef1..b26a81a7de42 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2993,10 +2993,10 @@ static int __net_init igmp_net_init(struct net *net) struct proc_dir_entry *pde; int err; - pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); + pde = proc_create("igmp", 0444, net->proc_net, &igmp_mc_seq_fops); if (!pde) goto out_igmp; - pde = proc_create("mcfilter", S_IRUGO, net->proc_net, + pde = proc_create("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e8ec28999f5c..c9e35b81d093 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - if (!hlist_unhashed(&q->list_evictor)) - return false; - - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -214,83 +59,75 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +static void inet_frags_free_cb(void *ptr, void *arg) { - unsigned int seq; - int i; - - nf->low_thresh = 0; + struct inet_frag_queue *fq = ptr; -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) -{ - struct inet_frag_bucket *hb; - unsigned int seq, hash; - - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. + * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + +void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + struct inet_frags *f; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -298,6 +135,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) /* Release all fragment data. */ fp = q->fragments; nf = q->net; + f = nf->f; while (fp) { struct sk_buff *xp = fp->next; @@ -307,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -371,70 +170,51 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); - -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) -{ - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; - - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); -} -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 914d56928578..1f04bd91fc2e 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -6,6 +6,7 @@ * Authors: Andrey V. Savochkin <saw@msu.ru> */ +#include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> @@ -51,7 +52,7 @@ * daddr: unchangeable */ -static struct kmem_cache *peer_cachep __read_mostly; +static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..994fa70a910f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,27 +57,13 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -89,49 +75,9 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv4.frags); -} - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +86,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -168,7 +109,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q, &ip4_frags); + inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, @@ -176,7 +117,7 @@ static void ipq_put(struct ipq *ipq) */ static void ipq_kill(struct ipq *ipq) { - inet_frag_kill(&ipq->q, &ip4_frags); + inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) @@ -194,8 +135,11 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct ipq *qp; + const struct iphdr *iph; + struct sk_buff *head; struct net *net; + struct ipq *qp; + int err; qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); @@ -209,46 +153,38 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *clone, *head = qp->q.fragments; - const struct iphdr *iph; - int err; + head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); - if (err) - goto out; + if (err) + goto out; + + /* Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (frag_expire_skip_icmp(qp->q.key.v4.user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) + goto out; + + skb_get(head); + spin_unlock(&qp->q.lock); + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - - clone = skb_clone(head, GFP_ATOMIC); - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } - } out: spin_unlock(&qp->q.lock); out_rcu_unlock: @@ -262,21 +198,20 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&net->ipv4.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct ipq, q); } @@ -410,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -427,7 +362,7 @@ found: * any overlaps are eliminated. */ if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; + int i = (prev->ip_defrag_offset + prev->len) - offset; if (i > 0) { offset += i; @@ -444,8 +379,8 @@ found: err = -ENOMEM; - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + while (next && next->ip_defrag_offset < end) { + int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment @@ -453,7 +388,7 @@ found: */ if (!pskb_pull(next, i)) goto err; - FRAG_CB(next)->offset += i; + next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; @@ -477,7 +412,13 @@ found: } } - FRAG_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + qp->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -488,11 +429,6 @@ found: else qp->q.fragments = skb; - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -568,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); @@ -656,7 +592,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -731,23 +667,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv4.frags.high_thresh }, @@ -846,6 +782,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -870,16 +808,21 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.max_dist = 64; - - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + net->ipv4.frags.f = &ip4_frags; + + res = inet_frags_init_net(&net->ipv4.frags); + if (res < 0) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); } static struct pernet_operations ip4_frags_ops = { @@ -887,17 +830,49 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags_ctl_register(); - register_pernet_subsys(&ip4_frags_ops); - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0901de42ed85..a8772a978224 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -522,6 +522,7 @@ err_free_skb: static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { + struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; @@ -545,9 +546,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = tun_info->key.tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -1317,6 +1320,12 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } +bool is_gretap_dev(const struct net_device *dev) +{ + return dev->netdev_ops == &gre_tap_netdev_ops; +} +EXPORT_SYMBOL_GPL(is_gretap_dev); + static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 57fc13c6ab2b..7582713dd18f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || - sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), net)) { + sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 66340ab750e6..94cacae76aca 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -876,6 +876,7 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -971,11 +972,10 @@ alloc_new_skb: (flags & MSG_DONTWAIT), &err); } else { skb = NULL; - if (refcount_read(&sk->sk_wmem_alloc) <= + if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, - alloclen + hh_len + 15, 1, - sk->sk_allocation); + skb = alloc_skb(alloclen + hh_len + 15, + sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } @@ -1033,6 +1033,11 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ + if (!skb->destructor) { + skb->destructor = sock_wfree; + skb->sk = sk; + wmem_alloc_delta += skb->truesize; + } __skb_queue_tail(queue, skb); continue; } @@ -1079,12 +1084,13 @@ alloc_new_skb: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); + wmem_alloc_delta += copy; } offset += copy; length -= copy; } + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: @@ -1092,6 +1098,7 @@ error_efault: error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 74c962b9b09c..5ad2d8ed3a3f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } - -/* Special input handler for packets caught by router alert option. - They are selected only by protocol field, and then processed likely - local ones; but only if someone wants them! Otherwise, router - not running rsvpd will kill RSVP. - - It is user level problem, what it will make with them. - I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), - but receiver should be enough clever f.e. to forward mtrace requests, - sent to multicast group to reach destination designated router. - */ -struct ip_ra_chain __rcu *ip_ra_chain; - - static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); @@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; + struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - for (rap = &ip_ra_chain; - (ra = rtnl_dereference(*rap)) != NULL; + mutex_lock(&net->ipv4.ra_mutex); + for (rap = &net->ipv4.ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { + mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); + mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); @@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) + if (!new_ra) { + mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; + } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); + mutex_unlock(&net->ipv4.ra_mutex); return 0; } @@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname) case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: - case IP_ROUTER_ALERT: return true; } return false; @@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ + if (optname == IP_ROUTER_ALERT) + return ip_ra_control(sk, val ? 1 : 0, NULL); if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); @@ -1149,9 +1144,6 @@ mc_msf_out: goto e_inval; inet->mc_all = val; break; - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; case IP_FREEBIND: if (optlen < 1) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a7fd1c5a2a14..de6d94482fe7 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -290,22 +290,6 @@ failed: return ERR_PTR(err); } -static inline void init_tunnel_flow(struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, - __u32 mark) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = proto; - fl4->fl4_gre_key = key; - fl4->flowi4_mark = mark; -} - static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; @@ -322,10 +306,10 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - init_tunnel_flow(&fl4, iph->protocol, iph->daddr, - iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark); + ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link, + tunnel->fwmark); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -365,8 +349,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, int mtu; int err; - BUG_ON(!itn->fb_tunnel_dev); - dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); @@ -590,8 +573,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; rt = ip_route_output_key(tunnel->net, &fl4); @@ -719,9 +702,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); + ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; @@ -847,7 +830,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); - BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { @@ -872,7 +854,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) p->o_key = 0; } - t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { @@ -1016,10 +998,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct ip_tunnel_parm parms; unsigned int i; + itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); - if (!ops) { + if (!ops || !net_has_fallback_tunnels(net)) { + struct ip_tunnel_net *it_init_net; + + it_init_net = net_generic(&init_net, ip_tnl_net_id); + itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } @@ -1037,6 +1024,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); + itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); @@ -1044,10 +1032,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, +static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, + struct list_head *head, struct rtnl_link_ops *ops) { - struct net *net = dev_net(itn->fb_tunnel_dev); struct net_device *dev, *aux; int h; @@ -1079,7 +1067,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); - ip_tunnel_destroy(itn, &list, ops); + ip_tunnel_destroy(net, itn, &list, ops); } unregister_netdevice_many(&list); rtnl_unlock(); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f75802ad960f..43f620feb1c4 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void) unsigned int i; #ifdef CONFIG_PROC_FS - proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops); + proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b05689bbba31..2fb4de3f7f66 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -28,9 +28,9 @@ #include <linux/uaccess.h> #include <linux/types.h> +#include <linux/cache.h> #include <linux/capability.h> #include <linux/errno.h> -#include <linux/timer.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> @@ -52,7 +52,6 @@ #include <net/protocol.h> #include <linux/skbuff.h> #include <net/route.h> -#include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> @@ -96,7 +95,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); * In this case data path is free of exclusive locks at all. */ -static struct kmem_cache *mrt_cachep __read_mostly; +static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); @@ -106,8 +105,6 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); @@ -118,6 +115,23 @@ static void ipmr_expire_process(struct timer_list *t); #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) +static struct mr_table *ipmr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + struct mr_table *ret; + + if (!mrt) + ret = list_entry_rcu(net->ipv4.mr_tables.next, + struct mr_table, list); + else + ret = list_entry_rcu(mrt->list.next, + struct mr_table, list); + + if (&ret->list == &net->ipv4.mr_tables) + return NULL; + return ret; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -285,6 +299,14 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) +static struct mr_table *ipmr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + if (!mrt) + return net->ipv4.mrt; + return NULL; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; @@ -344,7 +366,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params ipmr_rht_params = { - .head_offset = offsetof(struct mfc_cache, mnode), + .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, @@ -353,6 +375,24 @@ static const struct rhashtable_params ipmr_rht_params = { .automatic_shrinking = true, }; +static void ipmr_new_table_set(struct mr_table *mrt, + struct net *net) +{ +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); +#endif +} + +static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { + .mfc_mcastgrp = htonl(INADDR_ANY), + .mfc_origin = htonl(INADDR_ANY), +}; + +static struct mr_table_ops ipmr_mr_table_ops = { + .rht_params = &ipmr_rht_params, + .cmparg_any = &ipmr_mr_table_ops_cmparg_any, +}; + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -365,23 +405,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) if (mrt) return mrt; - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (!mrt) - return ERR_PTR(-ENOMEM); - write_pnet(&mrt->net, net); - mrt->id = id; - - rhltable_init(&mrt->mfc_hash, &ipmr_rht_params); - INIT_LIST_HEAD(&mrt->mfc_cache_list); - INIT_LIST_HEAD(&mrt->mfc_unres_queue); - - timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); - - mrt->mroute_reg_vif_num = -1; -#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES - list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); -#endif - return mrt; + return mr_table_alloc(net, id, &ipmr_mr_table_ops, + ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) @@ -619,80 +644,22 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif -static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct vif_device *vif, - vifi_t vif_index, u32 tb_id) -{ - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - return call_fib_notifier(nb, net, event_type, &info.info); -} - static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, vifi_t vif_index, u32 tb_id) { - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); -} - -static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct mfc_cache *mfc, u32 tb_id) -{ - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - return call_fib_notifier(nb, net, event_type, &info.info); + return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, + vif, vif_index, tb_id, + &net->ipv4.ipmr_seq); } static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); + return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type, + &mfc->_c, tb_id, &net->ipv4.ipmr_seq); } /** @@ -760,16 +727,15 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, static void ipmr_cache_free_rcu(struct rcu_head *head) { - struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); - kmem_cache_free(mrt_cachep, c); + kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } -void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free(struct mfc_cache *c) { - call_rcu(&c->rcu, ipmr_cache_free_rcu); + call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } -EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -782,7 +748,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { + while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); @@ -806,9 +772,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); - unsigned long now; + struct mr_mfc *c, *next; unsigned long expires; - struct mfc_cache *c, *next; + unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); @@ -830,8 +796,8 @@ static void ipmr_expire_process(struct timer_list *t) } list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); + ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) @@ -842,7 +808,7 @@ out: } /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, +static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; @@ -944,6 +910,10 @@ static int vif_add(struct net *net, struct mr_table *mrt, ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ + vif_device_init(v, dev, vifc->vifc_rate_limit, + vifc->vifc_threshold, + vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), + (VIFF_TUNNEL | VIFF_REGISTER)); attr.orig_dev = dev; if (!switchdev_port_attr_get(dev, &attr)) { @@ -952,20 +922,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, } else { v->dev_parent_id.id_len = 0; } - v->rate_limit = vifc->vifc_rate_limit; + v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; - v->flags = vifc->vifc_flags; - if (!mrtsock) - v->flags |= VIFF_STATIC; - v->threshold = vifc->vifc_threshold; - v->bytes_in = 0; - v->bytes_out = 0; - v->pkt_in = 0; - v->pkt_out = 0; - v->link = dev->ifindex; - if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -988,33 +947,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - return c; - - return NULL; -} - -/* Look for a (*,*,oif) entry */ -static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, - int vifi) -{ - struct mfc_cache_cmp_arg arg = { - .mfc_mcastgrp = htonl(INADDR_ANY), - .mfc_origin = htonl(INADDR_ANY) - }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - return NULL; + return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ @@ -1025,25 +959,10 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = htonl(INADDR_ANY) }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c, *proxy; if (mcastgrp == htonl(INADDR_ANY)) - goto skip; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) { - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - - /* It's ok if the vifi is part of the static tree */ - proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); - if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) - return c; - } - -skip: - return ipmr_cache_find_any_parent(mrt, vifi); + return mr_mfc_find_any_parent(mrt, vifi); + return mr_mfc_find_any(mrt, vifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ @@ -1055,15 +974,8 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin, }; - struct rhlist_head *tmp, *list; - struct mfc_cache *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (parent == -1 || parent == c->mfc_parent) - return c; - return NULL; + return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ @@ -1072,9 +984,10 @@ static struct mfc_cache *ipmr_cache_alloc(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c) { - c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; - c->mfc_un.res.minvif = MAXVIFS; - refcount_set(&c->mfc_un.res.refcount, 1); + c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; + c->_c.mfc_un.res.minvif = MAXVIFS; + c->_c.free = ipmr_cache_free_rcu; + refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; } @@ -1084,8 +997,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c) { - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); + c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; } return c; } @@ -1098,12 +1011,13 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsgerr *e; /* Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + if (mr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { @@ -1211,7 +1125,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, int err; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; @@ -1230,12 +1144,13 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } /* Fill in the new cache entry */ - c->mfc_parent = -1; + c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker @@ -1248,15 +1163,16 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc_unres_queue); + list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) - mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); + mod_timer(&mrt->ipmr_expire_timer, + c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ - if (c->mfc_un.unres.unresolved.qlen > 3) { + if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1264,7 +1180,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, skb->dev = dev; skb->skb_iif = dev->ifindex; } - skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -1286,11 +1202,11 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rcu_read_unlock(); if (!c) return -ENOENT; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); - list_del_rcu(&c->list); + rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); + list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mr_cache_put(&c->_c); return 0; } @@ -1299,6 +1215,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; + struct mr_mfc *_uc; bool found; int ret; @@ -1312,10 +1229,10 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, rcu_read_unlock(); if (c) { write_lock_bh(&mrt_lock); - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); @@ -1333,28 +1250,29 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; - ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } - list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); + list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { + uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { - list_del(&uc->list); + list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; @@ -1377,7 +1295,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, static void mroute_clean_tables(struct mr_table *mrt, bool all) { struct net *net = read_pnet(&mrt->net); - struct mfc_cache *c, *tmp; + struct mr_mfc *c, *tmp; + struct mfc_cache *cache; LIST_HEAD(list); int i; @@ -1395,18 +1314,20 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); - call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + cache = (struct mfc_cache *)c; + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + mr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + cache = (struct mfc_cache *)c; + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } @@ -1420,7 +1341,7 @@ static void mrtsock_destruct(struct sock *sk) struct net *net = sock_net(sk); struct mr_table *mrt; - ASSERT_RTNL(); + rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1432,6 +1353,7 @@ static void mrtsock_destruct(struct sock *sk) mroute_clean_tables(mrt, false); } } + rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole @@ -1496,8 +1418,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { + /* We need to unlock here because mrtsock_destruct takes + * care of rtnl itself and we can't change that due to + * the IP_ROUTER_ALERT setsockopt which runs without it. + */ + rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); - goto out_unlock; + goto out; } break; case MRT_ADD_VIF: @@ -1609,6 +1536,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, } out_unlock: rtnl_unlock(); +out: return ret; } @@ -1698,9 +1626,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1772,9 +1700,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1998,26 +1926,26 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, - struct mfc_cache *cache, int local) + struct mfc_cache *c, int local) { int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; - vif = cache->mfc_parent; - cache->mfc_un.res.pkt++; - cache->mfc_un.res.bytes += skb->len; - cache->mfc_un.res.lastuse = jiffies; + vif = c->_c.mfc_parent; + c->_c.mfc_un.res.pkt++; + c->_c.mfc_un.res.bytes += skb->len; + c->_c.mfc_un.res.lastuse = jiffies; - if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; /* For an (*,G) entry, we only check that the incomming * interface is part of the static tree. */ - cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } @@ -2038,7 +1966,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - cache->mfc_un.res.wrong_if++; + c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2047,10 +1975,11 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || - cache->mfc_un.res.ttls[true_vifi] < 255) && + c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, - cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { - cache->mfc_un.res.last_assert = jiffies; + c->_c.mfc_un.res.last_assert + + MFC_ASSERT_THRESH)) { + c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; @@ -2061,33 +1990,33 @@ forward: mrt->vif_table[vif].bytes_in += skb->len; /* Forward the frame */ - if (cache->mfc_origin == htonl(INADDR_ANY) && - cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && - true_vifi != cache->mfc_parent && + true_vifi != c->_c.mfc_parent && ip_hdr(skb)->ttl > - cache->mfc_un.res.ttls[cache->mfc_parent]) { + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ - psend = cache->mfc_parent; + psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } - for (ct = cache->mfc_un.res.maxvif - 1; - ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ - if ((cache->mfc_origin != htonl(INADDR_ANY) || + if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) && - ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, - skb2, cache, psend); + skb2, c, psend); } psend = ct; } @@ -2099,9 +2028,9 @@ last_forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, - cache, psend); + c, psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); return; } } @@ -2299,62 +2228,6 @@ drop: } #endif -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm) -{ - struct rta_mfc_stats mfcs; - struct nlattr *mp_attr; - struct rtnexthop *nhp; - unsigned long lastuse; - int ct; - - /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) { - rtm->rtm_flags |= RTNH_F_UNRESOLVED; - return -ENOENT; - } - - if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) - return -EMSGSIZE; - - if (c->mfc_flags & MFC_OFFLOAD) - rtm->rtm_flags |= RTNH_F_OFFLOAD; - - if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) - return -EMSGSIZE; - - for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { - nla_nest_cancel(skb, mp_attr); - return -EMSGSIZE; - } - - nhp->rtnh_flags = 0; - nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; - nhp->rtnh_len = sizeof(*nhp); - } - } - - nla_nest_end(skb, mp_attr); - - lastuse = READ_ONCE(c->mfc_un.res.lastuse); - lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), - RTA_PAD)) - return -EMSGSIZE; - - rtm->rtm_type = RTN_MULTICAST; - return 1; -} - int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid) @@ -2412,7 +2285,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - err = __ipmr_fill_mroute(mrt, skb, cache, rtm); + err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); return err; @@ -2440,7 +2313,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - if (c->mfc_flags & MFC_STATIC) + if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; @@ -2449,7 +2322,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; - err = __ipmr_fill_mroute(mrt, skb, c, rtm); + err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2462,6 +2335,14 @@ nla_put_failure: return -EMSGSIZE; } +static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, int cmd, + int flags) +{ + return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c, + cmd, flags); +} + static size_t mroute_msgsize(bool unresolved, int maxvif) { size_t len = @@ -2490,7 +2371,8 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, + mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; @@ -2634,62 +2516,8 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - struct mr_table *mrt; - struct mfc_cache *mfc; - unsigned int t = 0, s_t; - unsigned int e = 0, s_e; - - s_t = cb->args[0]; - s_e = cb->args[1]; - - rcu_read_lock(); - ipmr_for_each_table(mrt, net) { - if (t < s_t) - goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { - if (e < s_e) - goto next_entry; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = 0; - s_e = 0; - - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) { - spin_unlock_bh(&mfc_unres_lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(&mfc_unres_lock); - e = 0; - s_e = 0; -next_table: - t++; - } -done: - rcu_read_unlock(); - - cb->args[1] = e; - cb->args[0] = t; - - return skb->len; + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, + _ipmr_fill_mroute, &mfc_unres_lock); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { @@ -2946,31 +2774,11 @@ out: /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ -struct ipmr_vif_iter { - struct seq_net_private p; - struct mr_table *mrt; - int ct; -}; - -static struct vif_device *ipmr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) -{ - struct mr_table *mrt = iter->mrt; - - for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - if (pos-- == 0) - return &mrt->vif_table[iter->ct]; - } - return NULL; -} static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -2981,26 +2789,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; read_lock(&mrt_lock); - return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_vif_iter *iter = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = iter->mrt; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ipmr_vif_seq_idx(net, iter, 0); - - while (++iter->ct < mrt->maxvif) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - return &mrt->vif_table[iter->ct]; - } - return NULL; + return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) @@ -3011,7 +2800,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { @@ -3019,7 +2808,8 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? vif->dev->name : "none"; + const char *name = vif->dev ? + vif->dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", @@ -3033,7 +2823,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, - .next = ipmr_vif_seq_next, + .next = mr_vif_seq_next, .stop = ipmr_vif_seq_stop, .show = ipmr_vif_seq_show, }; @@ -3041,7 +2831,7 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + sizeof(struct mr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { @@ -3051,40 +2841,8 @@ static const struct file_operations ipmr_vif_fops = { .release = seq_release_net, }; -struct ipmr_mfc_iter { - struct seq_net_private p; - struct mr_table *mrt; - struct list_head *cache; -}; - -static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, - struct ipmr_mfc_iter *it, loff_t pos) -{ - struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc; - - rcu_read_lock(); - it->cache = &mrt->mfc_cache_list; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - if (pos-- == 0) - return mfc; - rcu_read_unlock(); - - spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc_unres_queue; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - spin_unlock_bh(&mfc_unres_lock); - - it->cache = NULL; - return NULL; -} - - static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { - struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -3092,54 +2850,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) if (!mrt) return ERR_PTR(-ENOENT); - it->mrt = mrt; - it->cache = NULL; - return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc = v; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(net, seq->private, 0); - - if (mfc->list.next != it->cache) - return list_entry(mfc->list.next, struct mfc_cache, list); - - if (it->cache == &mrt->mfc_unres_queue) - goto end_of_list; - - /* exhausted cache_array, show unresolved */ - rcu_read_unlock(); - it->cache = &mrt->mfc_unres_queue; - - spin_lock_bh(&mfc_unres_lock); - if (!list_empty(it->cache)) - return list_first_entry(it->cache, struct mfc_cache, list); - -end_of_list: - spin_unlock_bh(&mfc_unres_lock); - it->cache = NULL; - - return NULL; -} - -static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) -{ - struct ipmr_mfc_iter *it = seq->private; - struct mr_table *mrt = it->mrt; - - if (it->cache == &mrt->mfc_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_list) - rcu_read_unlock(); + return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -3151,26 +2862,26 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) "Group Origin Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc_cache *mfc = v; - const struct ipmr_mfc_iter *it = seq->private; + const struct mr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, - mfc->mfc_parent); + mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->mfc_un.res.pkt, - mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); - for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++) { + mfc->_c.mfc_un.res.pkt, + mfc->_c.mfc_un.res.bytes, + mfc->_c.mfc_un.res.wrong_if); + for (n = mfc->_c.mfc_un.res.minvif; + n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && - mfc->mfc_un.res.ttls[n] < 255) + mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", - n, mfc->mfc_un.res.ttls[n]); + n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain @@ -3185,15 +2896,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, + .next = mr_mfc_seq_next, + .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; static int ipmr_mfc_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + sizeof(struct mr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { @@ -3220,37 +2931,8 @@ static unsigned int ipmr_seq_read(struct net *net) static int ipmr_dump(struct net *net, struct notifier_block *nb) { - struct mr_table *mrt; - int err; - - err = ipmr_rules_dump(net, nb); - if (err) - return err; - - ipmr_for_each_table(mrt, net) { - struct vif_device *v = &mrt->vif_table[0]; - struct mfc_cache *mfc; - int vifi; - - /* Notifiy on table VIF entries */ - read_lock(&mrt_lock); - for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { - if (!v->dev) - continue; - - call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); - } - read_unlock(&mrt_lock); - - /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - call_ipmr_mfc_entry_notifier(nb, net, - FIB_EVENT_ENTRY_ADD, mfc, - mrt->id); - } - - return 0; + return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, + ipmr_mr_table_iter, &mrt_lock); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c new file mode 100644 index 000000000000..4fe97723b53f --- /dev/null +++ b/net/ipv4/ipmr_base.c @@ -0,0 +1,365 @@ +/* Linux multicast routing support + * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation + */ + +#include <linux/mroute_base.h> + +/* Sets everything common except 'dev', since that is done under locking */ +void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask) +{ + v->dev = NULL; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->rate_limit = rate_limit; + v->flags = flags; + v->threshold = threshold; + if (v->flags & get_iflink_mask) + v->link = dev_get_iflink(dev); + else + v->link = dev->ifindex; +} +EXPORT_SYMBOL(vif_device_init); + +struct mr_table * +mr_table_alloc(struct net *net, u32 id, + struct mr_table_ops *ops, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)) +{ + struct mr_table *mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (!mrt) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + mrt->ops = *ops; + rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); + INIT_LIST_HEAD(&mrt->mfc_unres_queue); + + timer_setup(&mrt->ipmr_expire_timer, expire_func, 0); + + mrt->mroute_reg_vif_num = -1; + table_set(mrt, net); + return mrt; +} +EXPORT_SYMBOL(mr_table_alloc); + +void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mfc_parent) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_parent); + +void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any, + *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_any_parent); + +void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c, *proxy; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } + + return mr_mfc_find_any_parent(mrt, vifi); +} +EXPORT_SYMBOL(mr_mfc_find_any); + +#ifdef CONFIG_PROC_FS +void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) +{ + struct mr_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_idx); + +void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_next); + +void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos) +{ + struct mr_table *mrt = it->mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + if (pos-- == 0) + return mfc; + rcu_read_unlock(); + + spin_lock_bh(it->lock); + it->cache = &mrt->mfc_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(it->lock); + + it->cache = NULL; + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_idx); + +void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct mr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; + struct mr_mfc *c = v; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return mr_mfc_seq_idx(net, seq->private, 0); + + if (c->list.next != it->cache) + return list_entry(c->list.next, struct mr_mfc, list); + + if (it->cache == &mrt->mfc_unres_queue) + goto end_of_list; + + /* exhausted cache_array, show unresolved */ + rcu_read_unlock(); + it->cache = &mrt->mfc_unres_queue; + + spin_lock_bh(it->lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mr_mfc, list); + +end_of_list: + spin_unlock_bh(it->lock); + it->cache = NULL; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_next); +#endif + +int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm) +{ + struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + unsigned long lastuse; + int ct; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; + return -ENOENT; + } + + if (VIF_EXISTS(mrt, c->mfc_parent) && + nla_put_u32(skb, RTA_IIF, + mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + return -EMSGSIZE; + + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + + mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp_attr) + return -EMSGSIZE; + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + struct vif_device *vif; + + nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); + if (!nhp) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + vif = &mrt->vif_table[ct]; + nhp->rtnh_ifindex = vif->dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + + nla_nest_end(skb, mp_attr); + + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), + RTA_PAD)) + return -EMSGSIZE; + + rtm->rtm_type = RTN_MULTICAST; + return 1; +} +EXPORT_SYMBOL(mr_fill_mroute); + +int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock) +{ + unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1]; + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { + if (t < s_t) + goto next_table; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto next_entry; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + goto done; +next_entry: + e++; + } + e = 0; + s_e = 0; + + spin_lock_bh(lock); + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(lock); + e = 0; + s_e = 0; +next_table: + t++; + } +done: + rcu_read_unlock(); + + cb->args[1] = e; + cb->args[0] = t; + + return skb->len; +} +EXPORT_SYMBOL(mr_rtm_dumproute); + +int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock) +{ + struct mr_table *mrt; + int err; + + err = rules_dump(net, nb); + if (err) + return err; + + for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { + struct vif_device *v = &mrt->vif_table[0]; + struct mr_mfc *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + mr_call_vif_notifier(nb, net, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + mr_call_mfc_notifier(nb, net, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id); + } + + return 0; +} +EXPORT_SYMBOL(mr_dump); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index dfe6fa4ea554..280048e1e395 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -34,7 +34,7 @@ config NF_SOCKET_IPV4 if NF_TABLES config NF_TABLES_IPV4 - tristate "IPv4 nf_tables support" + bool "IPv4 nf_tables support" help This option enables the IPv4 support for nf_tables. @@ -71,7 +71,7 @@ config NFT_FIB_IPV4 endif # NF_TABLES_IPV4 config NF_TABLES_ARP - tristate "ARP nf_tables support" + bool "ARP nf_tables support" select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 9bd19cd18849..7523ddb2566b 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o @@ -47,7 +46,6 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o -obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # flow table support obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e3e420f3ba7b..2dc83de53f94 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -334,11 +334,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last * big jump. */ @@ -560,16 +555,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; @@ -781,7 +769,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(NFPROTO_ARP, info->number); + ret = xt_compat_init_offsets(NFPROTO_ARP, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -895,7 +885,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; @@ -925,6 +915,8 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -939,7 +931,6 @@ static int __do_replace(struct net *net, const char *name, net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: @@ -1167,7 +1158,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, struct compat_arpt_entry *iter0; struct arpt_replace repl; unsigned int size; - int ret = 0; + int ret; info = *pinfo; entry0 = *pentry0; @@ -1176,7 +1167,9 @@ static int translate_compat_table(struct xt_table_info **pinfo, j = 0; xt_compat_lock(NFPROTO_ARP); - xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e38395a8dcf2..44b308d93ec2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -402,11 +402,6 @@ mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last big jump. */ do { @@ -707,16 +702,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_INET_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; @@ -945,7 +933,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(AF_INET, info->number); + ret = xt_compat_init_offsets(AF_INET, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1057,7 +1047,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ipt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; @@ -1087,6 +1077,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -1100,7 +1092,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: @@ -1418,7 +1409,9 @@ translate_compat_table(struct net *net, j = 0; xt_compat_lock(AF_INET); - xt_compat_init_offsets(AF_INET, compatr->num_entries); + ret = xt_compat_init_offsets(AF_INET, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 8a8ae61cea71..2c8d313ae216 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -250,7 +250,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); - c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, + c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f75fc6b53115..690b17ef6a44 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_conntrack_ecache.h> static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, @@ -384,6 +385,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -392,8 +395,10 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -403,6 +408,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index a787d07f6cb7..7c6c20eaf4db 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -47,7 +47,7 @@ static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) */ pr_debug("Dropping evil AH tinygram.\n"); par->hotdrop = true; - return 0; + return false; } return spi_match(ahinfo->spis[0], ahinfo->spis[1], diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c deleted file mode 100644 index 036c074736b0..000000000000 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/netfilter_arp.h> -#include <net/netfilter/nf_tables.h> - -static unsigned int -nft_do_chain_arp(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_arp = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_ARP, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT), - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - }, -}; - -static int __init nf_tables_arp_init(void) -{ - return nft_register_chain_type(&filter_arp); -} - -static void __exit nf_tables_arp_exit(void) -{ - nft_unregister_chain_type(&filter_arp); -} - -module_init(nf_tables_arp_init); -module_exit(nf_tables_arp_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c deleted file mode 100644 index 96f955496d5f..000000000000 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_tables.h> -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/netfilter/nf_tables_ipv4.h> - -static unsigned int nft_do_chain_ipv4(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nf_chain_type filter_ipv4 = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; - -static int __init nf_tables_ipv4_init(void) -{ - return nft_register_chain_type(&filter_ipv4); -} - -static void __exit nf_tables_ipv4_exit(void) -{ - nft_unregister_chain_type(&filter_ipv4); -} - -module_init(nf_tables_ipv4_init); -module_exit(nf_tables_ipv4_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f2a490981594..b5464a3f253b 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -67,7 +67,17 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv, return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } -static const struct nf_chain_type nft_chain_nat_ipv4 = { +static int nft_nat_ipv4_init(struct nft_ctx *ctx) +{ + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_nat_ipv4_free(struct nft_ctx *ctx) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static const struct nft_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV4, @@ -82,15 +92,13 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = { [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, }, + .init = nft_nat_ipv4_init, + .free = nft_nat_ipv4_free, }; static int __init nft_chain_nat_init(void) { - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv4); - if (err < 0) - return err; + nft_register_chain_type(&nft_chain_nat_ipv4); return 0; } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index d965c225b9f6..7d82934c46f4 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -58,7 +58,7 @@ static unsigned int nf_route_table_hook(void *priv, return ret; } -static const struct nf_chain_type nft_chain_route_ipv4 = { +static const struct nft_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV4, @@ -71,7 +71,9 @@ static const struct nf_chain_type nft_chain_route_ipv4 = { static int __init nft_chain_route_init(void) { - return nft_register_chain_type(&nft_chain_route_ipv4); + nft_register_chain_type(&nft_chain_route_ipv4); + + return 0; } static void __exit nft_chain_route_exit(void) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b8f0db54b197..05e47d777009 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1177,7 +1177,7 @@ static struct ping_seq_afinfo ping_v4_seq_afinfo = { int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) { struct proc_dir_entry *p; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) return -ENOMEM; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index dc5edc8f7564..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,7 +54,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem; int orphans, sockets; orphans = percpu_counter_sum_positive(&tcp_orphan_count); @@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - frag_mem = ip_frag_mem(net); - seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG: inuse %u memory %lu\n", + atomic_read(&net->ipv4.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv4.frags)); return 0; } @@ -521,12 +521,12 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { - if (!proc_create("sockstat", S_IRUGO, net->proc_net, + if (!proc_create("sockstat", 0444, net->proc_net, &sockstat_seq_fops)) goto out_sockstat; - if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops)) + if (!proc_create("netstat", 0444, net->proc_net, &netstat_seq_fops)) goto out_netstat; - if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops)) + if (!proc_create("snmp", 0444, net->proc_net, &snmp_seq_fops)) goto out_snmp; return 0; @@ -555,4 +555,3 @@ int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); } - diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9b367fc48d7d..1b4d3355624a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) /* * Raw sockets may have direct kernel references. Kill them. */ - rtnl_lock(); ip_ra_control(sk, 0, NULL); - rtnl_unlock(); sk_common_release(sk); } @@ -1142,7 +1140,7 @@ static const struct file_operations raw_seq_fops = { static __net_init int raw_init_net(struct net *net) { - if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops)) + if (!proc_create("raw", 0444, net->proc_net, &raw_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 299e247b2032..8322e479f299 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -379,12 +379,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + pde = proc_create("rt_cache", 0444, net->proc_net, &rt_cache_seq_fops); if (!pde) goto err1; - pde = proc_create("rt_cache", S_IRUGO, + pde = proc_create("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_fops); if (!pde) goto err2; @@ -1532,7 +1532,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_mtu_locked = 0; rt->rt_gateway = 0; rt->rt_uses_gateway = 0; - rt->rt_table_id = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1668,19 +1667,6 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } -static void set_lwt_redirect(struct rtable *rth) -{ - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } -} - /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1763,15 +1749,13 @@ rt_cache: } rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1787,44 +1771,45 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); + const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; - hash_keys->addrs.v4addrs.src = outer_iph->saddr; - hash_keys->addrs.v4addrs.dst = outer_iph->daddr; if (likely(outer_iph->protocol != IPPROTO_ICMP)) - return; + goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - return; + goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - return; + goto out; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB) - return; + goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - return; - hash_keys->addrs.v4addrs.src = inner_iph->saddr; - hash_keys->addrs.v4addrs.dst = inner_iph->daddr; + goto out; + + key_iph = inner_iph; +out: + hash_keys->addrs.v4addrs.src = key_iph->saddr; + hash_keys->addrs.v4addrs.dst = key_iph->daddr; } /* if skb is set it will be used and fl4 can be NULL */ -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, - const struct sk_buff *skb) +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, + const struct sk_buff *skb, struct flow_keys *flkeys) { - struct net *net = fi->fib_net; struct flow_keys hash_keys; u32 mhash; @@ -1848,15 +1833,20 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); - skb_flow_dissect_flow_keys(skb, &keys, flag); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; - hash_keys.ports.src = keys.ports.src; - hash_keys.ports.dst = keys.ports.dst; - hash_keys.basic.ip_proto = keys.basic.ip_proto; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1872,17 +1862,17 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, return mhash >> 1; } -EXPORT_SYMBOL_GPL(fib_multipath_hash); #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) + __be32 daddr, __be32 saddr, u32 tos, + struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, NULL, skb); + int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); } @@ -1908,13 +1898,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result *res) { struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flow_keys *flkeys = NULL, _flkeys; + struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; - struct flowi4 fl4; + int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - int err = -EINVAL; - struct net *net = dev_net(dev); + struct flowi4 fl4; bool do_cache; /* IP on this device is disabled. */ @@ -1973,6 +1964,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + + if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) + flkeys = &_flkeys; + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) @@ -1998,7 +1993,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; brd_input: @@ -2040,8 +2035,6 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { @@ -2270,8 +2263,6 @@ add: return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(out_slow_tot); @@ -2293,7 +2284,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); return rth; } @@ -2804,7 +2795,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) - table_id = rt->rt_table_id; + table_id = res.table ? res.table->tb_id : 0; if (rtm->rtm_flags & RTM_F_FIB_MATCH) { if (!res.fi) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 93e172118a94..4b195bac8ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,7 +400,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } @@ -520,22 +520,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "udp_rmem_min", - .data = &sysctl_udp_rmem_min, - .maxlen = sizeof(sysctl_udp_rmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, - { - .procname = "udp_wmem_min", - .data = &sysctl_udp_wmem_min, - .maxlen = sizeof(sysctl_udp_wmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, { } }; @@ -1167,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "udp_rmem_min", + .data = &init_net.ipv4.sysctl_udp_rmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, + { + .procname = "udp_wmem_min", + .data = &init_net.ipv4.sysctl_udp_wmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8b8059b7af4d..bccc4c270087 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -453,6 +453,7 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); + sk->sk_route_forced_caps = NETIF_F_GSO; } EXPORT_SYMBOL(tcp_init_sock); @@ -484,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. * @@ -553,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -897,7 +906,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; - if (!large_allowed || !sk_can_gso(sk)) + if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ @@ -993,7 +1002,9 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -1062,8 +1073,7 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - if (!(sk->sk_route_caps & NETIF_F_SG) || - !sk_check_csum_caps(sk)) + if (!(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ @@ -1102,27 +1112,11 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) +static int select_size(bool first_skb, bool zc) { - const struct tcp_sock *tp = tcp_sk(sk); - int tmp = tp->mss_cache; - - if (sg) { - if (zc) - return 0; - - if (sk_can_gso(sk)) { - tmp = linear_payload_sz(first_skb); - } else { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); - - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } - } - - return tmp; + if (zc) + return 0; + return linear_payload_sz(first_skb); } void tcp_free_fastopen_req(struct tcp_sock *tp) @@ -1187,7 +1181,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg, zc = false; + bool zc = false; long timeo; flags = msg->msg_flags; @@ -1205,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) uarg->zerocopy = 0; } @@ -1268,18 +1262,12 @@ restart: if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (msg_data_left(msg)) { int copy = 0; - int max = size_goal; skb = tcp_write_queue_tail(sk); - if (skb) { - if (skb->ip_summed == CHECKSUM_NONE) - max = mss_now; - copy = max - skb->len; - } + if (skb) + copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; @@ -1297,22 +1285,17 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(sk, sg, first_skb, zc); + linear = select_size(first_skb, zc); skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; process_backlog = true; - /* - * Check whether we can use HW checksum. - */ - if (sk_check_csum_caps(sk)) - skb->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); copy = size_goal; - max = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp isn't set to @@ -1343,7 +1326,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1396,7 +1379,7 @@ new_segment: goto out; } - if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) + if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { @@ -3058,8 +3041,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 3 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + 5 * nla_total_size(sizeof(u32)) + + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3088,6 +3071,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); return stats; } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a471f696e13c..158d105e76da 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -97,10 +97,9 @@ struct bbr { packet_conservation:1, /* use packet conservation? */ restore_cwnd:1, /* decided to revert cwnd to old value */ round_start:1, /* start of packet-timed tx->ack round? */ - tso_segs_goal:7, /* segments we want in each skb we send */ idle_restart:1, /* restarting after idle? */ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ - unused:5, + unused:12, lt_is_sampling:1, /* taking long-term ("LT") samples now? */ lt_rtt_cnt:7, /* round trips in long-term interval */ lt_use_bw:1; /* use lt_bw as our bw estimate? */ @@ -261,23 +260,25 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* Return count of segments we want in the skbs we send, or 0 for default. */ -static u32 bbr_tso_segs_goal(struct sock *sk) +/* override sysctl_tcp_min_tso_segs */ +static u32 bbr_min_tso_segs(struct sock *sk) { - struct bbr *bbr = inet_csk_ca(sk); - - return bbr->tso_segs_goal; + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } -static void bbr_set_tso_segs_goal(struct sock *sk) +static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - u32 min_segs; + u32 segs, bytes; + + /* Sort of tcp_tso_autosize() but ignoring + * driver provided sk_gso_max_size. + */ + bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), - 0x7FU); + return min(segs, 0x7FU); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -348,7 +349,7 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; /* Allow enough full-sized skbs in flight to utilize end systems. */ - cwnd += 3 * bbr->tso_segs_goal; + cwnd += 3 * bbr_tso_segs_goal(sk); /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ cwnd = (cwnd + 1) & ~1U; @@ -730,6 +731,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) bbr->mode = BBR_DRAIN; /* drain queue we created */ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + tcp_sk(sk)->snd_ssthresh = + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && tcp_packets_in_flight(tcp_sk(sk)) <= @@ -824,7 +827,6 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs) bw = bbr_bw(sk); bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); - bbr_set_tso_segs_goal(sk); bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); } @@ -834,7 +836,7 @@ static void bbr_init(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->prior_cwnd = 0; - bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; bbr->next_rtt_delivered = 0; bbr->prev_ca_state = TCP_CA_Open; @@ -887,7 +889,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) static u32 bbr_ssthresh(struct sock *sk) { bbr_save_cwnd(sk); - return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ + return tcp_sk(sk)->snd_ssthresh; } static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, @@ -936,7 +938,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .tso_segs_goal = bbr_tso_segs_goal, + .min_tso_segs = bbr_min_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ff6cd98ce8d5..367def6ddeda 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1358,9 +1358,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, int len; int in_sack; - if (!sk_can_gso(sk)) - goto fallback; - /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) @@ -5862,10 +5859,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { + bool req_stolen; + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true)) + if (!tcp_check_req(sk, skb, req, true, &req_stolen)) goto discard; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8ad397e285e..f70586b50838 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -561,16 +576,9 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v4_check(skb->len, saddr, daddr, - csum_partial(th, - th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } /* This routine computes an IPv4 TCP checksum. */ @@ -1672,6 +1680,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); + bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; @@ -1694,10 +1703,20 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } if (!nsk) { reqsk_put(req); + if (req_stolen) { + /* Another cpu got exclusive access to req + * and created a full blown socket. + * Try to feed this packet to this socket + * instead of discarding it. + */ + tcp_v4_restore_cb(skb); + sock_put(sk); + goto lookup; + } goto discard_and_relse; } if (nsk == sk) { @@ -2211,7 +2230,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) afinfo->seq_ops.next = tcp_seq_next; afinfo->seq_ops.stop = tcp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; @@ -2404,6 +2423,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a8384b0c11f8..57b5468b5139 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -332,6 +332,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_update_metrics(sk); tcp_done(sk); } +EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { @@ -578,7 +579,7 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen) + bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; @@ -785,6 +786,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); + *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6818042cd8a9..383cac0ff0ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1206,7 +1206,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -1335,21 +1335,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); - if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { - /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, - skb_put(buff, nsize), - nsize, 0); - - skb_trim(skb, len); - - skb->csum = csum_block_sub(skb->csum, buff->csum, len); - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb_split(skb, buff, len); - } + skb_split(skb, buff, len); - buff->ip_summed = skb->ip_summed; + buff->ip_summed = CHECKSUM_PARTIAL; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1715,8 +1703,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - int min_tso_segs) +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1732,7 +1720,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, return segs; } -EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1740,11 +1727,13 @@ EXPORT_SYMBOL(tcp_tso_autosize); static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + u32 min_tso, tso_segs; - if (!tso_segs) - tso_segs = tcp_tso_autosize(sk, mss_now, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + min_tso = ca_ops->min_tso_segs ? + ca_ops->min_tso_segs(sk) : + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + + tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } @@ -1902,7 +1891,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, tcp_skb_fragment_eor(skb, buff); - buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; + buff->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2135,7 +2124,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; - nskb->ip_summed = skb->ip_summed; + nskb->ip_summed = CHECKSUM_PARTIAL; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); @@ -2143,14 +2132,7 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - if (nskb->ip_summed) { - skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); - } else { - __wsum csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), - copy, 0); - nskb->csum = csum_block_add(nskb->csum, csum, len); - } + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); if (skb->len <= copy) { /* We've eaten all the data from this skb. @@ -2167,9 +2149,6 @@ static int tcp_mtu_probe(struct sock *sk) ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, - skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); @@ -2747,12 +2726,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_replace(sk, next_skb, skb); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; - - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index ec35eaa5c029..c0630013c1ae 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL(xfrm4_tunnel_deregister); for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ - + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e5ef7c38c934..24b5c59b1c53 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -122,12 +122,6 @@ EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); -int sysctl_udp_rmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_rmem_min); - -int sysctl_udp_wmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_wmem_min); - atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); @@ -1664,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2533,35 +2540,36 @@ int udp_abort(struct sock *sk, int err) EXPORT_SYMBOL_GPL(udp_abort); struct proto udp_prot = { - .name = "UDP", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udp_init_sock, - .destroy = udp_destroy_sock, - .setsockopt = udp_setsockopt, - .getsockopt = udp_getsockopt, - .sendmsg = udp_sendmsg, - .recvmsg = udp_recvmsg, - .sendpage = udp_sendpage, - .release_cb = ip4_datagram_release_cb, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v4_rehash, - .get_port = udp_v4_get_port, - .memory_allocated = &udp_memory_allocated, - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem = &sysctl_udp_wmem_min, - .sysctl_rmem = &sysctl_udp_rmem_min, - .obj_size = sizeof(struct udp_sock), - .h.udp_table = &udp_table, + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_lib_close, + .pre_connect = udp_pre_connect, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .release_cb = ip4_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, + .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), + .obj_size = sizeof(struct udp_sock), + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, + .compat_setsockopt = compat_udp_setsockopt, + .compat_getsockopt = compat_udp_getsockopt, #endif - .diag_destroy = udp_abort, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2679,7 +2687,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) afinfo->seq_ops.next = udp_seq_next; afinfo->seq_ops.stop = udp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; @@ -2830,6 +2838,26 @@ u32 udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); +static void __udp_sysctl_init(struct net *net) +{ + net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM; + net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM; + +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_udp_l3mdev_accept = 0; +#endif +} + +static int __net_init udp_sysctl_init(struct net *net) +{ + __udp_sysctl_init(net); + return 0; +} + +static struct pernet_operations __net_initdata udp_sysctl_ops = { + .init = udp_sysctl_init, +}; + void __init udp_init(void) { unsigned long limit; @@ -2842,8 +2870,7 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - sysctl_udp_rmem_min = SK_MEM_QUANTUM; - sysctl_udp_wmem_min = SK_MEM_QUANTUM; + __udp_sysctl_init(&init_net); /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; @@ -2853,4 +2880,7 @@ void __init udp_init(void) panic("UDP: failed to alloc udp_busylocks\n"); for (i = 0; i < (1U << udp_busylocks_log); i++) spin_lock_init(udp_busylocks + i); + + if (register_pernet_subsys(&udp_sysctl_ops)) + panic("UDP: failed to init sysctl parameters.\n"); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index fbebda67ac1b..d73a6d6652f6 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -101,7 +101,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; - xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); rt_add_uncached_list(&xdst->u.rt); @@ -382,4 +381,3 @@ void __init xfrm4_init(void) xfrm4_protocol_init(); register_pernet_subsys(&xfrm4_net_ops); } - |