summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2020-03-31 10:44:53 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2020-03-31 10:44:53 -0400
commitcf39d37539068d53e015d8b4f1dcf42c65306b0d (patch)
treec8dcbbc3a5c92beee82e83e2a2b597f328a919a6 /net
parent830948eb68265ac7f3f364aa9801550feafec0d6 (diff)
parent463050599742a89e0508355e626e032e8d0dab8d (diff)
downloadlinux-cf39d37539068d53e015d8b4f1dcf42c65306b0d.tar.bz2
Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm updates for Linux 5.7 - GICv4.1 support - 32bit host removal
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig1
-rw-r--r--net/bridge/br_device.c6
-rw-r--r--net/bridge/br_stp.c3
-rw-r--r--net/core/dev.c36
-rw-r--r--net/core/devlink.c38
-rw-r--r--net/core/fib_rules.c2
-rw-r--r--net/core/rtnetlink.c26
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/ethtool/bitset.c6
-rw-r--r--net/ethtool/bitset.h2
-rw-r--r--net/hsr/hsr_framereg.c3
-rw-r--r--net/ipv4/cipso_ipv4.c7
-rw-r--r--net/ipv4/tcp_input.c6
-rw-r--r--net/ipv4/udp.c6
-rw-r--r--net/ipv6/ip6_fib.c7
-rw-r--r--net/ipv6/ip6_gre.c8
-rw-r--r--net/ipv6/ip6_tunnel.c13
-rw-r--r--net/ipv6/ipv6_sockglue.c10
-rw-r--r--net/ipv6/route.c1
-rw-r--r--net/mac80211/mlme.c6
-rw-r--r--net/mac80211/rx.c2
-rw-r--r--net/mptcp/Kconfig1
-rw-r--r--net/mptcp/protocol.c54
-rw-r--r--net/mptcp/protocol.h4
-rw-r--r--net/netfilter/ipset/ip_set_core.c34
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h635
-rw-r--r--net/netfilter/nf_conntrack_core.c192
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c20
-rw-r--r--net/netfilter/nf_flow_table_offload.c6
-rw-r--r--net/netfilter/nft_set_pipapo.c12
-rw-r--r--net/netfilter/xt_hashlimit.c38
-rw-r--r--net/netlabel/netlabel_domainhash.c3
-rw-r--r--net/netlabel/netlabel_unlabeled.c3
-rw-r--r--net/netlink/af_netlink.c5
-rw-r--r--net/netlink/genetlink.c5
-rw-r--r--net/openvswitch/datapath.c9
-rw-r--r--net/openvswitch/flow_netlink.c18
-rw-r--r--net/openvswitch/flow_table.c6
-rw-r--r--net/openvswitch/meter.c3
-rw-r--r--net/openvswitch/vport.c3
-rw-r--r--net/rds/rdma.c24
-rw-r--r--net/sched/act_api.c1
-rw-r--r--net/sched/cls_flower.c1
-rw-r--r--net/sctp/sm_statefuns.c29
-rw-r--r--net/smc/af_smc.c25
-rw-r--r--net/smc/smc_core.c12
-rw-r--r--net/smc/smc_core.h2
-rw-r--r--net/smc/smc_ib.c2
-rw-r--r--net/tls/tls_device.c20
-rw-r--r--net/unix/af_unix.c4
-rw-r--r--net/vmw_vsock/af_vsock.c20
-rw-r--r--net/vmw_vsock/hyperv_transport.c3
-rw-r--r--net/vmw_vsock/virtio_transport_common.c2
-rw-r--r--net/wireless/nl80211.c5
-rw-r--r--net/wireless/reg.c2
-rw-r--r--net/xdp/xsk.c2
-rw-r--r--net/xdp/xsk_queue.h3
57 files changed, 954 insertions, 449 deletions
diff --git a/net/Kconfig b/net/Kconfig
index b0937a700f01..2eeb0e55f7c9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -189,7 +189,6 @@ config BRIDGE_NETFILTER
depends on NETFILTER_ADVANCED
select NETFILTER_FAMILY_BRIDGE
select SKB_EXTENSIONS
- default m
---help---
Enabling this option will let arptables resp. iptables see bridged
ARP resp. IP traffic. If you want a bridging firewall, you probably
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index dc3d2c1dd9d5..0e3dbc5f3c34 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -34,7 +34,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
const struct nf_br_ops *nf_ops;
u8 state = BR_STATE_FORWARDING;
const unsigned char *dest;
- struct ethhdr *eth;
u16 vid = 0;
rcu_read_lock();
@@ -54,15 +53,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
skb_reset_mac_header(skb);
- eth = eth_hdr(skb);
skb_pull(skb, ETH_HLEN);
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state))
goto out;
if (IS_ENABLED(CONFIG_INET) &&
- (eth->h_proto == htons(ETH_P_ARP) ||
- eth->h_proto == htons(ETH_P_RARP)) &&
+ (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) ||
+ eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) &&
br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
br_do_proxy_suppress_arp(skb, br, vid, NULL);
} else if (IS_ENABLED(CONFIG_IPV6) &&
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 6856a6d9282b..1f14b8455345 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -63,7 +63,8 @@ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
{
struct net_bridge_port *p;
- list_for_each_entry_rcu(p, &br->port_list, list) {
+ list_for_each_entry_rcu(p, &br->port_list, list,
+ lockdep_is_held(&br->lock)) {
if (p->port_no == port_no)
return p;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index a6316b336128..c6c985fe7b1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,7 +146,6 @@
#include "net-sysfs.h"
#define MAX_GRO_SKBS 8
-#define MAX_NEST_DEV 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -331,6 +330,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
name_node = netdev_name_node_lookup(net, name);
if (!name_node)
return -ENOENT;
+ /* lookup might have found our primary name or a name belonging
+ * to another device.
+ */
+ if (name_node == dev->name_node || name_node->dev != dev)
+ return -EINVAL;
+
__netdev_name_node_alt_destroy(name_node);
return 0;
@@ -3071,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev,
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
while (unlikely(hash >= qcount))
hash -= qcount;
return hash + qoffset;
@@ -3657,26 +3664,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) &&
- qdisc_run_begin(q)) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
- &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- goto end_run;
- }
- qdisc_bstats_cpu_update(q, skb);
-
- rc = NET_XMIT_SUCCESS;
- if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
- __qdisc_run(q);
-
-end_run:
- qdisc_run_end(q);
- } else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- qdisc_run(q);
- }
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ qdisc_run(q);
if (unlikely(to_free))
kfree_skb_list(to_free);
@@ -7201,8 +7190,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev,
return 0;
}
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
@@ -7214,6 +7203,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
return lower->dev;
}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
static u8 __netdev_upper_depth(struct net_device *dev)
{
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 549ee56b7a21..5e220809844c 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2103,11 +2103,11 @@ err_action_values_put:
static struct devlink_dpipe_table *
devlink_dpipe_table_find(struct list_head *dpipe_tables,
- const char *table_name)
+ const char *table_name, struct devlink *devlink)
{
struct devlink_dpipe_table *table;
-
- list_for_each_entry_rcu(table, dpipe_tables, list) {
+ list_for_each_entry_rcu(table, dpipe_tables, list,
+ lockdep_is_held(&devlink->lock)) {
if (!strcmp(table->name, table_name))
return table;
}
@@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
return -EINVAL;
@@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink,
struct devlink_dpipe_table *table;
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
return -EINVAL;
@@ -6854,7 +6854,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
rcu_read_lock();
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
enabled = false;
if (table)
enabled = table->counters_enabled;
@@ -6878,26 +6878,34 @@ int devlink_dpipe_table_register(struct devlink *devlink,
void *priv, bool counter_control_extern)
{
struct devlink_dpipe_table *table;
-
- if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
- return -EEXIST;
+ int err = 0;
if (WARN_ON(!table_ops->size_get))
return -EINVAL;
+ mutex_lock(&devlink->lock);
+
+ if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
+ devlink)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
table = kzalloc(sizeof(*table), GFP_KERNEL);
- if (!table)
- return -ENOMEM;
+ if (!table) {
+ err = -ENOMEM;
+ goto unlock;
+ }
table->name = table_name;
table->table_ops = table_ops;
table->priv = priv;
table->counter_control_extern = counter_control_extern;
- mutex_lock(&devlink->lock);
list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+unlock:
mutex_unlock(&devlink->lock);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -6914,7 +6922,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
mutex_lock(&devlink->lock);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
goto unlock;
list_del_rcu(&table->list);
@@ -7071,7 +7079,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
mutex_lock(&devlink->lock);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table) {
err = -EINVAL;
goto out;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 3e7e15278c46..bd7eba9066f8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -974,7 +974,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh = nlmsg_data(nlh);
frh->family = ops->family;
- frh->table = rule->table;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
if (nla_put_u32(skb, FRA_TABLE, rule->table))
goto nla_put_failure;
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 09c44bf2e1d2..e1152f4ffe33 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3504,27 +3504,25 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
if (err)
return err;
- alt_ifname = nla_data(attr);
+ alt_ifname = nla_strdup(attr, GFP_KERNEL);
+ if (!alt_ifname)
+ return -ENOMEM;
+
if (cmd == RTM_NEWLINKPROP) {
- alt_ifname = kstrdup(alt_ifname, GFP_KERNEL);
- if (!alt_ifname)
- return -ENOMEM;
err = netdev_name_node_alt_create(dev, alt_ifname);
- if (err) {
- kfree(alt_ifname);
- return err;
- }
+ if (!err)
+ alt_ifname = NULL;
} else if (cmd == RTM_DELLINKPROP) {
err = netdev_name_node_alt_destroy(dev, alt_ifname);
- if (err)
- return err;
} else {
- WARN_ON(1);
- return 0;
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
}
- *changed = true;
- return 0;
+ kfree(alt_ifname);
+ if (!err)
+ *changed = true;
+ return err;
}
static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 864cb9e9622f..e1101a4f90a6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -467,7 +467,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -527,7 +526,6 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -4805,9 +4803,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
typeof(IPPROTO_IP) proto,
unsigned int off)
{
- switch (proto) {
- int err;
+ int err;
+ switch (proto) {
case IPPROTO_TCP:
err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
off + MAX_TCP_HDR_LEN);
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index fce45dac4205..ef9197541cb3 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -305,7 +305,8 @@ nla_put_failure:
static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
[ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT },
[ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG },
- [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32,
+ ETHNL_MAX_BITSET_SIZE),
[ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED },
[ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY },
[ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY },
@@ -447,7 +448,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
"mask only allowed in compact bitset");
return -EINVAL;
}
+
no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ if (no_mask)
+ ethnl_bitmap32_clear(bitmap, 0, nbits, mod);
nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
bool old_val, new_val;
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
index b8247e34109d..b849f9d19676 100644
--- a/net/ethtool/bitset.h
+++ b/net/ethtool/bitset.h
@@ -3,6 +3,8 @@
#ifndef _NET_ETHTOOL_BITSET_H
#define _NET_ETHTOOL_BITSET_H
+#define ETHNL_MAX_BITSET_SIZE S16_MAX
+
typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 364ea2cc028e..3ba7f61be107 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -155,7 +155,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
new_node->seq_out[i] = seq_out;
spin_lock_bh(&hsr->list_lock);
- list_for_each_entry_rcu(node, node_db, mac_list) {
+ list_for_each_entry_rcu(node, node_db, mac_list,
+ lockdep_is_held(&hsr->list_lock)) {
if (ether_addr_equal(node->macaddress_A, addr))
goto out;
if (ether_addr_equal(node->macaddress_B, addr))
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 376882215919..0bd10a1f477f 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1724,6 +1724,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
+ int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
return;
@@ -1735,7 +1736,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
memset(opt, 0, sizeof(struct ip_options));
opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
- if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL))
+ rcu_read_lock();
+ res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
return;
if (gateway)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 316ebdf8151d..6b6b57000dad 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6124,7 +6124,11 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
struct request_sock *req;
- tcp_try_undo_loss(sk, false);
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
+ */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
tcp_sk(sk)->retrans_stamp = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db76b9609299..08a41f1e1cd2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1857,8 +1857,12 @@ int __udp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
sock_rps_reset_rxhash(sk);
sk->sk_bound_dev_if = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
inet_reset_saddr(sk);
+ if (sk->sk_prot->rehash &&
+ (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ sk->sk_prot->rehash(sk);
+ }
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
sk->sk_prot->unhash(sk);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 58fbde244381..72abf892302f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1102,8 +1102,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
found++;
break;
}
- if (rt_can_ecmp)
- fallback_ins = fallback_ins ?: ins;
+ fallback_ins = fallback_ins ?: ins;
goto next_iter;
}
@@ -1146,7 +1145,9 @@ next_iter:
}
if (fallback_ins && !found) {
- /* No ECMP-able route found, replace first non-ECMP one */
+ /* No matching route with same ecmp-able-ness found, replace
+ * first matching route
+ */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 55bfc5149d0c..781ca8c07a0d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -437,8 +437,6 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return -ENOENT;
switch (type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
@@ -452,7 +450,10 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
break;
}
return 0;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if (code == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -468,6 +469,7 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
t->parms.name);
}
return 0;
+ }
case ICMPV6_PKT_TOOBIG:
ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5d65436ad5ad..4703b09808d0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -517,8 +517,6 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
err = 0;
switch (*type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 mtu, teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
@@ -531,7 +529,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if ((*code) == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -548,7 +549,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
t->parms.name);
}
break;
- case ICMPV6_PKT_TOOBIG:
+ }
+ case ICMPV6_PKT_TOOBIG: {
+ __u32 mtu;
+
ip6_update_pmtu(skb, net, htonl(*info), 0, 0,
sock_net_uid(net, NULL));
mtu = *info - offset;
@@ -562,6 +566,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
+ }
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 79fc012dd2ca..debdaeba5d8c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -183,9 +183,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = -EBUSY;
break;
}
- } else if (sk->sk_protocol != IPPROTO_TCP)
+ } else if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_prot != &tcpv6_prot) {
+ retv = -EBUSY;
+ break;
+ }
break;
-
+ } else {
+ break;
+ }
if (sk->sk_state != TCP_ESTABLISHED) {
retv = -ENOTCONN;
break;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4fbdc60b4e07..2931224b674e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5198,6 +5198,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
nhn++;
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e041af2f021a..88d7a692a965 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2959,7 +2959,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
(auth_transaction == 2 &&
ifmgd->auth_data->expected_transaction == 2)) {
if (!ieee80211_mark_sta_auth(sdata, bssid))
- goto out_err;
+ return; /* ignore frame -- wait for timeout */
} else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
auth_transaction == 2) {
sdata_info(sdata, "SAE peer confirmed\n");
@@ -2967,10 +2967,6 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
}
cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
- return;
- out_err:
- mutex_unlock(&sdata->local->sta_mtx);
- /* ignore frame -- wait for timeout */
}
#define case_WLAN(type) \
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0e05ff037672..0ba98ad9bc85 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4114,7 +4114,7 @@ void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
lockdep_assert_held(&local->sta_mtx);
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ list_for_each_entry(sta, &local->sta_list, list) {
if (sdata != sta->sdata &&
(!sta->sdata->bss || sta->sdata->bss != sdata->bss))
continue;
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index 49f6054e7f4e..a9ed3bf1d93f 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -4,6 +4,7 @@ config MPTCP
depends on INET
select SKB_EXTENSIONS
select CRYPTO_LIB_SHA256
+ select CRYPTO
help
Multipath TCP (MPTCP) connections send and receive data over multiple
subflows in order to utilize multiple network paths. Each subflow
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 030dee668e0a..3c19a8efdcea 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -543,6 +543,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
}
}
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ return 0;
+}
+
static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -551,6 +556,7 @@ static int __mptcp_init_sock(struct sock *sk)
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
msk->first = NULL;
+ inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
return 0;
}
@@ -755,60 +761,50 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int ret = -EOPNOTSUPP;
struct socket *ssock;
- struct sock *ssk;
pr_debug("msk=%p", msk);
/* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not defined.
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when TCP socket options are passed through
+ * to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- release_sock(sk);
- return ret;
- }
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock)
+ return tcp_setsockopt(ssock->sk, level, optname, optval,
+ optlen);
- ssk = ssock->sk;
- sock_hold(ssk);
release_sock(sk);
- ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
- sock_put(ssk);
-
- return ret;
+ return -EOPNOTSUPP;
}
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *option)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int ret = -EOPNOTSUPP;
struct socket *ssock;
- struct sock *ssk;
pr_debug("msk=%p", msk);
- /* @@ the meaning of getsockopt() when the socket is connected and
- * there are multiple subflows is not defined.
+ /* @@ the meaning of setsockopt() when the socket is connected and
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when socket options are passed through
+ * to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- release_sock(sk);
- return ret;
- }
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock)
+ return tcp_getsockopt(ssock->sk, level, optname, optval,
+ option);
- ssk = ssock->sk;
- sock_hold(ssk);
release_sock(sk);
- ret = tcp_getsockopt(ssk, level, optname, optval, option);
- sock_put(ssk);
-
- return ret;
+ return -EOPNOTSUPP;
}
static int mptcp_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 8a99a2930284..9f8663b30456 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -56,8 +56,8 @@
#define MPTCP_DSS_FLAG_MASK (0x1F)
/* MPTCP socket flags */
-#define MPTCP_DATA_READY BIT(0)
-#define MPTCP_SEND_SPACE BIT(1)
+#define MPTCP_DATA_READY 0
+#define MPTCP_SEND_SPACE 1
/* MPTCP connection sock */
struct mptcp_sock {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 69c107f9ba8d..8dd17589217d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
return set;
}
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_unlock_bh(&set->lock);
+}
+
int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
@@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
if (ret == -EAGAIN) {
/* Type requests element to be completed */
pr_debug("element must be completed, ADD is triggered\n");
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
ret = 1;
} else {
/* --return-nomatch: invert matched element */
@@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set)
{
pr_debug("set: %s\n", set->name);
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->flush(set);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
}
static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
@@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
retried = true;
} while (ret == -EAGAIN &&
set->variant->resize &&
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7480ce55b5c8..e52d7b7597a0 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -7,13 +7,21 @@
#include <linux/rcupdate.h>
#include <linux/jhash.h>
#include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/ipset/ip_set.h>
-#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
-#define ipset_dereference_protected(p, set) \
- __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
-
-#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+#define __ipset_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ipset_dereference_nfnl(p) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+#define ipset_dereference_set(p, set) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+ lockdep_is_held(&(set)->lock))
+#define ipset_dereference_bh_nfnl(p) \
+ rcu_dereference_bh_check(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* Hashing which uses arrays to resolve clashing. The hash table is resized
* (doubled) when searching becomes too long.
@@ -72,11 +80,35 @@ struct hbucket {
__aligned(__alignof__(u64));
};
+/* Region size for locking == 2^HTABLE_REGION_BITS */
+#define HTABLE_REGION_BITS 10
+#define ahash_numof_locks(htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 1 \
+ : jhash_size((htable_bits) - HTABLE_REGION_BITS))
+#define ahash_sizeof_regions(htable_bits) \
+ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
+#define ahash_region(n, htable_bits) \
+ ((n) % ahash_numof_locks(htable_bits))
+#define ahash_bucket_start(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 0 \
+ : (h) * jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_end(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \
+ : ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
+
+struct htable_gc {
+ struct delayed_work dwork;
+ struct ip_set *set; /* Set the gc belongs to */
+ u32 region; /* Last gc run position */
+};
+
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
atomic_t ref; /* References for resizing */
- atomic_t uref; /* References for dumping */
+ atomic_t uref; /* References for dumping and gc */
u8 htable_bits; /* size of hash table == 2^htable_bits */
+ u32 maxelem; /* Maxelem per region */
+ struct ip_set_region *hregion; /* Region locks and ext sizes */
struct hbucket __rcu *bucket[0]; /* hashtable buckets */
};
@@ -162,6 +194,10 @@ htable_bits(u32 hashsize)
#define NLEN 0
#endif /* IP_SET_HASH_WITH_NETS */
+#define SET_ELEM_EXPIRED(set, d) \
+ (SET_WITH_TIMEOUT(set) && \
+ ip_set_timeout_expired(ext_timeout(d, set)))
+
#endif /* _IP_SET_HASH_GEN_H */
#ifndef MTYPE
@@ -205,10 +241,12 @@ htable_bits(u32 hashsize)
#undef mtype_test_cidrs
#undef mtype_test
#undef mtype_uref
-#undef mtype_expire
#undef mtype_resize
+#undef mtype_ext_size
+#undef mtype_resize_ad
#undef mtype_head
#undef mtype_list
+#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
#undef mtype_variant
@@ -247,10 +285,12 @@ htable_bits(u32 hashsize)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
#define mtype_test IPSET_TOKEN(MTYPE, _test)
#define mtype_uref IPSET_TOKEN(MTYPE, _uref)
-#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size)
+#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
@@ -275,8 +315,7 @@ htable_bits(u32 hashsize)
/* The generic hash structure */
struct htype {
struct htable __rcu *table; /* the hash table */
- struct timer_list gc; /* garbage collection when timeout enabled */
- struct ip_set *set; /* attached to this ip_set */
+ struct htable_gc gc; /* gc workqueue */
u32 maxelem; /* max elements in the hash */
u32 initval; /* random jhash init value */
#ifdef IP_SET_HASH_WITH_MARKMASK
@@ -288,21 +327,33 @@ struct htype {
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
+ struct list_head ad; /* Resize add|del backlist */
struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_NETS
struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
#endif
};
+/* ADD|DEL entries saved during resize */
+struct mtype_resize_ad {
+ struct list_head list;
+ enum ipset_adt ad; /* ADD|DEL element */
+ struct mtype_elem d; /* Element value */
+ struct ip_set_ext ext; /* Extensions for ADD */
+ struct ip_set_ext mext; /* Target extensions for ADD */
+ u32 flags; /* Flags for ADD */
+};
+
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
* sized networks. cidr == real cidr + 1 to support /0.
*/
static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
int i, j;
+ spin_lock_bh(&set->lock);
/* Add in increasing prefix order, so larger cidr first */
for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
if (j != -1) {
@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
j = i;
} else if (h->nets[i].cidr[n] == cidr) {
h->nets[CIDR_POS(cidr)].nets[n]++;
- return;
+ goto unlock;
}
}
if (j != -1) {
@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
}
h->nets[i].cidr[n] = cidr;
h->nets[CIDR_POS(cidr)].nets[n] = 1;
+unlock:
+ spin_unlock_bh(&set->lock);
}
static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
u8 i, j, net_end = NLEN - 1;
+ spin_lock_bh(&set->lock);
for (i = 0; i < NLEN; i++) {
if (h->nets[i].cidr[n] != cidr)
continue;
h->nets[CIDR_POS(cidr)].nets[n]--;
if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
- return;
+ goto unlock;
for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
h->nets[j].cidr[n] = 0;
- return;
+ goto unlock;
}
+unlock:
+ spin_unlock_bh(&set->lock);
}
#endif
@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
static size_t
mtype_ahash_memsize(const struct htype *h, const struct htable *t)
{
- return sizeof(*h) + sizeof(*t);
+ return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
}
/* Get the ith element from the array block n */
@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set)
struct htype *h = set->data;
struct htable *t;
struct hbucket *n;
- u32 i;
-
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
- if (!n)
- continue;
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set, n);
- /* FIXME: use slab cache */
- rcu_assign_pointer(hbucket(t, i), NULL);
- kfree_rcu(n, rcu);
+ u32 r, i;
+
+ t = ipset_dereference_nfnl(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ }
+ t->hregion[r].ext_size = 0;
+ t->hregion[r].elements = 0;
+ spin_unlock_bh(&t->hregion[r].lock);
}
#ifdef IP_SET_HASH_WITH_NETS
memset(h->nets, 0, sizeof(h->nets));
#endif
- set->elements = 0;
- set->ext_size = 0;
}
/* Destroy the hashtable part of the set */
@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
kfree(n);
}
+ ip_set_free(t->hregion);
ip_set_free(t);
}
@@ -414,28 +476,21 @@ static void
mtype_destroy(struct ip_set *set)
{
struct htype *h = set->data;
+ struct list_head *l, *lt;
if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&h->gc);
+ cancel_delayed_work_sync(&h->gc.dwork);
- mtype_ahash_destroy(set,
- __ipset_dereference_protected(h->table, 1), true);
+ mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+ list_for_each_safe(l, lt, &h->ad) {
+ list_del(l);
+ kfree(l);
+ }
kfree(h);
set->data = NULL;
}
-static void
-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
-{
- struct htype *h = set->data;
-
- timer_setup(&h->gc, gc, 0);
- mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
- pr_debug("gc initialized, run in every %u\n",
- IPSET_GC_PERIOD(set->timeout));
-}
-
static bool
mtype_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
-/* Delete expired elements from the hashtable */
static void
-mtype_expire(struct ip_set *set, struct htype *h)
+mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
{
- struct htable *t;
struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h)
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
+ u8 htable_bits = t->htable_bits;
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, htable_bits);
+ i < ahash_bucket_end(r, htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
for (j = 0, d = 0; j < n->pos; j++) {
@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h)
smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, k)),
k);
#endif
+ t->hregion[r].elements--;
ip_set_ext_destroy(set, data);
- set->elements--;
d++;
}
if (d >= AHASH_INIT_SIZE) {
if (d >= n->size) {
+ t->hregion[r].ext_size -=
+ ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, i), NULL);
kfree_rcu(n, rcu);
continue;
}
tmp = kzalloc(sizeof(*tmp) +
- (n->size - AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
- /* Still try to delete expired elements */
+ /* Still try to delete expired elements. */
continue;
tmp->size = n->size - AHASH_INIT_SIZE;
for (j = 0, d = 0; j < n->pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
- memcpy(tmp->value + d * dsize, data, dsize);
+ memcpy(tmp->value + d * dsize,
+ data, dsize);
set_bit(d, tmp->used);
d++;
}
tmp->pos = d;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, i), tmp);
kfree_rcu(n, rcu);
}
}
+ spin_unlock_bh(&t->hregion[r].lock);
}
static void
-mtype_gc(struct timer_list *t)
+mtype_gc(struct work_struct *work)
{
- struct htype *h = from_timer(h, t, gc);
- struct ip_set *set = h->set;
+ struct htable_gc *gc;
+ struct ip_set *set;
+ struct htype *h;
+ struct htable *t;
+ u32 r, numof_locks;
+ unsigned int next_run;
+
+ gc = container_of(work, struct htable_gc, dwork.work);
+ set = gc->set;
+ h = set->data;
- pr_debug("called\n");
spin_lock_bh(&set->lock);
- mtype_expire(set, h);
+ t = ipset_dereference_set(h->table, set);
+ atomic_inc(&t->uref);
+ numof_locks = ahash_numof_locks(t->htable_bits);
+ r = gc->region++;
+ if (r >= numof_locks) {
+ r = gc->region = 0;
+ }
+ next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
+ if (next_run < HZ/10)
+ next_run = HZ/10;
spin_unlock_bh(&set->lock);
- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&h->gc);
+ mtype_gc_do(set, h, t, r);
+
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by expire: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
+
+}
+
+static void
+mtype_gc_init(struct htable_gc *gc)
+{
+ INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags);
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags);
+
/* Resize a hash: create a new hash table with doubling the hashsize
* and inserting the elements to it. Repeat until we succeed or
* fail due to memory pressures.
@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t extsize, dsize = set->dsize;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried)
struct mtype_elem *data;
struct mtype_elem *d;
struct hbucket *n, *m;
- u32 i, j, key;
+ struct list_head *l, *lt;
+ struct mtype_resize_ad *x;
+ u32 i, j, r, nr, key;
int ret;
#ifdef IP_SET_HASH_WITH_NETS
@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried)
if (!tmp)
return -ENOMEM;
#endif
- rcu_read_lock_bh();
- orig = rcu_dereference_bh_nfnl(h->table);
+ orig = ipset_dereference_bh_nfnl(h->table);
htable_bits = orig->htable_bits;
- rcu_read_unlock_bh();
retry:
ret = 0;
@@ -583,88 +680,124 @@ retry:
ret = -ENOMEM;
goto out;
}
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
+ if (!t->hregion) {
+ kfree(t);
+ ret = -ENOMEM;
+ goto out;
+ }
t->htable_bits = htable_bits;
+ t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
+ for (i = 0; i < ahash_numof_locks(htable_bits); i++)
+ spin_lock_init(&t->hregion[i].lock);
- spin_lock_bh(&set->lock);
- orig = __ipset_dereference_protected(h->table, 1);
- /* There can't be another parallel resizing, but dumping is possible */
+ /* There can't be another parallel resizing,
+ * but dumping, gc, kernel side add/del are possible
+ */
+ orig = ipset_dereference_bh_nfnl(h->table);
atomic_set(&orig->ref, 1);
atomic_inc(&orig->uref);
- extsize = 0;
pr_debug("attempt to resize set %s from %u to %u, t %p\n",
set->name, orig->htable_bits, htable_bits, orig);
- for (i = 0; i < jhash_size(orig->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(orig, i), 1);
- if (!n)
- continue;
- for (j = 0; j < n->pos; j++) {
- if (!test_bit(j, n->used))
+ for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
+ /* Expire may replace a hbucket with another one */
+ rcu_read_lock_bh();
+ for (i = ahash_bucket_start(r, orig->htable_bits);
+ i < ahash_bucket_end(r, orig->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(orig, i));
+ if (!n)
continue;
- data = ahash_data(n, j, dsize);
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ if (SET_ELEM_EXPIRED(set, data))
+ continue;
#ifdef IP_SET_HASH_WITH_NETS
- /* We have readers running parallel with us,
- * so the live data cannot be modified.
- */
- flags = 0;
- memcpy(tmp, data, dsize);
- data = tmp;
- mtype_data_reset_flags(data, &flags);
+ /* We have readers running parallel with us,
+ * so the live data cannot be modified.
+ */
+ flags = 0;
+ memcpy(tmp, data, dsize);
+ data = tmp;
+ mtype_data_reset_flags(data, &flags);
#endif
- key = HKEY(data, h->initval, htable_bits);
- m = __ipset_dereference_protected(hbucket(t, key), 1);
- if (!m) {
- m = kzalloc(sizeof(*m) +
+ key = HKEY(data, h->initval, htable_bits);
+ m = __ipset_dereference(hbucket(t, key));
+ nr = ahash_region(key, htable_bits);
+ if (!m) {
+ m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
GFP_ATOMIC);
- if (!m) {
- ret = -ENOMEM;
- goto cleanup;
- }
- m->size = AHASH_INIT_SIZE;
- extsize += ext_size(AHASH_INIT_SIZE, dsize);
- RCU_INIT_POINTER(hbucket(t, key), m);
- } else if (m->pos >= m->size) {
- struct hbucket *ht;
-
- if (m->size >= AHASH_MAX(h)) {
- ret = -EAGAIN;
- } else {
- ht = kzalloc(sizeof(*ht) +
+ if (!m) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ m->size = AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ RCU_INIT_POINTER(hbucket(t, key), m);
+ } else if (m->pos >= m->size) {
+ struct hbucket *ht;
+
+ if (m->size >= AHASH_MAX(h)) {
+ ret = -EAGAIN;
+ } else {
+ ht = kzalloc(sizeof(*ht) +
(m->size + AHASH_INIT_SIZE)
* dsize,
GFP_ATOMIC);
- if (!ht)
- ret = -ENOMEM;
+ if (!ht)
+ ret = -ENOMEM;
+ }
+ if (ret < 0)
+ goto cleanup;
+ memcpy(ht, m, sizeof(struct hbucket) +
+ m->size * dsize);
+ ht->size = m->size + AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ kfree(m);
+ m = ht;
+ RCU_INIT_POINTER(hbucket(t, key), ht);
}
- if (ret < 0)
- goto cleanup;
- memcpy(ht, m, sizeof(struct hbucket) +
- m->size * dsize);
- ht->size = m->size + AHASH_INIT_SIZE;
- extsize += ext_size(AHASH_INIT_SIZE, dsize);
- kfree(m);
- m = ht;
- RCU_INIT_POINTER(hbucket(t, key), ht);
- }
- d = ahash_data(m, m->pos, dsize);
- memcpy(d, data, dsize);
- set_bit(m->pos++, m->used);
+ d = ahash_data(m, m->pos, dsize);
+ memcpy(d, data, dsize);
+ set_bit(m->pos++, m->used);
+ t->hregion[nr].elements++;
#ifdef IP_SET_HASH_WITH_NETS
- mtype_data_reset_flags(d, &flags);
+ mtype_data_reset_flags(d, &flags);
#endif
+ }
}
+ rcu_read_unlock_bh();
}
- rcu_assign_pointer(h->table, t);
- set->ext_size = extsize;
- spin_unlock_bh(&set->lock);
+ /* There can't be any other writer. */
+ rcu_assign_pointer(h->table, t);
/* Give time to other readers of the set */
synchronize_rcu();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
- /* If there's nobody else dumping the table, destroy it */
+ /* Add/delete elements processed by the SET target during resize.
+ * Kernel-side add cannot trigger a resize and userspace actions
+ * are serialized by the mutex.
+ */
+ list_for_each_safe(l, lt, &h->ad) {
+ x = list_entry(l, struct mtype_resize_ad, list);
+ if (x->ad == IPSET_ADD) {
+ mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
+ } else {
+ mtype_del(set, &x->d, NULL, NULL, 0);
+ }
+ list_del(l);
+ kfree(l);
+ }
+ /* If there's nobody else using the table, destroy it */
if (atomic_dec_and_test(&orig->uref)) {
pr_debug("Table destroy by resize %p\n", orig);
mtype_ahash_destroy(set, orig, false);
@@ -677,15 +810,44 @@ out:
return ret;
cleanup:
+ rcu_read_unlock_bh();
atomic_set(&orig->ref, 0);
atomic_dec(&orig->uref);
- spin_unlock_bh(&set->lock);
mtype_ahash_destroy(set, t, false);
if (ret == -EAGAIN)
goto retry;
goto out;
}
+/* Get the current number of elements and ext_size in the set */
+static void
+mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
+{
+ struct htype *h = set->data;
+ const struct htable *t;
+ u32 i, j, r;
+ struct hbucket *n;
+ struct mtype_elem *data;
+
+ t = rcu_dereference_bh(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = rcu_dereference_bh(hbucket(t, i));
+ if (!n)
+ continue;
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, set->dsize);
+ if (!SET_ELEM_EXPIRED(set, data))
+ (*elements)++;
+ }
+ }
+ *ext_size += t->hregion[r].ext_size;
+ }
+}
+
/* Add an element to a hash and update the internal counters when succeeded,
* otherwise report the proper error code.
*/
@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n, *old = ERR_PTR(-ENOENT);
- int i, j = -1;
+ int i, j = -1, ret;
bool flag_exist = flags & IPSET_FLAG_EXIST;
bool deleted = false, forceadd = false, reuse = false;
- u32 key, multi = 0;
+ u32 r, key, multi = 0, elements, maxelem;
- if (set->elements >= h->maxelem) {
- if (SET_WITH_TIMEOUT(set))
- /* FIXME: when set is full, we slow down here */
- mtype_expire(set, h);
- if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ r = ahash_region(key, t->htable_bits);
+ atomic_inc(&t->uref);
+ elements = t->hregion[r].elements;
+ maxelem = t->maxelem;
+ if (elements >= maxelem) {
+ u32 e;
+ if (SET_WITH_TIMEOUT(set)) {
+ rcu_read_unlock_bh();
+ mtype_gc_do(set, h, t, r);
+ rcu_read_lock_bh();
+ }
+ maxelem = h->maxelem;
+ elements = 0;
+ for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
+ elements += t->hregion[e].elements;
+ if (elements >= maxelem && SET_WITH_FORCEADD(set))
forceadd = true;
}
+ rcu_read_unlock_bh();
- t = ipset_dereference_protected(h->table, set);
- key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n) {
- if (forceadd || set->elements >= h->maxelem)
+ if (forceadd || elements >= maxelem)
goto set_full;
old = NULL;
n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
n->size = AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
for (i = 0; i < n->pos; i++) {
@@ -737,38 +916,37 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi)) {
- if (flag_exist ||
- (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))) {
+ if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
/* Just the extensions could be overwritten */
j = i;
goto overwrite_extensions;
}
- return -IPSET_ERR_EXIST;
+ ret = -IPSET_ERR_EXIST;
+ goto unlock;
}
/* Reuse first timed out entry */
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)) &&
- j == -1) {
+ if (SET_ELEM_EXPIRED(set, data) && j == -1) {
j = i;
reuse = true;
}
}
if (reuse || forceadd) {
+ if (j == -1)
+ j = 0;
data = ahash_data(n, j, set->dsize);
if (!deleted) {
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, i)),
i);
#endif
ip_set_ext_destroy(set, data);
- set->elements--;
+ t->hregion[r].elements--;
}
goto copy_data;
}
- if (set->elements >= h->maxelem)
+ if (elements >= maxelem)
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
@@ -776,28 +954,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto resize;
}
old = n;
n = kzalloc(sizeof(*n) +
(old->size + AHASH_INIT_SIZE) * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
memcpy(n, old, sizeof(struct hbucket) +
old->size * set->dsize);
n->size = old->size + AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
}
copy_elem:
j = n->pos++;
data = ahash_data(n, j, set->dsize);
copy_data:
- set->elements++;
+ t->hregion[r].elements++;
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+ mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
#endif
memcpy(data, d, sizeof(struct mtype_elem));
overwrite_extensions:
@@ -820,13 +1002,41 @@ overwrite_extensions:
if (old)
kfree_rcu(old, rcu);
}
+ ret = 0;
+resize:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (atomic_read(&t->ref) && ext->target) {
+ /* Resize is in process and kernel side add, save values */
+ struct mtype_resize_ad *x;
+
+ x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
+ if (!x)
+ /* Don't bother */
+ goto out;
+ x->ad = IPSET_ADD;
+ memcpy(&x->d, value, sizeof(struct mtype_elem));
+ memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
+ memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
+ x->flags = flags;
+ spin_lock_bh(&set->lock);
+ list_add_tail(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ goto out;
- return 0;
set_full:
if (net_ratelimit())
pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- return -IPSET_ERR_HASH_FULL;
+ set->name, maxelem);
+ ret = -IPSET_ERR_HASH_FULL;
+unlock:
+ spin_unlock_bh(&t->hregion[r].lock);
+out:
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by add: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+ return ret;
}
/* Delete an element from the hash and free up space if possible.
@@ -840,13 +1050,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n;
- int i, j, k, ret = -IPSET_ERR_EXIST;
+ struct mtype_resize_ad *x = NULL;
+ int i, j, k, r, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
size_t dsize = set->dsize;
- t = ipset_dereference_protected(h->table, set);
+ /* Userspace add and resize is excluded by the mutex.
+ * Kernespace add does not trigger resize.
+ */
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ r = ahash_region(key, t->htable_bits);
+ atomic_inc(&t->uref);
+ rcu_read_unlock_bh();
+
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n)
goto out;
for (i = 0, k = 0; i < n->pos; i++) {
@@ -857,8 +1077,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
data = ahash_data(n, i, dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))
+ if (SET_ELEM_EXPIRED(set, data))
goto out;
ret = 0;
@@ -866,20 +1085,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
smp_mb__after_atomic();
if (i + 1 == n->pos)
n->pos--;
- set->elements--;
+ t->hregion[r].elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
- mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
- j);
+ mtype_del_cidr(set, h,
+ NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
#endif
ip_set_ext_destroy(set, data);
+ if (atomic_read(&t->ref) && ext->target) {
+ /* Resize is in process and kernel side del,
+ * save values
+ */
+ x = kzalloc(sizeof(struct mtype_resize_ad),
+ GFP_ATOMIC);
+ if (x) {
+ x->ad = IPSET_DEL;
+ memcpy(&x->d, value,
+ sizeof(struct mtype_elem));
+ x->flags = flags;
+ }
+ }
for (; i < n->pos; i++) {
if (!test_bit(i, n->used))
k++;
}
if (n->pos == 0 && k == 0) {
- set->ext_size -= ext_size(n->size, dsize);
+ t->hregion[r].ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
} else if (k >= AHASH_INIT_SIZE) {
@@ -898,7 +1130,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
tmp->pos = k;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, key), tmp);
kfree_rcu(n, rcu);
}
@@ -906,6 +1139,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
out:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (x) {
+ spin_lock_bh(&set->lock);
+ list_add(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by del: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
return ret;
}
@@ -991,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, ret = 0;
u32 key, multi = 0;
+ rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
#ifdef IP_SET_HASH_WITH_NETS
/* If we test an IP address and not a network address,
@@ -1022,6 +1266,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto out;
}
out:
+ rcu_read_unlock_bh();
return ret;
}
@@ -1033,23 +1278,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
const struct htable *t;
struct nlattr *nested;
size_t memsize;
+ u32 elements = 0;
+ size_t ext_size = 0;
u8 htable_bits;
- /* If any members have expired, set->elements will be wrong
- * mytype_expire function will update it with the right count.
- * we do not hold set->lock here, so grab it first.
- * set->elements can still be incorrect in the case of a huge set,
- * because elements might time out during the listing.
- */
- if (SET_WITH_TIMEOUT(set)) {
- spin_lock_bh(&set->lock);
- mtype_expire(set, h);
- spin_unlock_bh(&set->lock);
- }
-
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
- memsize = mtype_ahash_memsize(h, t) + set->ext_size;
+ t = rcu_dereference_bh(h->table);
+ mtype_ext_size(set, &elements, &ext_size);
+ memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
htable_bits = t->htable_bits;
rcu_read_unlock_bh();
@@ -1071,7 +1307,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
#endif
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
- nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -1091,15 +1327,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
if (start) {
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
+ t = ipset_dereference_bh_nfnl(h->table);
atomic_inc(&t->uref);
cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
rcu_read_unlock_bh();
} else if (cb->args[IPSET_CB_PRIVATE]) {
t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
- /* Resizing didn't destroy the hash table */
- pr_debug("Table destroy by dump: %p\n", t);
+ pr_debug("Table destroy after resize "
+ " by dump: %p\n", t);
mtype_ahash_destroy(set, t, false);
}
cb->args[IPSET_CB_PRIVATE] = 0;
@@ -1141,8 +1377,7 @@ mtype_list(const struct ip_set *set,
if (!test_bit(i, n->used))
continue;
e = ahash_data(n, i, set->dsize);
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ if (SET_ELEM_EXPIRED(set, e))
continue;
pr_debug("list hash %lu hbucket %p i %u, data %p\n",
cb->args[IPSET_CB_ARG0], n, i, e);
@@ -1208,6 +1443,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .region_lock = true,
};
#ifdef IP_SET_EMIT_CREATE
@@ -1226,6 +1462,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
size_t hsize;
struct htype *h;
struct htable *t;
+ u32 i;
pr_debug("Create set %s with family %s\n",
set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
@@ -1294,6 +1531,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
kfree(h);
return -ENOMEM;
}
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
+ if (!t->hregion) {
+ kfree(t);
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->gc.set = set;
+ for (i = 0; i < ahash_numof_locks(hbits); i++)
+ spin_lock_init(&t->hregion[i].lock);
h->maxelem = maxelem;
#ifdef IP_SET_HASH_WITH_NETMASK
h->netmask = netmask;
@@ -1304,9 +1550,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
get_random_bytes(&h->initval, sizeof(h->initval));
t->htable_bits = hbits;
+ t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
- h->set = set;
+ INIT_LIST_HEAD(&h->ad);
set->data = h;
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4) {
@@ -1329,12 +1576,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4)
#endif
- IPSET_TOKEN(HTYPE, 4_gc_init)(set,
- IPSET_TOKEN(HTYPE, 4_gc));
+ IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
#ifndef IP_SET_PROTO_UNDEF
else
- IPSET_TOKEN(HTYPE, 6_gc_init)(set,
- IPSET_TOKEN(HTYPE, 6_gc));
+ IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
#endif
}
pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d1305423640f..1927fc296f95 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -894,32 +894,175 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
}
-/* Resolve race on insertion if this protocol allows this. */
+static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
+{
+ struct nf_conn_tstamp *tstamp;
+
+ atomic_inc(&ct->ct_general.use);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_get_real_ns();
+}
+
+static int __nf_ct_resolve_clash(struct sk_buff *skb,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+
+ if (nf_ct_is_dying(ct))
+ return NF_DROP;
+
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ return NF_DROP;
+
+ if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+ nf_ct_match(ct, loser_ct)) {
+ struct net *net = nf_ct_net(ct);
+
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_ct_add_to_dying_list(loser_ct);
+ nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+
+ NF_CT_STAT_INC(net, insert_failed);
+ return NF_ACCEPT;
+ }
+
+ nf_ct_put(ct);
+ return NF_DROP;
+}
+
+/**
+ * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
+ *
+ * @skb: skb that causes the collision
+ * @repl_idx: hash slot for reply direction
+ *
+ * Called when origin or reply direction had a clash.
+ * The skb can be handled without packet drop provided the reply direction
+ * is unique or there the existing entry has the identical tuple in both
+ * directions.
+ *
+ * Caller must hold conntrack table locks to prevent concurrent updates.
+ *
+ * Returns NF_DROP if the clash could not be handled.
+ */
+static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
+{
+ struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct net *net;
+
+ zone = nf_ct_zone(loser_ct);
+ net = nf_ct_net(loser_ct);
+
+ /* Reply direction must never result in a clash, unless both origin
+ * and reply tuples are identical.
+ */
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
+ if (nf_ct_key_equal(h,
+ &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
+ return __nf_ct_resolve_clash(skb, h);
+ }
+
+ /* We want the clashing entry to go away real soon: 1 second timeout. */
+ loser_ct->timeout = nfct_time_stamp + HZ;
+
+ /* IPS_NAT_CLASH removes the entry automatically on the first
+ * reply. Also prevents UDP tracker from moving the entry to
+ * ASSURED state, i.e. the entry can always be evicted under
+ * pressure.
+ */
+ loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
+
+ __nf_conntrack_insert_prepare(loser_ct);
+
+ /* fake add for ORIGINAL dir: we want lookups to only find the entry
+ * already in the table. This also hides the clashing entry from
+ * ctnetlink iteration, i.e. conntrack -L won't show them.
+ */
+ hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+ hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+ &nf_conntrack_hash[repl_idx]);
+ return NF_ACCEPT;
+}
+
+/**
+ * nf_ct_resolve_clash - attempt to handle clash without packet drop
+ *
+ * @skb: skb that causes the clash
+ * @h: tuplehash of the clashing entry already in table
+ * @hash_reply: hash slot for reply direction
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple.
+ *
+ * If there is one, @skb (and the assocated, unconfirmed conntrack) has
+ * to be dropped. In case @skb is retransmitted, next conntrack lookup
+ * will find the already-existing entry.
+ *
+ * The major problem with such packet drop is the extra delay added by
+ * the packet loss -- it will take some time for a retransmit to occur
+ * (or the sender to time out when waiting for a reply).
+ *
+ * This function attempts to handle the situation without packet drop.
+ *
+ * If @skb has no NAT transformation or if the colliding entries are
+ * exactly the same, only the to-be-confirmed conntrack entry is discarded
+ * and @skb is associated with the conntrack entry already in the table.
+ *
+ * Failing that, the new, unconfirmed conntrack is still added to the table
+ * provided that the collision only occurs in the ORIGINAL direction.
+ * The new entry will be added after the existing one in the hash list,
+ * so packets in the ORIGINAL direction will continue to match the existing
+ * entry. The new entry will also have a fixed timeout so it expires --
+ * due to the collision, it will not see bidirectional traffic.
+ *
+ * Returns NF_DROP if the clash could not be resolved.
+ */
static __cold noinline int
-nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- struct nf_conntrack_tuple_hash *h)
+nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
+ u32 reply_hash)
{
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
const struct nf_conntrack_l4proto *l4proto;
- enum ip_conntrack_info oldinfo;
- struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+ struct net *net;
+ int ret;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+ net = nf_ct_net(loser_ct);
l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
- if (l4proto->allow_clash &&
- !nf_ct_is_dying(ct) &&
- atomic_inc_not_zero(&ct->ct_general.use)) {
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
- nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
- nf_ct_set(skb, ct, oldinfo);
- return NF_ACCEPT;
- }
- nf_ct_put(ct);
- }
+ if (!l4proto->allow_clash)
+ goto drop;
+
+ ret = __nf_ct_resolve_clash(skb, h);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+ ret = nf_ct_resolve_clash_harder(skb, reply_hash);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+drop:
+ nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
+ NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
}
@@ -932,7 +1075,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
- struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
@@ -989,6 +1131,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
if (unlikely(nf_ct_is_dying(ct))) {
nf_ct_add_to_dying_list(ct);
+ NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
@@ -1009,13 +1152,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
- /* set conntrack timestamp, if enabled. */
- tstamp = nf_conn_tstamp_find(ct);
- if (tstamp)
- tstamp->start = ktime_get_real_ns();
+ __nf_conntrack_insert_prepare(ct);
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
@@ -1035,11 +1173,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
out:
- nf_ct_add_to_dying_list(ct);
- ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+ ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 7365b43f8f98..760ca2422816 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb,
return false;
}
+static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
+ struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ u32 extra_jiffies)
+{
+ if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
+ ct->status & IPS_NAT_CLASH))
+ nf_ct_kill(ct);
+ else
+ nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
+}
+
/* Returns verdict for packet, and may modify conntracktype */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+ timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+ timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 83e1db37c3b0..06f00cdc3891 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -847,9 +847,6 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
{
int err;
- if (!nf_flowtable_hw_offload(flowtable))
- return 0;
-
if (!dev->netdev_ops->ndo_setup_tc)
return -EOPNOTSUPP;
@@ -876,6 +873,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
struct flow_block_offload bo;
int err;
+ if (!nf_flowtable_hw_offload(flowtable))
+ return 0;
+
err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
if (err < 0)
return err;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index f0cb1e13af50..4fc0c924ed5d 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -203,7 +203,7 @@
* ::
*
* rule indices in last field: 0 1
- * map to elements: 0x42 0x66
+ * map to elements: 0x66 0x42
*
*
* Matching
@@ -298,7 +298,7 @@
* ::
*
* rule indices in last field: 0 1
- * map to elements: 0x42 0x66
+ * map to elements: 0x66 0x42
*
* the matching element is at 0x42.
*
@@ -503,7 +503,7 @@ static int pipapo_refill(unsigned long *map, int len, int rules,
return -1;
}
- if (unlikely(match_only)) {
+ if (match_only) {
bitmap_clear(map, i, 1);
return i;
}
@@ -1766,11 +1766,13 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
- const u8 *data = (const u8 *)elem->key.val.data;
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m = priv->clone;
+ struct nft_pipapo_elem *e = elem->priv;
int rules_f0, first_rule = 0;
- struct nft_pipapo_elem *e;
+ const u8 *data;
+
+ data = (const u8 *)nft_set_ext_key(&e->ext);
e = pipapo_get(net, set, data, 0);
if (IS_ERR(e))
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index bccd47cd7190..8c835ad63729 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -36,6 +36,7 @@
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/mutex.h>
#include <linux/kernel.h>
+#include <linux/refcount.h>
#include <uapi/linux/netfilter/xt_hashlimit.h>
#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
@@ -114,7 +115,7 @@ struct dsthash_ent {
struct xt_hashlimit_htable {
struct hlist_node node; /* global list of all htables */
- int use;
+ refcount_t use;
u_int8_t family;
bool rnd_initialized;
@@ -315,7 +316,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
for (i = 0; i < hinfo->cfg.size; i++)
INIT_HLIST_HEAD(&hinfo->hash[i]);
- hinfo->use = 1;
+ refcount_set(&hinfo->use, 1);
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
@@ -401,15 +402,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
remove_proc_entry(hinfo->name, parent);
}
-static void htable_destroy(struct xt_hashlimit_htable *hinfo)
-{
- cancel_delayed_work_sync(&hinfo->gc_work);
- htable_remove_proc_entry(hinfo);
- htable_selective_cleanup(hinfo, true);
- kfree(hinfo->name);
- vfree(hinfo);
-}
-
static struct xt_hashlimit_htable *htable_find_get(struct net *net,
const char *name,
u_int8_t family)
@@ -420,7 +412,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
if (!strcmp(name, hinfo->name) &&
hinfo->family == family) {
- hinfo->use++;
+ refcount_inc(&hinfo->use);
return hinfo;
}
}
@@ -429,12 +421,16 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
static void htable_put(struct xt_hashlimit_htable *hinfo)
{
- mutex_lock(&hashlimit_mutex);
- if (--hinfo->use == 0) {
+ if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) {
hlist_del(&hinfo->node);
- htable_destroy(hinfo);
+ htable_remove_proc_entry(hinfo);
+ mutex_unlock(&hashlimit_mutex);
+
+ cancel_delayed_work_sync(&hinfo->gc_work);
+ htable_selective_cleanup(hinfo, true);
+ kfree(hinfo->name);
+ vfree(hinfo);
}
- mutex_unlock(&hashlimit_mutex);
}
/* The algorithm used is the Simple Token Bucket Filter (TBF)
@@ -837,6 +833,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3);
}
+#define HASHLIMIT_MAX_SIZE 1048576
+
static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
struct xt_hashlimit_htable **hinfo,
struct hashlimit_cfg3 *cfg,
@@ -847,6 +845,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
if (cfg->gc_interval == 0 || cfg->expire == 0)
return -EINVAL;
+ if (cfg->size > HASHLIMIT_MAX_SIZE) {
+ cfg->size = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("size too large, truncated to %u\n", cfg->size);
+ }
+ if (cfg->max > HASHLIMIT_MAX_SIZE) {
+ cfg->max = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("max too large, truncated to %u\n", cfg->max);
+ }
if (par->family == NFPROTO_IPV4) {
if (cfg->srcmask > 32 || cfg->dstmask > 32)
return -EINVAL;
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f5d34da0646e..a1f2320ecc16 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -143,7 +143,8 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
if (domain != NULL) {
bkt = netlbl_domhsh_hash(domain);
bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
- list_for_each_entry_rcu(iter, bkt_list, list)
+ list_for_each_entry_rcu(iter, bkt_list, list,
+ lockdep_is_held(&netlbl_domhsh_lock))
if (iter->valid &&
netlbl_family_match(iter->family, family) &&
strcmp(iter->domain, domain) == 0)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index d2e4ab8d1cb1..77bb1bb22c3b 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -207,7 +207,8 @@ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
bkt = netlbl_unlhsh_hash(ifindex);
bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
- list_for_each_entry_rcu(iter, bkt_list, list)
+ list_for_each_entry_rcu(iter, bkt_list, list,
+ lockdep_is_held(&netlbl_unlhsh_lock))
if (iter->valid && iter->ifindex == ifindex)
return iter;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4e31721e7293..edf3e285e242 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1014,7 +1014,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
if (nlk->netlink_bind && groups) {
int group;
- for (group = 0; group < nlk->ngroups; group++) {
+ /* nl_groups is a u32, so cap the maximum groups we can bind */
+ for (group = 0; group < BITS_PER_TYPE(u32); group++) {
if (!test_bit(group, &groups))
continue;
err = nlk->netlink_bind(net, group + 1);
@@ -1033,7 +1034,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
netlink_insert(sk, nladdr->nl_pid) :
netlink_autobind(sock);
if (err) {
- netlink_undo_bind(nlk->ngroups, groups, sk);
+ netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
goto unlock;
}
}
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 0522b2b1fd95..9f357aa22b94 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -497,8 +497,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
family->policy, validate, extack);
- if (err && parallel) {
- kfree(attrbuf);
+ if (err) {
+ if (parallel)
+ kfree(attrbuf);
return ERR_PTR(err);
}
return attrbuf;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 659c2a790fe7..c047afd12116 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -179,7 +179,8 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
struct hlist_head *head;
head = vport_hash_bucket(dp, port_no);
- hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
+ hlist_for_each_entry_rcu(vport, head, dp_hash_node,
+ lockdep_ovsl_is_held()) {
if (vport->port_no == port_no)
return vport;
}
@@ -2042,7 +2043,8 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp)
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
- hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
+ lockdep_ovsl_is_held()) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
@@ -2061,7 +2063,8 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
dp->max_headroom = new_headroom;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
- hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
+ lockdep_ovsl_is_held())
netdev_set_rx_headroom(vport->dev, new_headroom);
}
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 7da4230627f5..288122eec7c8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2708,10 +2708,6 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
switch (key_type) {
- const struct ovs_key_ipv4 *ipv4_key;
- const struct ovs_key_ipv6 *ipv6_key;
- int err;
-
case OVS_KEY_ATTR_PRIORITY:
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_CT_MARK:
@@ -2723,7 +2719,9 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
- case OVS_KEY_ATTR_TUNNEL:
+ case OVS_KEY_ATTR_TUNNEL: {
+ int err;
+
if (masked)
return -EINVAL; /* Masked tunnel set not supported. */
@@ -2732,8 +2730,10 @@ static int validate_set(const struct nlattr *a,
if (err)
return err;
break;
+ }
+ case OVS_KEY_ATTR_IPV4: {
+ const struct ovs_key_ipv4 *ipv4_key;
- case OVS_KEY_ATTR_IPV4:
if (eth_type != htons(ETH_P_IP))
return -EINVAL;
@@ -2753,8 +2753,10 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
}
break;
+ }
+ case OVS_KEY_ATTR_IPV6: {
+ const struct ovs_key_ipv6 *ipv6_key;
- case OVS_KEY_ATTR_IPV6:
if (eth_type != htons(ETH_P_IPV6))
return -EINVAL;
@@ -2781,7 +2783,7 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
-
+ }
case OVS_KEY_ATTR_TCP:
if ((eth_type != htons(ETH_P_IP) &&
eth_type != htons(ETH_P_IPV6)) ||
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 5904e93e5765..fd8a01ca7a2d 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -585,7 +585,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
head = find_bucket(ti, hash);
(*n_mask_hit)++;
- hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
+ hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
+ lockdep_ovsl_is_held()) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
@@ -769,7 +770,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
- hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
+ hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
+ lockdep_ovsl_is_held()) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 3323b79ff548..5010d1ddd4bd 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -61,7 +61,8 @@ static struct dp_meter *lookup_meter(const struct datapath *dp,
struct hlist_head *head;
head = meter_hash_bucket(dp, meter_id);
- hlist_for_each_entry_rcu(meter, head, dp_hash_node) {
+ hlist_for_each_entry_rcu(meter, head, dp_hash_node,
+ lockdep_ovsl_is_held()) {
if (meter->id == meter_id)
return meter;
}
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 5da9392b03d6..47febb4504f0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -96,7 +96,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct hlist_head *bucket = hash_bucket(net, name);
struct vport *vport;
- hlist_for_each_entry_rcu(vport, bucket, hash_node)
+ hlist_for_each_entry_rcu(vport, bucket, hash_node,
+ lockdep_ovsl_is_held())
if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 3341eee87bf9..585e6b3b69ce 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -162,10 +162,9 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
if (write)
gup_flags |= FOLL_WRITE;
- ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
+ ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
if (ret >= 0 && ret < nr_pages) {
- while (ret--)
- put_page(pages[ret]);
+ unpin_user_pages(pages, ret);
ret = -EFAULT;
}
@@ -300,8 +299,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* to release anything.
*/
if (!need_odp) {
- for (i = 0 ; i < nents; i++)
- put_page(sg_page(&sg[i]));
+ unpin_user_pages(pages, nr_pages);
kfree(sg);
}
ret = PTR_ERR(trans_private);
@@ -325,7 +323,12 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
if (cookie_ret)
*cookie_ret = cookie;
- if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ if (args->cookie_addr &&
+ put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
+ if (!need_odp) {
+ unpin_user_pages(pages, nr_pages);
+ kfree(sg);
+ }
ret = -EFAULT;
goto out;
}
@@ -496,9 +499,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
* is the case for a RDMA_READ which copies from remote
* to local memory
*/
- if (!ro->op_write)
- set_page_dirty(page);
- put_page(page);
+ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
}
}
@@ -515,8 +516,7 @@ void rds_atomic_free_op(struct rm_atomic_op *ao)
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- set_page_dirty(page);
- put_page(page);
+ unpin_user_pages_dirty_lock(&page, 1, true);
kfree(ao->op_notifier);
ao->op_notifier = NULL;
@@ -944,7 +944,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
return ret;
err:
if (page)
- put_page(page);
+ unpin_user_page(page);
rm->atomic.op_active = 0;
kfree(rm->atomic.op_notifier);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 90a31b15585f..8c466a712cda 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -186,6 +186,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+ cookie_len /* TCA_ACT_COOKIE */
+ nla_total_size(0) /* TCA_ACT_STATS nested */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
/* TCA_STATS_PKT64 */
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 7e54d2ab5254..d32d4233d337 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -305,6 +305,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
+ flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
fl_clear_masked_range(&skb_key, mask);
skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 748e3b19ec1d..6a16af4b1ef6 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -170,6 +170,16 @@ static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk,
return true;
}
+/* Check for format error in an ABORT chunk */
+static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk)
+{
+ struct sctp_errhdr *err;
+
+ sctp_walk_errors(err, chunk->chunk_hdr);
+
+ return (void *)err == (void *)chunk->chunk_end;
+}
+
/**********************************************************
* These are the state functions for handling chunk events.
**********************************************************/
@@ -2255,6 +2265,9 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
}
@@ -2298,6 +2311,9 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
/* Stop the T2-shutdown timer. */
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
@@ -2565,6 +2581,9 @@ enum sctp_disposition sctp_sf_do_9_1_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
}
@@ -2582,16 +2601,8 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort(
/* See if we have an error cause code in the chunk. */
len = ntohs(chunk->chunk_hdr->length);
- if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
- struct sctp_errhdr *err;
-
- sctp_walk_errors(err, chunk->chunk_hdr);
- if ((void *)err != (void *)chunk->chunk_end)
- return sctp_sf_pdiscard(net, ep, asoc, type, arg,
- commands);
-
+ if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
error = ((struct sctp_errhdr *)chunk->skb->data)->cause;
- }
sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
/* ASSOC_FAILED will DELETE_TCB. */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 90988a511cd5..6fd44bdb0fc3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -512,15 +512,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
int local_contact)
{
+ bool is_smcd = smc->conn.lgr->is_smcd;
+
if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(smc->conn.lgr);
- if (smc->conn.lgr->is_smcd)
+ smc_lgr_cleanup_early(&smc->conn);
+ else
+ smc_conn_free(&smc->conn);
+ if (is_smcd)
/* there is only one lgr role for SMC-D; use server lock */
mutex_unlock(&smc_server_lgr_pending);
else
mutex_unlock(&smc_client_lgr_pending);
- smc_conn_free(&smc->conn);
smc->connect_nonblock = 0;
return reason_code;
}
@@ -1091,7 +1094,6 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
if (newsmcsk->sk_state == SMC_INIT)
sock_put(&new_smc->sk); /* passive closing */
newsmcsk->sk_state = SMC_CLOSED;
- smc_conn_free(&new_smc->conn);
smc_listen_out(new_smc);
}
@@ -1102,12 +1104,13 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
{
/* RDMA setup failed, switch back to TCP */
if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
if (reason_code < 0) { /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
- smc_conn_free(&new_smc->conn);
smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
@@ -1170,16 +1173,18 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
new_smc->conn.lgr->vlan_id,
new_smc->conn.lgr->smcd)) {
if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_SMCDNOTALK;
}
/* Create send and receive buffers */
if (smc_buf_create(new_smc, true)) {
if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_MEM;
}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2249de5379ee..5b085efa3bce 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -162,6 +162,18 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
conn->lgr = NULL;
}
+void smc_lgr_cleanup_early(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+
+ smc_conn_free(conn);
+ smc_lgr_forget(lgr);
+ smc_lgr_schedule_free_work_fast(lgr);
+}
+
/* Send delete link, either as client to request the initiation
* of the DELETE LINK sequence from server; or as server to
* initiate the delete processing. See smc_llc_rx_delete_link().
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index c472e12951d1..234ae25f0025 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -296,6 +296,7 @@ struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
void smc_lgr_forget(struct smc_link_group *lgr);
+void smc_lgr_cleanup_early(struct smc_connection *conn);
void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
@@ -316,7 +317,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
void smc_conn_free(struct smc_connection *conn);
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
-void smcd_conn_free(struct smc_connection *conn);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
int smc_core_init(void);
void smc_core_exit(void);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 548632621f4b..d6ba186f67e2 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -573,6 +573,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
struct smc_ib_device *smcibdev;
smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+ if (!smcibdev || smcibdev->ibdev != ibdev)
+ return;
ib_set_client_data(ibdev, &smc_ib_client, NULL);
spin_lock(&smc_ib_devices.lock);
list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 1ba5a92832bb..1c5574e2e058 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -593,7 +593,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
u32 seq, u64 *p_record_sn)
{
u64 record_sn = context->hint_record_sn;
- struct tls_record_info *info;
+ struct tls_record_info *info, *last;
info = context->retransmit_hint;
if (!info ||
@@ -605,6 +605,24 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
struct tls_record_info, list);
if (!info)
return NULL;
+ /* send the start_marker record if seq number is before the
+ * tls offload start marker sequence number. This record is
+ * required to handle TCP packets which are before TLS offload
+ * started.
+ * And if it's not start marker, look if this seq number
+ * belongs to the list.
+ */
+ if (likely(!tls_record_is_start_marker(info))) {
+ /* we have the first record, get the last record to see
+ * if this seq number belongs to the list.
+ */
+ last = list_last_entry(&context->records_list,
+ struct tls_record_info, list);
+
+ if (!between(seq, tls_record_start_seq(info),
+ last->end_seq))
+ return NULL;
+ }
record_sn = context->unacked_record_sn;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 62c12cb5763e..68debcb28fa4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -682,6 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val)
return 0;
}
+#ifdef CONFIG_PROC_FS
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -692,6 +693,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds));
}
}
+#else
+#define unix_show_fdinfo NULL
+#endif
static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9c5b2a91baad..a5f28708e0e7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -451,6 +451,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
if (vsk->transport == new_transport)
return 0;
+ /* transport->release() must be called with sock lock acquired.
+ * This path can only be taken during vsock_stream_connect(),
+ * where we have already held the sock lock.
+ * In the other cases, this function is called on a new socket
+ * which is not assigned to any transport.
+ */
vsk->transport->release(vsk);
vsock_deassign_transport(vsk);
}
@@ -753,20 +759,18 @@ static void __vsock_release(struct sock *sk, int level)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- /* The release call is supposed to use lock_sock_nested()
- * rather than lock_sock(), if a sock lock should be acquired.
- */
- if (vsk->transport)
- vsk->transport->release(vsk);
- else if (sk->sk_type == SOCK_STREAM)
- vsock_remove_sock(vsk);
-
/* When "level" is SINGLE_DEPTH_NESTING, use the nested
* version to avoid the warning "possible recursive locking
* detected". When "level" is 0, lock_sock_nested(sk, level)
* is the same as lock_sock(sk).
*/
lock_sock_nested(sk, level);
+
+ if (vsk->transport)
+ vsk->transport->release(vsk);
+ else if (sk->sk_type == SOCK_STREAM)
+ vsock_remove_sock(vsk);
+
sock_orphan(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 3492c021925f..630b851f8150 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -526,12 +526,9 @@ static bool hvs_close_lock_held(struct vsock_sock *vsk)
static void hvs_release(struct vsock_sock *vsk)
{
- struct sock *sk = sk_vsock(vsk);
bool remove_sock;
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
remove_sock = hvs_close_lock_held(vsk);
- release_sock(sk);
if (remove_sock)
vsock_remove_sock(vsk);
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d9f0c9c5425a..f3c4bab2f737 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -829,7 +829,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
struct sock *sk = &vsk->sk;
bool remove_sock = true;
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (sk->sk_type == SOCK_STREAM)
remove_sock = virtio_transport_close(vsk);
@@ -837,7 +836,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
list_del(&pkt->list);
virtio_transport_free_pkt(pkt);
}
- release_sock(sk);
if (remove_sock)
vsock_remove_sock(vsk);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index cedf17d4933f..5b19e9fac4aa 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/nospec.h>
#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
#include <net/net_namespace.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
@@ -4800,8 +4801,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
err = nl80211_parse_he_obss_pd(
info->attrs[NL80211_ATTR_HE_OBSS_PD],
&params.he_obss_pd);
- if (err)
- return err;
+ goto out;
}
nl80211_calculate_ap_params(&params);
@@ -4823,6 +4823,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
}
wdev_unlock(wdev);
+out:
kfree(params.acl);
return err;
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index fff9a74891fc..1a8218f1bbe0 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2276,7 +2276,7 @@ static void handle_channel_custom(struct wiphy *wiphy,
break;
}
- if (IS_ERR(reg_rule)) {
+ if (IS_ERR_OR_NULL(reg_rule)) {
pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n",
chan->center_freq);
if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index df600487a68d..356f90e4522b 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -217,6 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
static void xsk_flush(struct xdp_sock *xs)
{
xskq_prod_submit(xs->rx);
+ __xskq_cons_release(xs->umem->fq);
sock_def_readable(&xs->sk);
}
@@ -304,6 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ __xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bec2af11853a..89a01ac4e079 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -271,7 +271,8 @@ static inline void xskq_cons_release(struct xsk_queue *q)
{
/* To improve performance, only update local state here.
* Reflect this to global state when we get new entries
- * from the ring in xskq_cons_get_entries().
+ * from the ring in xskq_cons_get_entries() and whenever
+ * Rx or Tx processing are completed in the NAPI loop.
*/
q->cached_cons++;
}