diff options
Diffstat (limited to 'kernel')
61 files changed, 3283 insertions, 3047 deletions
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); @@ -2097,8 +2119,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2453,6 +2479,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2543,10 +2579,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2565,6 +2613,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb07e74b34a2..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..47147c9e184d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aae10baf1902..4c1cf0969a80 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, } /** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * + * The returned css is not guaranteed to be online, and therefore it is the + * callers responsiblity to tryget a reference for it. + */ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + do { + css = cgroup_css(cgrp, ss); + + if (css) + return css; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + return init_css_set.subsys[ss->id]; +} + +/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest @@ -604,10 +633,11 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css_by_mask(cgrp, \ + cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1006,7 +1036,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css(cgrp, ss); + template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -2836,11 +2866,12 @@ restart: } /** - * cgroup_save_control - save control masks of a subtree + * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Save ->subtree_control and ->subtree_ss_mask to the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { @@ -2850,6 +2881,7 @@ static void cgroup_save_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + dsct->old_dom_cgrp = dsct->dom_cgrp; } } @@ -2875,11 +2907,12 @@ static void cgroup_propagate_control(struct cgroup *cgrp) } /** - * cgroup_restore_control - restore control masks of a subtree + * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { @@ -2889,6 +2922,7 @@ static void cgroup_restore_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + dsct->dom_cgrp = dsct->old_dom_cgrp; } } @@ -3019,7 +3053,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css() results reflect the new csses + * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ @@ -3196,6 +3230,8 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); @@ -3225,12 +3261,13 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) */ cgroup_save_control(cgrp); - cgrp->dom_cgrp = dom_cgrp; + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) + if (dsct == cgrp || cgroup_is_threaded(dsct)) + dsct->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; - else - cgrp->dom_cgrp = cgrp; cgroup_finalize_control(cgrp, ret); return ret; diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..e82920b8bee1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -315,6 +315,16 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +static void lockdep_acquire_cpus_lock(void) +{ + rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); +} + +static void lockdep_release_cpus_lock(void) +{ + rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); +} + /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects @@ -344,6 +354,17 @@ void cpu_hotplug_enable(void) cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); + +#else + +static void lockdep_acquire_cpus_lock(void) +{ +} + +static void lockdep_release_cpus_lock(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_HOTPLUG_SMT @@ -362,6 +383,7 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { + pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } } @@ -616,6 +638,12 @@ static void cpuhp_thread_fun(unsigned int cpu) */ smp_mb(); + /* + * The BP holds the hotplug lock, but we're now running on the AP, + * ensure that anybody asserting the lock is held, will actually find + * it so. + */ + lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { @@ -661,6 +689,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } cpuhp_lock_release(bringup); + lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9bd54304446f..645c7a2ecde8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -13,6 +13,9 @@ config NEED_DMA_MAP_STATE config ARCH_DMA_ADDR_T_64BIT def_bool 64BIT || PHYS_ADDR_T_64BIT +config ARCH_HAS_DMA_COHERENCE_H + bool + config HAVE_GENERIC_DMA_COHERENT bool @@ -23,22 +26,22 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU bool select NEED_DMA_MAP_STATE -config DMA_DIRECT_OPS +config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool - depends on HAS_DMA -config DMA_NONCOHERENT_OPS +config ARCH_HAS_DMA_COHERENT_TO_PFN bool - depends on HAS_DMA - select DMA_DIRECT_OPS -config DMA_NONCOHERENT_MMAP +config ARCH_HAS_DMA_MMAP_PGPROT bool - depends on DMA_NONCOHERENT_OPS + +config DMA_DIRECT_OPS + bool + depends on HAS_DMA config DMA_NONCOHERENT_CACHE_SYNC bool - depends on DMA_NONCOHERENT_OPS + depends on DMA_DIRECT_OPS config DMA_VIRT_OPS bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 6de44e4eb454..7d581e4eea4a 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -4,7 +4,6 @@ obj-$(CONFIG_HAS_DMA) += mapping.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o -obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 286d82329eb0..b2a87905846d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -49,7 +49,11 @@ static phys_addr_t limit_cmdline; static int __init early_cma(char *p) { - pr_debug("%s(%s)\n", __func__, p); + if (!p) { + pr_err("Config string not provided\n"); + return -EINVAL; + } + size_cmdline = memparse(p, &p); if (*p != '@') return 0; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index c007d25bee09..231ca4628062 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1312,6 +1312,22 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg) #endif } +void debug_dma_map_single(struct device *dev, const void *addr, + unsigned long len) +{ + if (unlikely(dma_debug_disabled())) + return; + + if (!virt_addr_valid(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n", + addr, len); + + if (is_vmalloc_addr(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n", + addr, len); +} +EXPORT_SYMBOL(debug_dma_map_single); + void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, bool map_single) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index de87b0282e74..87a6bc2a96c0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,13 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DMA operations that map physical memory directly without using an IOMMU or - * flushing caches. + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without using an IOMMU. */ +#include <linux/bootmem.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-direct.h> #include <linux/scatterlist.h> #include <linux/dma-contiguous.h> +#include <linux/dma-noncoherent.h> #include <linux/pfn.h> #include <linux/set_memory.h> @@ -41,40 +44,83 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return false; } - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); + "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", + caller, &dma_addr, size, + *dev->dma_mask, dev->bus_dma_mask); } return false; } return true; } +static inline dma_addr_t phys_to_dma_direct(struct device *dev, + phys_addr_t phys) +{ + if (force_dma_unencrypted()) + return __phys_to_dma(dev, phys); + return phys_to_dma(dev, phys); +} + +u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + + if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) + max_dma = dev->bus_dma_mask; + + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; +} + +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) + dma_mask = dev->bus_dma_mask; + + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); + + /* + * Optimistically try the zone that the physical address mask falls + * into first. If that returns memory that isn't actually addressable + * we will fallback to the next lower zone and try again. + * + * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding + * zones. + */ + if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return GFP_DMA; + if (*phys_mask <= DMA_BIT_MASK(32)) + return GFP_DMA32; + return 0; +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - dma_addr_t addr = force_dma_unencrypted() ? - __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; + return phys_to_dma_direct(dev, phys) + size - 1 <= + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); } -void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +void *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; int page_order = get_order(size); struct page *page = NULL; + u64 phys_mask; void *ret; + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; + /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - + gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { @@ -93,15 +139,14 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && + phys_mask < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { + phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -124,7 +169,7 @@ again: * NOTE: this function must never look at the dma_addr argument, because we want * to be able to use it as a helper for iommu implementations as well. */ -void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, +void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -136,14 +181,96 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, free_pages((unsigned long)cpu_addr, page_order); } +void *dma_direct_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + if (!dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); +} + +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) +{ + if (!dev_is_dma_coherent(dev)) + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + else + dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); +} + +static void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + if (dev_is_dma_coherent(dev)) + return; + arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + if (dev_is_dma_coherent(dev)) + return; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); +} + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) +static void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + if (dev_is_dma_coherent(dev)) + return; + arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); + arch_sync_dma_for_cpu_all(dev); +} + +static void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + if (dev_is_dma_coherent(dev)) + return; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu_all(dev); +} + +static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); +} + +static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir); +} +#endif + dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); if (!check_addr(dev, dma_addr, size, __func__)) return DIRECT_MAPPING_ERROR; + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_device(dev, dma_addr, size, dir); return dma_addr; } @@ -162,31 +289,29 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, sg_dma_len(sg) = sg->length; } + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_sg_for_device(dev, sgl, nents, dir); return nents; } +/* + * Because 32-bit DMA masks are so common we expect every architecture to be + * able to satisfy them - either by not supporting more physical memory, or by + * providing a ZONE_DMA32. If neither is the case, the architecture needs to + * use an IOMMU instead of the direct mapping. + */ int dma_direct_supported(struct device *dev, u64 mask) { -#ifdef CONFIG_ZONE_DMA - if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))) - return 0; -#else - /* - * Because 32-bit DMA masks are so common we expect every architecture - * to be able to satisfy them - either by not supporting more physical - * memory, or by providing a ZONE_DMA32. If neither is the case, the - * architecture needs to use an IOMMU instead of the direct mapping. - */ - if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) - return 0; -#endif - /* - * Upstream PCI/PCIe bridges or SoC interconnects may not carry - * as many DMA address bits as the device itself supports. - */ - if (dev->bus_dma_mask && mask > dev->bus_dma_mask) - return 0; - return 1; + u64 min_mask; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + else + min_mask = DMA_BIT_MASK(32); + + min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + + return mask >= phys_to_dma(dev, min_mask); } int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) @@ -199,7 +324,20 @@ const struct dma_map_ops dma_direct_ops = { .free = dma_direct_free, .map_page = dma_direct_map_page, .map_sg = dma_direct_map_sg, +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_device = dma_direct_sync_sg_for_device, +#endif +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .unmap_page = dma_direct_unmap_page, + .unmap_sg = dma_direct_unmap_sg, +#endif + .get_required_mask = dma_direct_get_required_mask, .dma_supported = dma_direct_supported, .mapping_error = dma_direct_mapping_error, + .cache_sync = arch_dma_cache_sync, }; EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d2a92ddaac4d..58dec7a92b7b 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,7 +7,7 @@ */ #include <linux/acpi.h> -#include <linux/dma-mapping.h> +#include <linux/dma-noncoherent.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/of_device.h> @@ -202,17 +202,26 @@ EXPORT_SYMBOL(dmam_release_declared_memory); * Create scatter-list for the already allocated DMA buffer. */ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size) + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) { - struct page *page = virt_to_page(cpu_addr); + struct page *page; int ret; - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (unlikely(ret)) - return ret; + if (!dev_is_dma_coherent(dev)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) + return -ENXIO; - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return 0; + page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr, + dma_addr)); + } else { + page = virt_to_page(cpu_addr); + } + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; } EXPORT_SYMBOL(dma_common_get_sgtable); @@ -220,27 +229,37 @@ EXPORT_SYMBOL(dma_common_get_sgtable); * Create userspace mapping for the DMA-coherent memory. */ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) { - int ret = -ENXIO; #ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; + unsigned long pfn; + int ret = -ENXIO; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; - if (off < count && user_count <= (count - off)) - ret = remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + off, - user_count << PAGE_SHIFT, - vma->vm_page_prot); -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ + if (off >= count || user_count > count - off) + return -ENXIO; - return ret; + if (!dev_is_dma_coherent(dev)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) + return -ENXIO; + pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + } else { + pfn = page_to_pfn(virt_to_page(cpu_addr)); + } + + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +#else + return -ENXIO; +#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } EXPORT_SYMBOL(dma_common_mmap); @@ -327,19 +346,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) vunmap(cpu_addr); } #endif - -/* - * enables DMA API use for a device - */ -int dma_configure(struct device *dev) -{ - if (dev->bus->dma_configure) - return dev->bus->dma_configure(dev); - return 0; -} - -void dma_deconfigure(struct device *dev) -{ - of_dma_deconfigure(dev); - acpi_dma_deconfigure(dev); -} diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c deleted file mode 100644 index 031fe235d958..000000000000 --- a/kernel/dma/noncoherent.c +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 Christoph Hellwig. - * - * DMA operations that map physical memory directly without providing cache - * coherence. - */ -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/scatterlist.h> - -static void dma_noncoherent_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); -} - -static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t addr; - - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, page_to_phys(page) + offset, - size, dir); - return addr; -} - -static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); - if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); - return nents; -} - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) -static void dma_noncoherent_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); - arch_sync_dma_for_cpu_all(dev); -} - -static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); - arch_sync_dma_for_cpu_all(dev); -} - -static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); -} - -static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); -} -#endif - -const struct dma_map_ops dma_noncoherent_ops = { - .alloc = arch_dma_alloc, - .free = arch_dma_free, - .mmap = arch_dma_mmap, - .sync_single_for_device = dma_noncoherent_sync_single_for_device, - .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, - .map_page = dma_noncoherent_map_page, - .map_sg = dma_noncoherent_map_sg, -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) - .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, - .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, - .unmap_page = dma_noncoherent_unmap_page, - .unmap_sg = dma_noncoherent_unmap_sg, -#endif - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, - .cache_sync = arch_dma_cache_sync, -}; -EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/kernel/events/core.c b/kernel/events/core.c index c80549bf82c6..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3935,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event->oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise @@ -8308,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) @@ -9425,9 +9433,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9683,12 +9689,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9700,13 +9702,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5d3cf407e374..4a9937076331 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,10 +459,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (size || handle->aux_flags) { /* * Only send RECORD_AUX if we have something useful to communicate + * + * Note: the OVERWRITE records by themselves are not considered + * useful, as they don't communicate any *new* information, + * aside from the short-lived offset, that becomes history at + * the next event sched-in and therefore isn't useful. + * The userspace that needs to copy out AUX data in overwrite + * mode should know to use user_page::aux_head for the actual + * offset. So, from now on we don't output AUX records that + * have *only* OVERWRITE flag set. */ - perf_event_aux_event(handle->event, aux_head, size, - handle->aux_flags); + if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); } rb->user_page->aux_head = rb->aux_head; diff --git a/kernel/futex.c b/kernel/futex.c index 11fc3bb456d6..3e2de8fc1891 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1365,9 +1365,9 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) - || WARN_ON(plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; + lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5092494bf261..6e6d467f3dec 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -124,6 +124,27 @@ static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, return area; } +/* Find the best CPU which has the lowest vector allocation count */ +static unsigned int matrix_find_best_cpu(struct irq_matrix *m, + const struct cpumask *msk) +{ + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + + best_cpu = UINT_MAX; + + for_each_cpu(cpu, msk) { + cm = per_cpu_ptr(m->maps, cpu); + + if (!cm->online || cm->available <= maxavl) + continue; + + best_cpu = cpu; + maxavl = cm->available; + } + return best_cpu; +} + /** * irq_matrix_assign_system - Assign system wide entry in the matrix * @m: Matrix pointer @@ -239,11 +260,21 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) * @m: Matrix pointer * @cpu: On which CPU the interrupt should be allocated */ -int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) +int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, + unsigned int *mapped_cpu) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit, end = m->alloc_end; + unsigned int bit, cpu, end = m->alloc_end; + struct cpumap *cm; + + if (cpumask_empty(msk)) + return -EINVAL; + + cpu = matrix_find_best_cpu(m, msk); + if (cpu == UINT_MAX) + return -ENOSPC; + cm = per_cpu_ptr(m->maps, cpu); + end = m->alloc_end; /* Get managed bit which are not allocated */ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); bit = find_first_bit(m->scratch_map, end); @@ -252,6 +283,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) set_bit(bit, cm->alloc_map); cm->allocated++; m->total_allocated++; + *mapped_cpu = cpu; trace_irq_matrix_alloc_managed(bit, cpu, m, cm); return bit; } @@ -322,37 +354,27 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu, best_cpu, maxavl = 0; + unsigned int cpu, bit; struct cpumap *cm; - unsigned int bit; - - best_cpu = UINT_MAX; - for_each_cpu(cpu, msk) { - cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online || cm->available <= maxavl) - continue; + cpu = matrix_find_best_cpu(m, msk); + if (cpu == UINT_MAX) + return -ENOSPC; - best_cpu = cpu; - maxavl = cm->available; - } + cm = per_cpu_ptr(m->maps, cpu); + bit = matrix_alloc_area(m, cm, 1, false); + if (bit >= m->alloc_end) + return -ENOSPC; + cm->allocated++; + cm->available--; + m->total_allocated++; + m->global_available--; + if (reserved) + m->global_reserved--; + *mapped_cpu = cpu; + trace_irq_matrix_alloc(bit, cpu, m, cm); + return bit; - if (maxavl) { - cm = per_cpu_ptr(m->maps, best_cpu); - bit = matrix_alloc_area(m, cm, 1, false); - if (bit < m->alloc_end) { - cm->allocated++; - cm->available--; - m->total_allocated++; - m->global_available--; - if (reserved) - m->global_reserved--; - *mapped_cpu = best_cpu; - trace_irq_matrix_alloc(bit, best_cpu, m, cm); - return bit; - } - } - return -ENOSPC; } /** diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 2e62503bea0d..b28028b08d44 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -38,23 +38,43 @@ static int jump_label_cmp(const void *a, const void *b) const struct jump_entry *jea = a; const struct jump_entry *jeb = b; - if (jea->key < jeb->key) + if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; - if (jea->key > jeb->key) + if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; return 0; } +static void jump_label_swap(void *a, void *b, int size) +{ + long delta = (unsigned long)a - (unsigned long)b; + struct jump_entry *jea = a; + struct jump_entry *jeb = b; + struct jump_entry tmp = *jea; + + jea->code = jeb->code - delta; + jea->target = jeb->target - delta; + jea->key = jeb->key - delta; + + jeb->code = tmp.code + delta; + jeb->target = tmp.target + delta; + jeb->key = tmp.key + delta; +} + static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; + void *swapfn = NULL; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) + swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); - sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); @@ -85,6 +105,7 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) int v, v1; STATIC_KEY_CHECK_USE(key); + lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc() calls; @@ -130,6 +151,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); + lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); @@ -160,6 +182,7 @@ EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); + lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); @@ -185,6 +208,8 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + lockdep_assert_cpus_held(); + /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -261,8 +286,8 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { - if (entry->code <= (unsigned long)end && - entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + if (jump_entry_code(entry) <= (unsigned long)end && + jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) return 1; return 0; @@ -321,16 +346,6 @@ static inline void static_key_set_linked(struct static_key *key) key->type |= JUMP_TYPE_LINKED; } -static inline struct static_key *jump_entry_key(struct jump_entry *entry) -{ - return (struct static_key *)((unsigned long)entry->key & ~1UL); -} - -static bool jump_entry_branch(struct jump_entry *entry) -{ - return (unsigned long)entry->key & 1UL; -} - /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in @@ -355,7 +370,7 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); - bool branch = jump_entry_branch(entry); + bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; @@ -363,19 +378,20 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop) + struct jump_entry *stop, + bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * An entry->code of 0 indicates an entry which has been * disabled because it was in an init text area. */ - if (entry->code) { - if (kernel_text_address(entry->code)) + if (init || !jump_entry_is_init(entry)) { + if (kernel_text_address(jump_entry_code(entry))) arch_jump_label_transform(entry, jump_label_type(entry)); else WARN_ONCE(1, "can't patch jump_label at %pS", - (void *)(unsigned long)entry->code); + (void *)jump_entry_code(entry)); } } } @@ -410,6 +426,9 @@ void __init jump_label_init(void) if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + if (init_section_contains((void *)jump_entry_code(iter), 1)) + jump_entry_set_init(iter); + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -422,26 +441,13 @@ void __init jump_label_init(void) cpus_read_unlock(); } -/* Disable any jump label entries in __init/__exit code */ -void __init jump_label_invalidate_initmem(void) -{ - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __stop___jump_table; - struct jump_entry *iter; - - for (iter = iter_start; iter < iter_stop; iter++) { - if (init_section_contains((void *)(unsigned long)iter->code, 1)) - iter->code = 0; - } -} - #ifdef CONFIG_MODULES static enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); - bool branch = jump_entry_branch(entry); + bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; @@ -455,7 +461,7 @@ struct static_key_mod { static inline struct static_key_mod *static_key_mod(struct static_key *key) { - WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED)); + WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } @@ -514,7 +520,8 @@ static void __jump_label_mod_update(struct static_key *key) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; - __jump_label_update(key, mod->entries, stop); + __jump_label_update(key, mod->entries, stop, + m && m->state == MODULE_STATE_COMING); } } @@ -560,12 +567,15 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + if (within_module_init(jump_entry_code(iter), mod)) + jump_entry_set_init(iter); + iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; - if (within_module(iter->key, mod)) { + if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } @@ -595,7 +605,7 @@ static int jump_label_add_module(struct module *mod) /* Only update if we've changed from our initial state */ if (jump_label_type(iter) != jump_label_init_type(iter)) - __jump_label_update(key, iter, iter_stop); + __jump_label_update(key, iter, iter_stop, true); } return 0; @@ -615,7 +625,7 @@ static void jump_label_del_module(struct module *mod) key = jump_entry_key(iter); - if (within_module(iter->key, mod)) + if (within_module((unsigned long)key, mod)) continue; /* No memory during module load */ @@ -651,19 +661,6 @@ static void jump_label_del_module(struct module *mod) } } -/* Disable any jump label entries in module init code */ -static void jump_label_invalidate_module_init(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; - struct jump_entry *iter; - - for (iter = iter_start; iter < iter_stop; iter++) { - if (within_module_init(iter->code, mod)) - iter->code = 0; - } -} - static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -685,9 +682,6 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_GOING: jump_label_del_module(mod); break; - case MODULE_STATE_LIVE: - jump_label_invalidate_module_init(mod); - break; } jump_label_unlock(); @@ -757,7 +751,8 @@ static void jump_label_update(struct static_key *key) entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop); + __jump_label_update(key, entry, stop, + system_state < SYSTEM_RUNNING); } #ifdef CONFIG_STATIC_KEYS_SELFTEST diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ab257be4d924..90e98e233647 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -546,8 +546,14 @@ static void do_free_cleaned_kprobes(void) struct optimized_kprobe *op, *tmp; list_for_each_entry_safe(op, tmp, &freeing_list, list) { - BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); + if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) { + /* + * This must not happen, but if there is a kprobe + * still in use, keep it on kprobes hash list. + */ + continue; + } free_aggr_kprobe(&op->kp); } } @@ -700,11 +706,11 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) } /* Cancel unoptimizing for reusing */ -static void reuse_unused_kprobe(struct kprobe *ap) +static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; + int ret; - BUG_ON(!kprobe_unused(ap)); /* * Unused kprobe MUST be on the way of delayed unoptimizing (means * there is still a relative jump) and disabled. @@ -714,8 +720,12 @@ static void reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - BUG_ON(!kprobe_optready(ap)); + ret = kprobe_optready(ap); + if (ret) + return ret; + optimize_kprobe(ap); + return 0; } /* Remove optimized instructions */ @@ -940,11 +950,16 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) #define kprobe_disarmed(p) kprobe_disabled(p) #define wait_for_kprobe_optimizer() do {} while (0) -/* There should be no unused kprobes can be reused without optimization */ -static void reuse_unused_kprobe(struct kprobe *ap) +static int reuse_unused_kprobe(struct kprobe *ap) { + /* + * If the optimized kprobe is NOT supported, the aggr kprobe is + * released at the same time that the last aggregated kprobe is + * unregistered. + * Thus there should be no chance to reuse unused kprobe. + */ printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); - BUG_ON(kprobe_unused(ap)); + return -EINVAL; } static void free_aggr_kprobe(struct kprobe *p) @@ -1259,8 +1274,6 @@ NOKPROBE_SYMBOL(cleanup_rp_inst); /* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { - BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); - if (p->post_handler) unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ @@ -1318,9 +1331,12 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) goto out; } init_aggr_kprobe(ap, orig_p); - } else if (kprobe_unused(ap)) + } else if (kprobe_unused(ap)) { /* This probe is going to die. Rescue it */ - reuse_unused_kprobe(ap); + ret = reuse_unused_kprobe(ap); + if (ret) + goto out; + } if (kprobe_gone(ap)) { /* @@ -1704,7 +1720,6 @@ noclean: return 0; disarmed: - BUG_ON(!kprobe_disarmed(ap)); hlist_del_rcu(&ap->hlist); return 0; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dd13f865ad40..1efada2dd9dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. */ unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -1391,7 +1391,9 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { @@ -2148,76 +2150,6 @@ static int check_no_collision(struct task_struct *curr, } /* - * This is for building a chain between just two different classes, - * instead of adding a new hlock upon current, which is done by - * add_chain_cache(). - * - * This can be called in any context with two classes, while - * add_chain_cache() must be done within the lock owener's context - * since it uses hlock which might be racy in another context. - */ -static inline int add_chain_cache_classes(unsigned int prev, - unsigned int next, - unsigned int irq_context, - u64 chain_key) -{ - struct hlist_head *hash_head = chainhashentry(chain_key); - struct lock_chain *chain; - - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - - /* - * We might need to take the graph lock, ensure we've got IRQs - * disabled to make this an IRQ-safe lock.. for recursion reasons - * lockdep won't complain about its own locking errors. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); - dump_stack(); - return 0; - } - - chain = lock_chains + nr_lock_chains++; - chain->chain_key = chain_key; - chain->irq_context = irq_context; - chain->depth = 2; - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; - chain_hlocks[chain->base] = prev - 1; - chain_hlocks[chain->base + 1] = next -1; - } -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - else { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); - dump_stack(); - return 0; - } -#endif - - hlist_add_head_rcu(&chain->entry, hash_head); - debug_atomic_inc(chain_lookup_misses); - inc_chains(); - - return 1; -} - -/* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. * @@ -3262,6 +3194,10 @@ static int __lock_is_held(const struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: + * + * The callers must make sure that IRQs are disabled before calling it, + * otherwise we could get an interrupt which would want to take locks, + * which would end up in lockdep again. */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, @@ -3279,14 +3215,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; - /* - * Lockdep should run with IRQs disabled, otherwise we could - * get an interrupt which would want to take locks, which would - * end up in lockdep and have you got a head-ache already? - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) check = 0; @@ -3300,7 +3228,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)&class->ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) @@ -3543,6 +3473,9 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, { struct held_lock *hlock; + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, @@ -3696,6 +3629,13 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; + /* + * The most likely case is when the unlock is on the innermost + * lock. In this case, we are done! + */ + if (i == depth-1) + return 1; + if (reacquire_held_locks(curr, depth, i + 1)) return 0; @@ -3703,10 +3643,14 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) - return 0; + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); - return 1; + /* + * Since reacquire_held_locks() would have called check_chain_key() + * indirectly via __lock_acquire(), we don't need to do it again + * on return. + */ + return 0; } static int __lock_is_held(const struct lockdep_map *lock, int read) @@ -4122,7 +4066,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) + if (unlikely(!lock_stat || !debug_locks)) return; if (unlikely(current->lockdep_recursion)) @@ -4142,7 +4086,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) + if (unlikely(!lock_stat || !debug_locks)) return; if (unlikely(current->lockdep_recursion)) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d459d624ba2a..88c847a41c8a 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -152,9 +152,15 @@ struct lockdep_stats { int nr_find_usage_forwards_recursions; int nr_find_usage_backwards_checks; int nr_find_usage_backwards_recursions; + + /* + * Per lock class locking operation stat counts + */ + unsigned long lock_class_ops[MAX_LOCKDEP_KEYS]; }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr) \ this_cpu_inc(lockdep_stats.ptr); @@ -179,9 +185,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); } \ __total; \ }) + +static inline void debug_class_ops_inc(struct lock_class *class) +{ + int idx; + + idx = class - lock_classes; + __debug_atomic_inc(lock_class_ops[idx]); +} + +static inline unsigned long debug_class_ops_read(struct lock_class *class) +{ + int idx, cpu; + unsigned long ops = 0; + + idx = class - lock_classes; + for_each_possible_cpu(cpu) + ops += per_cpu(lockdep_stats.lock_class_ops[idx], cpu); + return ops; +} + #else # define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 +# define debug_class_ops_inc(ptr) do { } while (0) #endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3dd980dfba2d..3d31f9b0059e 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -68,7 +68,7 @@ static int l_show(struct seq_file *m, void *v) seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP - seq_printf(m, " OPS:%8ld", class->ops); + seq_printf(m, " OPS:%8ld", debug_class_ops_read(class)); #endif #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index bfaeb05123ff..8a8c3c208c5e 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -74,12 +74,24 @@ */ #include "mcs_spinlock.h" +#define MAX_NODES 4 +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; #ifdef CONFIG_PARAVIRT_SPINLOCKS -#define MAX_NODES 8 -#else -#define MAX_NODES 4 + long reserved[2]; #endif +}; /* * The pending bit spinning loop count. @@ -101,7 +113,7 @@ * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); /* * We must be able to distinguish between no-tail and the tail at 0:0, @@ -126,7 +138,13 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail) int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - return per_cpu_ptr(&mcs_nodes[idx], cpu); + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; } #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) @@ -232,6 +250,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) #endif /* _Q_PENDING_BITS == 8 */ /** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** * set_locked - Set the lock bit and own the lock * @lock: Pointer to queued spinlock structure * @@ -326,43 +358,48 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) /* * trylock || pending * - * 0,0,0 -> 0,0,1 ; trylock - * 0,0,1 -> 0,1,1 ; pending + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ - val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); - if (!(val & ~_Q_LOCKED_MASK)) { - /* - * We're pending, wait for the owner to go away. - * - * *,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all - * clear_pending_set_locked() implementations imply full - * barriers. - */ - if (val & _Q_LOCKED_MASK) { - atomic_cond_read_acquire(&lock->val, - !(VAL & _Q_LOCKED_MASK)); - } + val = queued_fetch_set_pending_acquire(lock); - /* - * take ownership and clear the pending bit. - * - * *,1,0 -> *,0,1 - */ - clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); - return; + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; } /* - * If pending was clear but there are waiters in the queue, then - * we need to undo our setting of pending before we queue ourselves. + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> 0,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK)); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); + return; /* * End of pending bit optimistic spinning and beginning of MCS @@ -371,11 +408,16 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) queue: qstat_inc(qstat_lock_slowpath, true); pv_queue: - node = this_cpu_ptr(&mcs_nodes[0]); + node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); - node += idx; + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + qstat_inc(qstat_lock_idx1 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising @@ -476,16 +518,25 @@ locked: */ /* - * In the PV case we might already have _Q_LOCKED_VAL set. + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 * - * The atomic_cond_read_acquire() call above has provided the - * necessary acquire semantics required for locking. + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. */ - if (((val & _Q_TAIL_MASK) == tail) && - atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) - goto release; /* No contention */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } - /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ set_locked(lock); /* @@ -501,7 +552,7 @@ release: /* * release the node */ - __this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(qnodes[0].mcs.count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 5a0cf5f9008c..0130e488ebfe 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -49,8 +49,6 @@ enum vcpu_state { struct pv_node { struct mcs_spinlock mcs; - struct mcs_spinlock __res[3]; - int cpu; u8 state; }; @@ -281,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock)); + BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode)); pn->cpu = smp_processor_id(); pn->state = vcpu_running; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 6bd78c0740fc..42d3d8dc8f49 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -55,6 +55,9 @@ enum qlock_stats { qstat_pv_wait_node, qstat_lock_pending, qstat_lock_slowpath, + qstat_lock_idx1, + qstat_lock_idx2, + qstat_lock_idx3, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -82,6 +85,9 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_wait_node] = "pv_wait_node", [qstat_lock_pending] = "lock_pending", [qstat_lock_slowpath] = "lock_slowpath", + [qstat_lock_idx1] = "lock_index1", + [qstat_lock_idx2] = "lock_index2", + [qstat_lock_idx3] = "lock_index3", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2823d4163a37..581edcc63c26 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1485,9 +1485,9 @@ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) __rt_mutex_lock(lock, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); -#endif -#ifndef CONFIG_DEBUG_LOCK_ALLOC +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + /** * rt_mutex_lock - lock a rt_mutex * diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3064c50e181e..09b180063ee1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -180,7 +180,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * but it gives the spinners an early indication that the * readers now have the lock. */ - rwsem_set_reader_owned(sem); + __rwsem_set_reader_owned(sem, waiter->task); } /* @@ -233,8 +233,19 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) waiter.type = RWSEM_WAITING_FOR_READ; raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_ACTIVE_READ_BIAS has already + * been set in the count. + */ + if (atomic_long_read(&sem->count) >= 0) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } adjustment += RWSEM_WAITING_BIAS; + } list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 776308d2fa9e..e586f0d03ad3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -117,8 +117,9 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + rwsem_clear_reader_owned(sem); __up_read(sem); } @@ -181,7 +182,7 @@ void down_read_non_owner(struct rw_semaphore *sem) might_sleep(); __down_read(sem); - rwsem_set_reader_owned(sem); + __rwsem_set_reader_owned(sem, NULL); } EXPORT_SYMBOL(down_read_non_owner); @@ -215,7 +216,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index b9d0e72aa80f..bad2bca0268b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,24 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * The owner field of the rw_semaphore structure will be set to - * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear - * the owner field when it unlocks. A reader, on the other hand, will - * not touch the owner field when it unlocks. + * The least significant 2 bits of the owner value has the following + * meanings when set. + * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers + * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, + * i.e. the owner(s) cannot be readily determined. It can be reader + * owned or the owning writer is indeterminate. * - * In essence, the owner field now has the following 4 states: - * 1) 0 - * - lock is free or the owner hasn't set the field yet - * 2) RWSEM_READER_OWNED - * - lock is currently or previously owned by readers (lock is free - * or not set by owner yet) - * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well - * - lock is owned by an anonymous writer, so spinning on the lock - * owner should be disabled. - * 4) Other non-zero value - * - a writer owns the lock and other writers can spin on the lock owner. + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also puts its task_struct + * pointer into the owner field with both the RWSEM_READER_OWNED and + * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * largely be left untouched. So for a free or reader-owned rwsem, + * the owner value may contain information about the last reader that + * acquires the rwsem. The anonymous bit is set because that particular + * reader may or may not still own the lock. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. Ideally we would like to track all the readers that own + * a rwsem, but the overhead is simply too big. */ -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0) -#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED) +#define RWSEM_READER_OWNED (1UL << 0) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) @@ -44,15 +50,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) WRITE_ONCE(sem->owner, NULL); } +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + + WRITE_ONCE(sem->owner, (struct task_struct *)val); +} + static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) { - /* - * We check the owner value first to make sure that we will only - * do a write to the rwsem cacheline when it is really necessary - * to minimize cacheline contention. - */ - if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) - WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); + __rwsem_set_reader_owned(sem, current); } /* @@ -72,6 +89,25 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) { return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; } + +#ifdef CONFIG_DEBUG_RWSEMS +/* + * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there + * is a task pointer in owner of a reader-owned rwsem, it will be the + * real owner or one of the real owners. The only exception is when the + * unlock is done by up_read_non_owner(). + */ +#define rwsem_clear_reader_owned rwsem_clear_reader_owned +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = (unsigned long)current | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + if (READ_ONCE(sem->owner) == (struct task_struct *)val) + cmpxchg_relaxed((unsigned long *)&sem->owner, val, + RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); +} +#endif + #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -81,7 +117,18 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ +} + static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) { } #endif + +#ifndef rwsem_clear_reader_owned +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0be047dbd897..65a3b7e55b9f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; ww_acquire_init(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); @@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) diff --git a/kernel/module.c b/kernel/module.c index 6746c85511fe..49a405891587 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3317,6 +3317,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) ndx = find_sec(info, ".data..ro_after_init"); if (ndx) info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* + * Mark the __jump_table section as ro_after_init as well: these data + * structures are never modified, with the exception of entries that + * refer to code in the __init section, which are annotated as such + * at module load time. + */ + ndx = find_sec(info, "__jump_table"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49a44db..4b6a54da7e65 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -96,7 +96,7 @@ static int try_to_freeze_tasks(bool user_only) if (wq_busy) show_workqueue_state(); - if (!wakeup) { + if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5342f6fc022e..0bd595a0b610 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -63,6 +63,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +bool pm_suspend_via_s2idle(void) +{ + return mem_sleep_current == PM_SUSPEND_TO_IDLE; +} +EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); + void s2idle_set_ops(const struct platform_s2idle_ops *ops) { lock_system_sleep(); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9210379c0353..939a2056c87a 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -196,7 +196,7 @@ config RCU_BOOST This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. This option also prevents heavy loads from blocking RCU - callback invocation for all flavors of RCU. + callback invocation. Say Y here if you are working with real-time apps or heavy loads Say N here if you are unsure. @@ -225,12 +225,12 @@ config RCU_NOCB_CPU callback invocation to energy-efficient CPUs in battery-powered asymmetric multiprocessors. - This option offloads callback invocation from the set of - CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuox/N") will be created to - invoke callbacks, where the "N" is the CPU being offloaded, - and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and - "s" for RCU-sched. Nothing prevents this kthread from running + This option offloads callback invocation from the set of CPUs + specified at boot time by the rcu_nocbs parameter. For each + such CPU, a kthread ("rcuox/N") will be created to invoke + callbacks, where the "N" is the CPU being offloaded, and where + the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched + (!PREEMPT kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4d04683c31b2..2866166863f0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -176,8 +176,9 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally - * by call_rcu() and rcu callback execution, and are therefore not part of the - * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + * by call_rcu() and rcu callback execution, and are therefore not part + * of the RCU API. These are in rcupdate.h because they are used by all + * RCU implementations. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -223,6 +224,7 @@ void kfree(const void *); */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { + rcu_callback_t f; unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); @@ -233,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head);) - head->func(head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); rcu_lock_release(&rcu_callback_map); return false; } @@ -328,40 +332,35 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } -/* Returns first leaf rcu_node of the specified RCU flavor. */ -#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) +/* Returns a pointer to the first leaf rcu_node structure. */ +#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1]) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) /* Is this rcu_node the last leaf? */ -#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) +#define rcu_is_last_leaf_node(rnp) ((rnp) == &rcu_state.node[rcu_num_nodes - 1]) /* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. + * Do a full breadth-first scan of the {s,}rcu_node structures for the + * specified state structure (for SRCU) or the only rcu_state structure + * (for RCU). */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +#define srcu_for_each_node_breadth_first(sp, rnp) \ + for ((rnp) = &(sp)->node[0]; \ + (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) +#define rcu_for_each_node_breadth_first(rnp) \ + srcu_for_each_node_breadth_first(&rcu_state, rnp) /* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. + * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. + * Note that if there is a singleton rcu_node tree with but one rcu_node + * structure, this loop -will- visit the rcu_node structure. It is still + * a leaf node, even if it is also the root node. */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = rcu_first_leaf_node(rsp); \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +#define rcu_for_each_leaf_node(rnp) \ + for ((rnp) = rcu_first_leaf_node(); \ + (rnp) < &rcu_state.node[rcu_num_nodes]; (rnp)++) /* * Iterate over all possible CPUs in a leaf RCU node. @@ -435,6 +434,12 @@ do { \ #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_SRCU +void srcu_init(void); +#else /* #ifdef CONFIG_SRCU */ +static inline void srcu_init(void) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } @@ -515,29 +520,19 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #ifdef CONFIG_TINY_RCU static inline unsigned long rcu_get_gp_seq(void) { return 0; } -static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } -static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } -static inline void rcu_bh_force_quiescent_state(void) { } -static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); -unsigned long rcu_bh_get_gp_seq(void); -unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); -void rcu_bh_force_quiescent_state(void); -void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 34244523550e..b459da70b4fc 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -190,36 +190,6 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for rcu_bh perf testing. - */ - -static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static struct rcu_perf_ops rcu_bh_ops = { - .ptype = RCU_BH_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_bh_perf_read_lock, - .readunlock = rcu_bh_perf_read_unlock, - .get_gp_seq = rcu_bh_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed_sched, - .async = call_rcu_bh, - .gp_barrier = rcu_barrier_bh, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .name = "rcu_bh" -}; - -/* * Definitions for srcu perf testing. */ @@ -306,36 +276,6 @@ static struct rcu_perf_ops srcud_ops = { }; /* - * Definitions for sched perf testing. - */ - -static int sched_perf_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_perf_read_unlock(int idx) -{ - preempt_enable(); -} - -static struct rcu_perf_ops sched_ops = { - .ptype = RCU_SCHED_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = sched_perf_read_lock, - .readunlock = sched_perf_read_unlock, - .get_gp_seq = rcu_sched_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed_sched, - .async = call_rcu_sched, - .gp_barrier = rcu_barrier_sched, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .name = "sched" -}; - -/* * Definitions for RCU-tasks perf testing. */ @@ -611,7 +551,7 @@ rcu_perf_cleanup(void) kfree(writer_n_durations); } - /* Do flavor-specific cleanup operations. */ + /* Do torture-type-specific cleanup operations. */ if (cur_ops->cleanup != NULL) cur_ops->cleanup(); @@ -661,8 +601,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - &tasks_ops, + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; if (!torture_init_begin(perf_type, verbose)) @@ -680,6 +619,7 @@ rcu_perf_init(void) for (i = 0; i < ARRAY_SIZE(perf_ops); i++) pr_cont(" %s", perf_ops[i]->name); pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c596c6f1e457..210c77460365 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -66,15 +66,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@jos /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) -#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 4 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ - RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ +#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ +#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_MAX_EXTEND \ + (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. */ +#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); @@ -89,6 +93,12 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); +torture_param(int, fwd_progress_holdoff, 60, + "Time between forward-progress tests (s)"); +torture_param(bool, fwd_progress_need_resched, 1, + "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, @@ -125,7 +135,7 @@ torture_param(int, verbose, 1, static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealreaders; static int ncbflooders; @@ -137,6 +147,7 @@ static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; +static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -197,6 +208,18 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_STOPPING", }; +/* Record reader segment types and duration for first failing read. */ +struct rt_read_seg { + int rt_readstate; + unsigned long rt_delay_jiffies; + unsigned long rt_delay_ms; + unsigned long rt_delay_us; + bool rt_preempted; +}; +static int err_segs_recorded; +static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; +static int rt_read_nsegs; + static const char *rcu_torture_writer_state_getname(void) { unsigned int i = READ_ONCE(rcu_torture_writer_state); @@ -278,7 +301,8 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); - void (*read_delay)(struct torture_random_state *rrsp); + void (*read_delay)(struct torture_random_state *rrsp, + struct rt_read_seg *rtrsp); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); @@ -291,6 +315,7 @@ struct rcu_torture_ops { void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); + int (*stall_dur)(void); int irq_capable; int can_boost; int extendables; @@ -310,12 +335,13 @@ static int rcu_torture_read_lock(void) __acquires(RCU) return 0; } -static void rcu_read_delay(struct torture_random_state *rrsp) +static void +rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) { unsigned long started; unsigned long completed; const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; + unsigned long longdelay_ms = 300; unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace @@ -325,16 +351,23 @@ static void rcu_read_delay(struct torture_random_state *rrsp) if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); + if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + longdelay_ms = 5; /* Avoid triggering BH limits. */ mdelay(longdelay_ms); + rtrsp->rt_delay_ms = longdelay_ms; completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } - if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) { udelay(shortdelay_us); + rtrsp->rt_delay_us = shortdelay_us; + } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) + !(torture_random(rrsp) % (nrealreaders * 500))) { torture_preempt_schedule(); /* QS only if preemptible. */ + rtrsp->rt_preempted = true; + } } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -429,53 +462,14 @@ static struct rcu_torture_ops rcu_ops = { .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, + .stall_dur = rcu_jiffies_till_stall_check, .irq_capable = 1, .can_boost = rcu_can_boost(), + .extendables = RCUTORTURE_MAX_EXTEND, .name = "rcu" }; /* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .ttype = RCU_BH_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .get_gp_seq = rcu_bh_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), - .ext_irq_conflict = RCUTORTURE_RDR_RCU, - .name = "rcu_bh" -}; - -/* * Don't even think about trying any of these in real life!!! * The names includes "busted", and they really means it! * The only purpose of these functions is to provide a buggy RCU @@ -531,7 +525,8 @@ static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) return srcu_read_lock(srcu_ctlp); } -static void srcu_read_delay(struct torture_random_state *rrsp) +static void +srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) { long delay; const long uspertick = 1000000 / HZ; @@ -541,10 +536,12 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay && in_task()) + if (!delay && in_task()) { schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); + rtrsp->rt_delay_jiffies = longdelay; + } else { + rcu_read_delay(rrsp, rtrsp); + } } static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) @@ -663,48 +660,6 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .ttype = RCU_SCHED_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .get_gp_seq = rcu_sched_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .get_state = get_state_synchronize_sched, - .cond_sync = cond_synchronize_sched, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .extendables = RCUTORTURE_MAX_EXTEND, - .name = "sched" -}; - -/* * Definitions for RCU-tasks torture testing. */ @@ -1116,7 +1071,8 @@ rcu_torture_writer(void *arg) break; } } - rcu_torture_current_version++; + WRITE_ONCE(rcu_torture_current_version, + rcu_torture_current_version + 1); /* Cycle through nesting levels of rcu_expedite_gp() calls. */ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1132,7 +1088,10 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - stutter_wait("rcu_torture_writer"); + if (stutter_wait("rcu_torture_writer")) + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) + if (list_empty(&rcu_tortures[i].rtort_free)) + WARN_ON_ONCE(1); } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1199,7 +1158,8 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * change, do a ->read_delay(). */ static void rcutorture_one_extend(int *readstate, int newstate, - struct torture_random_state *trsp) + struct torture_random_state *trsp, + struct rt_read_seg *rtrsp) { int idxnew = -1; int idxold = *readstate; @@ -1208,6 +1168,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(idxold < 0); WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) @@ -1216,6 +1177,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_irq_disable(); if (statesnew & RCUTORTURE_RDR_PREEMPT) preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RBH) + rcu_read_lock_bh(); + if (statesnew & RCUTORTURE_RDR_SCHED) + rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU) idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; @@ -1226,12 +1191,16 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_bh_enable(); if (statesold & RCUTORTURE_RDR_PREEMPT) preempt_enable(); + if (statesold & RCUTORTURE_RDR_RBH) + rcu_read_unlock_bh(); + if (statesold & RCUTORTURE_RDR_SCHED) + rcu_read_unlock_sched(); if (statesold & RCUTORTURE_RDR_RCU) cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) - cur_ops->read_delay(trsp); + cur_ops->read_delay(trsp, rtrsp); /* Update the reader state. */ if (idxnew == -1) @@ -1260,18 +1229,19 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); unsigned long randmask1 = torture_random(trsp) >> 8; - unsigned long randmask2 = randmask1 >> 1; + unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Half the time lots of bits, half the time only one bit. */ - if (randmask1 & 0x1) + /* Most of the time lots of bits, half the time only one bit. */ + if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + /* Can't enable bh w/irq disabled. */ if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & RCUTORTURE_RDR_BH) && - (oldmask & RCUTORTURE_RDR_BH)) - mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || + (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; if ((mask & RCUTORTURE_RDR_IRQ) && !(mask & cur_ops->ext_irq_conflict) && (oldmask & cur_ops->ext_irq_conflict)) @@ -1283,20 +1253,25 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * Do a randomly selected number of extensions of an existing RCU read-side * critical section. */ -static void rcutorture_loop_extend(int *readstate, - struct torture_random_state *trsp) +static struct rt_read_seg * +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, + struct rt_read_seg *rtrsp) { int i; + int j; int mask = rcutorture_extend_mask_max(); WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ if (!((mask - 1) & mask)) - return; /* Current RCU flavor not extendable. */ - i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; - while (i--) { + return rtrsp; /* Current RCU reader not extendable. */ + /* Bias towards larger numbers of loops. */ + i = (torture_random(trsp) >> 3); + i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; + for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, trsp); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } + return &rtrsp[j]; } /* @@ -1306,16 +1281,20 @@ static void rcutorture_loop_extend(int *readstate, */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { + int i; unsigned long started; unsigned long completed; int newstate; struct rcu_torture *p; int pipe_count; int readstate = 0; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; + struct rt_read_seg *rtrsp = &rtseg[0]; + struct rt_read_seg *rtrsp1; unsigned long long ts; newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1325,12 +1304,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp); + rcutorture_one_extend(&readstate, 0, trsp, rtrsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcutorture_loop_extend(&readstate, trsp); + rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -1351,8 +1330,17 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - rcutorture_one_extend(&readstate, 0, trsp); + rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + + /* If error or close call, record the sequence of reader protections. */ + if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { + i = 0; + for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + err_segs[i++] = *rtrsp1; + rt_read_nsegs = i; + } + return true; } @@ -1387,6 +1375,9 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { + unsigned long lastsleep = jiffies; + long myid = (long)arg; + int mynumonline = myid; DEFINE_TORTURE_RANDOM(rand); struct timer_list t; @@ -1402,6 +1393,12 @@ rcu_torture_reader(void *arg) } if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); + if (time_after(jiffies, lastsleep)) { + schedule_timeout_interruptible(1); + lastsleep = jiffies + 10; + } + while (num_online_cpus() < mynumonline && !torture_must_stop()) + schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1655,6 +1652,121 @@ static int __init rcu_torture_stall_init(void) return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } +/* State structure for forward-progress self-propagating RCU callback. */ +struct fwd_cb_state { + struct rcu_head rh; + int stop; +}; + +/* + * Forward-progress self-propagating RCU callback function. Because + * callbacks run from softirq, this function is an implicit RCU read-side + * critical section. + */ +static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) +{ + struct fwd_cb_state *fcsp = container_of(rhp, struct fwd_cb_state, rh); + + if (READ_ONCE(fcsp->stop)) { + WRITE_ONCE(fcsp->stop, 2); + return; + } + cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb); +} + +/* Carry out grace-period forward-progress testing. */ +static int rcu_torture_fwd_prog(void *args) +{ + unsigned long cver; + unsigned long dur; + struct fwd_cb_state fcs; + unsigned long gps; + int idx; + int sd; + int sd4; + bool selfpropcb = false; + unsigned long stopat; + int tested = 0; + int tested_tries = 0; + static DEFINE_TORTURE_RANDOM(trs); + + VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) + set_user_nice(current, MAX_NICE); + if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + init_rcu_head_on_stack(&fcs.rh); + selfpropcb = true; + } + do { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 0); + cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); + } + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + sd = cur_ops->stall_dur() + 1; + sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; + dur = sd4 + torture_random(&trs) % (sd - sd4); + stopat = jiffies + dur; + while (time_before(jiffies, stopat) && !torture_must_stop()) { + idx = cur_ops->readlock(); + udelay(10); + cur_ops->readunlock(idx); + if (!fwd_progress_need_resched || need_resched()) + cond_resched(); + } + tested_tries++; + if (!time_before(jiffies, stopat) && !torture_must_stop()) { + tested++; + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + WARN_ON(!cver && gps < 2); + pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + } + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 1); + cur_ops->sync(); /* Wait for running CB to complete. */ + cur_ops->cb_barrier(); /* Wait for queued callbacks. */ + } + /* Avoid slow periods, better to test when busy. */ + stutter_wait("rcu_torture_fwd_prog"); + } while (!torture_must_stop()); + if (selfpropcb) { + WARN_ON(READ_ONCE(fcs.stop) != 2); + destroy_rcu_head_on_stack(&fcs.rh); + } + /* Short runs might not contain a valid forward-progress attempt. */ + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + torture_kthread_stopping("rcu_torture_fwd_prog"); + return 0; +} + +/* If forward-progress checking is requested and feasible, spawn the thread. */ +static int __init rcu_torture_fwd_prog_init(void) +{ + if (!fwd_progress) + return 0; /* Not requested, so don't do it. */ + if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + return 0; + } + if (stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS)) + return -EINVAL; /* In module, can fail back to user. */ + WARN_ON(1); /* Make sure rcutorture notices conflict. */ + return 0; + } + if (fwd_progress_holdoff <= 0) + fwd_progress_holdoff = 1; + if (fwd_progress_div <= 0) + fwd_progress_div = 4; + return torture_create_kthread(rcu_torture_fwd_prog, + NULL, fwd_prog_task); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -1817,6 +1929,7 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int firsttime; int flags = 0; unsigned long gp_seq = 0; int i; @@ -1828,6 +1941,7 @@ rcu_torture_cleanup(void) } rcu_torture_barrier_cleanup(); + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); @@ -1860,7 +1974,7 @@ rcu_torture_cleanup(void) cpuhp_remove_state(rcutor_hp); /* - * Wait for all RCU callbacks to fire, then do flavor-specific + * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ if (cur_ops->cb_barrier != NULL) @@ -1870,6 +1984,33 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + if (err_segs_recorded) { + pr_alert("Failure/close-call rcutorture reader segments:\n"); + if (rt_read_nsegs == 0) + pr_alert("\t: No segments recorded!!!\n"); + firsttime = 1; + for (i = 0; i < rt_read_nsegs; i++) { + pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + if (err_segs[i].rt_delay_jiffies != 0) { + pr_cont("%s%ldjiffies", firsttime ? "" : "+", + err_segs[i].rt_delay_jiffies); + firsttime = 0; + } + if (err_segs[i].rt_delay_ms != 0) { + pr_cont("%s%ldms", firsttime ? "" : "+", + err_segs[i].rt_delay_ms); + firsttime = 0; + } + if (err_segs[i].rt_delay_us != 0) { + pr_cont("%s%ldus", firsttime ? "" : "+", + err_segs[i].rt_delay_us); + firsttime = 0; + } + pr_cont("%s\n", + err_segs[i].rt_preempted ? "preempted" : ""); + + } + } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (torture_onoff_failures()) @@ -1939,12 +2080,12 @@ static void rcu_test_debug_objects(void) static int __init rcu_torture_init(void) { - int i; + long i; int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &sched_ops, &tasks_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, + &busted_srcud_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -1963,6 +2104,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_cont(" %s", torture_ops[i]->name); pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; goto unwind; } @@ -2013,6 +2155,8 @@ rcu_torture_init(void) per_cpu(rcu_torture_batch, cpu)[i] = 0; } } + err_segs_recorded = 0; + rt_read_nsegs = 0; /* Start up the kthreads. */ @@ -2044,7 +2188,7 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_torture_reader, NULL, + firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; @@ -2100,6 +2244,9 @@ rcu_torture_init(void) firsterr = rcu_torture_stall_init(); if (firsterr) goto unwind; + firsterr = rcu_torture_fwd_prog_init(); + if (firsterr) + goto unwind; firsterr = rcu_torture_barrier_init(); if (firsterr) goto unwind; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 04fc2ed71af8..b46e6683f8c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,8 @@ #include "rcu.h" int rcu_scheduler_active __read_mostly; +static LIST_HEAD(srcu_boot_list); +static bool srcu_init_done; static int init_srcu_struct_fields(struct srcu_struct *sp) { @@ -46,6 +48,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_gp_waiting = false; sp->srcu_idx = 0; INIT_WORK(&sp->srcu_work, srcu_drive_gp); + INIT_LIST_HEAD(&sp->srcu_work.entry); return 0; } @@ -179,8 +182,12 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, *sp->srcu_cb_tail = rhp; sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(sp->srcu_gp_running)) - schedule_work(&sp->srcu_work); + if (!READ_ONCE(sp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&sp->srcu_work); + else if (list_empty(&sp->srcu_work.entry)) + list_add(&sp->srcu_work.entry, &srcu_boot_list); + } } EXPORT_SYMBOL_GPL(call_srcu); @@ -204,3 +211,21 @@ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } + +/* + * Queue work for srcu_struct structures with early boot callbacks. + * The work won't actually execute until the workqueue initialization + * phase that takes place after the scheduler starts. + */ +void __init srcu_init(void) +{ + struct srcu_struct *sp; + + srcu_init_done = true; + while (!list_empty(&srcu_boot_list)) { + sp = list_first_entry(&srcu_boot_list, + struct srcu_struct, srcu_work.entry); + list_del_init(&sp->srcu_work.entry); + schedule_work(&sp->srcu_work); + } +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6c9866a854b1..a8846ed7f352 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,10 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* Early-boot callback-management, so early that no lock is required! */ +static LIST_HEAD(srcu_boot_list); +static bool __read_mostly srcu_init_done; + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); @@ -105,7 +109,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ - rcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(sp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); @@ -235,7 +239,6 @@ static void check_init_srcu_struct(struct srcu_struct *sp) { unsigned long flags; - WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ @@ -561,7 +564,7 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - rcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; last_lvl = snp >= sp->level[rcu_num_lvls - 1]; @@ -701,7 +704,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); + if (likely(srcu_init_done)) + queue_delayed_work(rcu_gp_wq, &sp->work, + srcu_get_delay(sp)); + else if (list_empty(&sp->work.work.entry)) + list_add(&sp->work.work.entry, &srcu_boot_list); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -980,7 +987,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * There are memory-ordering constraints implied by synchronize_srcu(). * On systems with more than one CPU, when synchronize_srcu() returns, * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section + * the end of its last corresponding SRCU read-side critical section * whose beginning preceded the call to synchronize_srcu(). In addition, * each CPU having an SRCU read-side critical section that extends beyond * the return from synchronize_srcu() is guaranteed to have executed a @@ -1308,3 +1315,17 @@ static int __init srcu_bootup_announce(void) return 0; } early_initcall(srcu_bootup_announce); + +void __init srcu_init(void) +{ + struct srcu_struct *sp; + + srcu_init_done = true; + while (!list_empty(&srcu_boot_list)) { + sp = list_first_entry(&srcu_boot_list, struct srcu_struct, + work.work.entry); + check_init_srcu_struct(sp); + list_del_init(&sp->work.work.entry); + queue_work(rcu_gp_wq, &sp->work.work); + } +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index befc9321a89c..5f5963ba313e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -46,69 +46,27 @@ struct rcu_ctrlblk { }; /* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, +static struct rcu_ctrlblk rcu_ctrlblk = { + .donetail = &rcu_ctrlblk.rcucblist, + .curtail = &rcu_ctrlblk.rcucblist, }; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, -}; - -void rcu_barrier_bh(void) -{ - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL(rcu_barrier_bh); - -void rcu_barrier_sched(void) -{ - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL(rcu_barrier_sched); - -/* - * Helper function for rcu_sched_qs() and rcu_bh_qs(). - * Also irqs are disabled to avoid confusion due to interrupt handlers - * invoking call_rcu(). - */ -static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) -{ - if (rcp->donetail != rcp->curtail) { - rcp->donetail = rcp->curtail; - return 1; - } - - return 0; -} - -/* - * Record an rcu quiescent state. And an rcu_bh quiescent state while we - * are at it, given that any rcu quiescent state is also an rcu_bh - * quiescent state. Use "+" instead of "||" to defeat short circuiting. - */ -void rcu_sched_qs(void) +void rcu_barrier(void) { - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_sched_ctrlblk) + - rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); - local_irq_restore(flags); + wait_rcu_gp(call_rcu); } +EXPORT_SYMBOL(rcu_barrier); -/* - * Record an rcu_bh quiescent state. - */ -void rcu_bh_qs(void) +/* Record an rcu quiescent state. */ +void rcu_qs(void) { unsigned long flags; local_irq_save(flags); - if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { + rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq(RCU_SOFTIRQ); + } local_irq_restore(flags); } @@ -120,34 +78,33 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - if (user) - rcu_sched_qs(); - if (user || !in_softirq()) - rcu_bh_qs(); + if (user) { + rcu_qs(); + } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } } -/* - * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure - * whose grace period has elapsed. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +/* Invoke the RCU callbacks whose grace period has elapsed. */ +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_head *next, *list; unsigned long flags; /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); - if (rcp->donetail == &rcp->rcucblist) { + if (rcu_ctrlblk.donetail == &rcu_ctrlblk.rcucblist) { /* No callbacks ready, so just leave. */ local_irq_restore(flags); return; } - list = rcp->rcucblist; - rcp->rcucblist = *rcp->donetail; - *rcp->donetail = NULL; - if (rcp->curtail == rcp->donetail) - rcp->curtail = &rcp->rcucblist; - rcp->donetail = &rcp->rcucblist; + list = rcu_ctrlblk.rcucblist; + rcu_ctrlblk.rcucblist = *rcu_ctrlblk.donetail; + *rcu_ctrlblk.donetail = NULL; + if (rcu_ctrlblk.curtail == rcu_ctrlblk.donetail) + rcu_ctrlblk.curtail = &rcu_ctrlblk.rcucblist; + rcu_ctrlblk.donetail = &rcu_ctrlblk.rcucblist; local_irq_restore(flags); /* Invoke the callbacks on the local list. */ @@ -162,37 +119,31 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) } } -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); -} - /* * Wait for a grace period to elapse. But it is illegal to invoke - * synchronize_sched() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_sched() is a quiescent - * state, and so on a UP system, synchronize_sched() need do nothing. - * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the - * benefits of doing might_sleep() to reduce latency.) + * synchronize_rcu() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_rcu() is a quiescent + * state, and so on a UP system, synchronize_rcu() need do nothing. + * (But Lai Jiangshan points out the benefits of doing might_sleep() + * to reduce latency.) * * Cool, huh? (Due to Josh Triplett.) */ -void synchronize_sched(void) +void synchronize_rcu(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + "Illegal synchronize_rcu() in RCU read-side critical section"); } -EXPORT_SYMBOL_GPL(synchronize_sched); +EXPORT_SYMBOL_GPL(synchronize_rcu); /* - * Helper function for call_rcu() and call_rcu_bh(). + * Post an RCU callback to be invoked after the end of an RCU grace + * period. But since we have but one CPU, that would be after any + * quiescent state. */ -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; @@ -201,39 +152,20 @@ static void __call_rcu(struct rcu_head *head, head->next = NULL; local_irq_save(flags); - *rcp->curtail = head; - rcp->curtail = &head->next; + *rcu_ctrlblk.curtail = head; + rcu_ctrlblk.curtail = &head->next; local_irq_restore(flags); if (unlikely(is_idle_task(current))) { - /* force scheduling for rcu_sched_qs() */ + /* force scheduling for rcu_qs() */ resched_cpu(0); } } - -/* - * Post an RCU callback to be invoked after the end of an RCU-sched grace - * period. But since we have but one CPU, that would be after any - * quiescent state. - */ -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_sched_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Post an RCU bottom-half callback to be invoked after any subsequent - * quiescent state. - */ -void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_bh_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); +EXPORT_SYMBOL_GPL(call_rcu); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); + srcu_init(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0b760c1369f7..121f833acd04 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -61,6 +61,7 @@ #include <linux/trace_events.h> #include <linux/suspend.h> #include <linux/ftrace.h> +#include <linux/tick.h> #include "tree.h" #include "rcu.h" @@ -73,45 +74,31 @@ /* Data structures. */ /* - * In order to export the rcu_state name to the tracing tools, it - * needs to be added in the __tracepoint_string section. - * This requires defining a separate variable tp_<sname>_varname - * that points to the string being used, and this will allow - * the tracing userspace tools to be able to decipher the string - * address to the matching string. + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. */ -#ifdef CONFIG_TRACING -# define DEFINE_RCU_TPS(sname) \ -static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; -# define RCU_STATE_NAME(sname) sname##_varname -#else -# define DEFINE_RCU_TPS(sname) -# define RCU_STATE_NAME(sname) __stringify(sname) +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) #endif -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ -DEFINE_RCU_TPS(sname) \ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ -struct rcu_state sname##_state = { \ - .level = { &sname##_state.node[0] }, \ - .rda = &sname##_data, \ - .call = cr, \ - .gp_state = RCU_GP_IDLE, \ - .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ - .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .name = RCU_STATE_NAME(sname), \ - .abbr = sabbr, \ - .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ - .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ - .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ -} - -RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); -RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); - -static struct rcu_state *const rcu_state_p; -LIST_HEAD(rcu_struct_flavors); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { + .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +}; +struct rcu_state rcu_state = { + .level = { &rcu_state.node[0] }, + .gp_state = RCU_GP_IDLE, + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, + .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .name = RCU_NAME, + .abbr = RCU_ABBR, + .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), + .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), + .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), +}; /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; @@ -158,16 +145,14 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long gps, unsigned long flags); +static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, + unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); -static void rcu_report_exp_rdp(struct rcu_state *rsp, - struct rcu_data *rdp, bool wake); +static void invoke_rcu_callbacks(struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ @@ -183,7 +168,7 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); -/* Retreive RCU kthreads priority for rcutorture */ +/* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { return kthread_prio; @@ -217,67 +202,24 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. */ -static int rcu_gp_in_progress(struct rcu_state *rsp) +static int rcu_gp_in_progress(void) { - return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); -} - -/* - * Note a quiescent state. Because we do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period, this just sets a flag. - * The caller must have disabled preemption. - */ -void rcu_sched_qs(void) -{ - RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) - return; - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gp_seq), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); + return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); } -void rcu_bh_qs(void) +void rcu_softirq_qs(void) { - RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); - if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gp_seq), - TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); - } + rcu_qs(); + rcu_preempt_deferred_qs(current); } /* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif - -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -}; - -/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. */ static void rcu_dynticks_eqs_enter(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -285,7 +227,7 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); /* Better be in an extended quiescent state! */ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -300,7 +242,7 @@ static void rcu_dynticks_eqs_enter(void) */ static void rcu_dynticks_eqs_exit(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -308,11 +250,11 @@ static void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ /* Prefer duplicate flushes to losing a flush. */ rcu_eqs_special_exit(); @@ -331,11 +273,11 @@ static void rcu_dynticks_eqs_exit(void) */ static void rcu_dynticks_eqs_online(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); } /* @@ -345,18 +287,18 @@ static void rcu_dynticks_eqs_online(void) */ bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdtp->dynticks); + int snap = atomic_add_return(0, &rdp->dynticks); return snap & ~RCU_DYNTICK_CTRL_MASK; } @@ -371,13 +313,13 @@ static bool rcu_dynticks_in_eqs(int snap) } /* - * Return true if the CPU corresponding to the specified rcu_dynticks + * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since * rcu_dynticks_snap() returned the specified snapshot. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdtp); + return snap != rcu_dynticks_snap(rdp); } /* @@ -391,14 +333,14 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); do { - old = atomic_read(&rdtp->dynticks); + old = atomic_read(&rdp->dynticks); if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); return true; } @@ -413,82 +355,30 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -static void rcu_momentary_dyntick_idle(void) +static void __maybe_unused rcu_momentary_dyntick_idle(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); int special; - raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &this_cpu_ptr(&rcu_data)->dynticks); /* It is illegal to call this from idle state. */ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + rcu_preempt_deferred_qs(current); } -/* - * Note a context switch. This is a quiescent state for RCU-sched, - * and requires special handling for preemptible RCU. - * The caller must have disabled interrupts. - */ -void rcu_note_context_switch(bool preempt) -{ - barrier(); /* Avoid RCU read-side critical sections leaking down. */ - trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(); - rcu_preempt_note_context_switch(preempt); - /* Load rcu_urgent_qs before other flags. */ - if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) - goto out; - this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); - this_cpu_inc(rcu_dynticks.rcu_qs_ctr); - if (!preempt) - rcu_tasks_qs(current); -out: - trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ -} -EXPORT_SYMBOL_GPL(rcu_note_context_switch); - -/* - * Register a quiescent state for all RCU flavors. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight - * dyntick-idle quiescent state visible to other CPUs (but only for those - * RCU flavors in desperate need of a quiescent state, which will normally - * be none of them). Either way, do a lightweight quiescent state for - * all RCU flavors. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. +/** + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. */ -void rcu_all_qs(void) +static int rcu_is_cpu_rrupt_from_idle(void) { - unsigned long flags; - - if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) - return; - preempt_disable(); - /* Load rcu_urgent_qs before other flags. */ - if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { - preempt_enable(); - return; - } - this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { - local_irq_save(flags); - rcu_momentary_dyntick_idle(); - local_irq_restore(flags); - } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) - rcu_sched_qs(); - this_cpu_inc(rcu_dynticks.rcu_qs_ctr); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ - preempt_enable(); + return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; } -EXPORT_SYMBOL_GPL(rcu_all_qs); #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ static long blimit = DEFAULT_RCU_BLIMIT; @@ -505,13 +395,47 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). + */ +static ulong jiffies_till_sched_qs = ULONG_MAX; +module_param(jiffies_till_sched_qs, ulong, 0444); +static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ + +/* + * Make sure that we give the grace-period kthread time to detect any + * idle CPUs before taking active measures to force quiescent states. + * However, don't go below 100 milliseconds, adjusted upwards for really + * large systems. + */ +static void adjust_jiffies_till_sched_qs(void) +{ + unsigned long j; + + /* If jiffies_till_sched_qs was specified, respect the request. */ + if (jiffies_till_sched_qs != ULONG_MAX) { + WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); + return; + } + j = READ_ONCE(jiffies_till_first_fqs) + + 2 * READ_ONCE(jiffies_till_next_fqs); + if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) + j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j); + WRITE_ONCE(jiffies_to_sched_qs, j); +} + static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) { ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -520,8 +444,10 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -539,15 +465,8 @@ module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_fi module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); -/* - * How long the grace period must be before we start recruiting - * quiescent-state help from rcu_note_context_switch(). - */ -static ulong jiffies_till_sched_qs = HZ / 10; -module_param(jiffies_till_sched_qs, ulong, 0444); - -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); -static void force_quiescent_state(struct rcu_state *rsp); +static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); +static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -555,29 +474,11 @@ static int rcu_pending(void); */ unsigned long rcu_get_gp_seq(void) { - return READ_ONCE(rcu_state_p->gp_seq); + return READ_ONCE(rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched GPs completed thus far for debug & stats. - */ -unsigned long rcu_sched_get_gp_seq(void) -{ - return READ_ONCE(rcu_sched_state.gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); - -/* - * Return the number of RCU-bh GPs completed thus far for debug & stats. - */ -unsigned long rcu_bh_get_gp_seq(void) -{ - return READ_ONCE(rcu_bh_state.gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); - -/* * Return the number of RCU expedited batches completed thus far for * debug & stats. Odd numbers mean that a batch is in progress, even * numbers mean idle. The value returned will thus be roughly double @@ -585,48 +486,20 @@ EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); */ unsigned long rcu_exp_batches_completed(void) { - return rcu_state_p->expedited_sequence; + return rcu_state.expedited_sequence; } EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Return the number of RCU-sched expedited batches completed thus far - * for debug & stats. Similar to rcu_exp_batches_completed(). - */ -unsigned long rcu_exp_batches_completed_sched(void) -{ - return rcu_sched_state.expedited_sequence; -} -EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); - -/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) { - force_quiescent_state(rcu_state_p); + force_quiescent_state(); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Force a quiescent state for RCU BH. - */ -void rcu_bh_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); - -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - -/* * Show the state of the grace-period kthreads. */ void show_rcu_gp_kthreads(void) @@ -634,31 +507,28 @@ void show_rcu_gp_kthreads(void) int cpu; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - pr_info("%s: wait state: %d ->state: %#lx\n", - rsp->name, rsp->gp_state, rsp->gp_kthread->state); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) + pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name, + rcu_state.gp_state, rcu_state.gp_kthread->state); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rsp->gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); - } + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); } - /* sched_show_task(rsp->gp_kthread); */ } + /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); @@ -668,34 +538,25 @@ EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) { - struct rcu_state *rsp = NULL; - switch (test_type) { case RCU_FLAVOR: - rsp = rcu_state_p; - break; case RCU_BH_FLAVOR: - rsp = &rcu_bh_state; - break; case RCU_SCHED_FLAVOR: - rsp = &rcu_sched_state; + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; default: break; } - if (rsp == NULL) - return; - *flags = READ_ONCE(rsp->gp_flags); - *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the specified rcu_state structure. + * Return the root node of the rcu_state structure. */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +static struct rcu_node *rcu_get_root(void) { - return &rsp->node[0]; + return &rcu_state.node[0]; } /* @@ -708,28 +569,25 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) */ static void rcu_eqs_enter(bool user) { - struct rcu_state *rsp; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - rdtp = this_cpu_ptr(&rcu_dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (rdtp->dynticks_nesting != 1) { - rdtp->dynticks_nesting--; + rdp->dynticks_nesting == 0); + if (rdp->dynticks_nesting != 1) { + rdp->dynticks_nesting--; return; } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - do_nocb_deferred_wakeup(rdp); - } + rdp = this_cpu_ptr(&rcu_data); + do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); - WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + rcu_preempt_deferred_qs(current); + WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); } @@ -770,44 +628,61 @@ void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * +/* * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit(), be sure to test + * If you add or remove a call to rcu_nmi_exit_common(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +static __always_inline void rcu_nmi_exit_common(bool irq) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (rdtp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ - rdtp->dynticks_nmi_nesting - 2); + if (rdp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ + rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + + if (irq) + rcu_prepare_for_idle(); + rcu_dynticks_eqs_enter(); + + if (irq) + rcu_dynticks_task_enter(); +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * @irq: Is this call from rcu_irq_exit? + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + rcu_nmi_exit_common(false); } /** @@ -831,14 +706,8 @@ void rcu_nmi_exit(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - lockdep_assert_irqs_disabled(); - if (rdtp->dynticks_nmi_nesting == 1) - rcu_prepare_for_idle(); - rcu_nmi_exit(); - if (rdtp->dynticks_nmi_nesting == 0) - rcu_dynticks_task_enter(); + rcu_nmi_exit_common(true); } /* @@ -866,24 +735,25 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit(bool user) { - struct rcu_dynticks *rdtp; + struct rcu_data *rdp; long oldval; lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; + rdp = this_cpu_ptr(&rcu_data); + oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { - rdtp->dynticks_nesting++; + rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(rdtp->dynticks_nesting, 1); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(rdp->dynticks_nesting, 1); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting); + WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** @@ -921,24 +791,25 @@ void rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter - inform RCU of entry to NMI context + * rcu_nmi_enter_common - inform RCU of entry to NMI context + * @irq: Is this call from rcu_irq_enter? * - * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and - * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and + * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter(), be sure to test + * If you add or remove a call to rcu_nmi_enter_common(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_enter(void) +static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; /* Complain about underflow. */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); /* * If idle from RCU viewpoint, atomically increment ->dynticks @@ -949,18 +820,34 @@ void rcu_nmi_enter(void) * period (observation due to Andy Lutomirski). */ if (rcu_dynticks_curr_cpu_in_eqs()) { + + if (irq) + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + + if (irq) + rcu_cleanup_after_idle(); + incby = 1; } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdtp->dynticks_nmi_nesting + incby); + rdp->dynticks_nmi_nesting, + rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdp->dynticks_nmi_nesting + incby); barrier(); } /** + * rcu_nmi_enter - inform RCU of entry to NMI context + */ +void rcu_nmi_enter(void) +{ + rcu_nmi_enter_common(false); +} + +/** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * * Enter an interrupt handler, which might possibly result in exiting @@ -984,14 +871,8 @@ void rcu_nmi_enter(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - lockdep_assert_irqs_disabled(); - if (rdtp->dynticks_nmi_nesting == 0) - rcu_dynticks_task_exit(); - rcu_nmi_enter(); - if (rdtp->dynticks_nmi_nesting == 1) - rcu_cleanup_after_idle(); + rcu_nmi_enter_common(true); } /* @@ -1043,7 +924,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t) cpu = task_cpu(t); if (!task_curr(t)) return; /* This task is not running on that CPU. */ - smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); + smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) @@ -1054,11 +935,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t) * Disable preemption to avoid false positives that could otherwise * happen due to the current CPU number being sampled, this task being * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. Because there are - * multiple flavors of RCU, and because this function can be called in the - * midst of updating the flavors while a given CPU coming online or going - * offline, it is necessary to check all flavors. If any of the flavors - * believe that given CPU is online, it is considered to be online. + * then determining that its old CPU is now offline. * * Disable checking if in an NMI handler because we cannot safely * report errors from NMI handlers anyway. In addition, it is OK to use @@ -1069,39 +946,22 @@ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; + bool ret = false; if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { - preempt_enable(); - return true; - } - } + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) + ret = true; preempt_enable(); - return false; + return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ -/** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle - * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; -} - /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow @@ -1126,9 +986,9 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1177,35 +1037,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); - rdp->dynticks_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; - } - - /* - * Has this CPU encountered a cond_resched() since the beginning - * of the grace period? For this to be the case, the CPU has to - * have noticed the current grace period. This might not be the - * case for nohz_full CPUs looping in the kernel. - */ - jtsq = jiffies_till_sched_qs; - ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); + if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; - } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { - /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ - smp_store_release(ruqp, true); } /* If waiting too long on an offline CPU, complain. */ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rdp->rsp->gp_start + HZ)) { + time_after(jiffies, rcu_state.gp_start + HZ)) { bool onl; struct rcu_node *rnp1; @@ -1226,39 +1066,56 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* * A CPU running for an extended time within the kernel can - * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, - * even context-switching back and forth between a pair of - * in-kernel CPU-bound tasks cannot advance grace periods. - * So if the grace period is old enough, make the CPU pay attention. - * Note that the unsynchronized assignments to the per-CPU - * rcu_need_heavy_qs variable are safe. Yes, setting of - * bits can be lost, but they will be set again on the next - * force-quiescent-state pass. So lost bit sets do not result - * in incorrect behavior, merely in a grace period lasting - * a few jiffies longer than it might otherwise. Because - * there are at most four threads involved, and because the - * updates are only once every few jiffies, the probability of - * lossage (and thus of slight grace-period extension) is - * quite low. + * delay RCU grace periods: (1) At age jiffies_to_sched_qs, + * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set + * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the + * unsynchronized assignments to the per-CPU rcu_need_heavy_qs + * variable are safe because the assignments are repeated if this + * CPU failed to pass through a quiescent state. This code + * also checks .jiffies_resched in case jiffies_to_sched_qs + * is set way high. */ - rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + jtsq = READ_ONCE(jiffies_to_sched_qs); + ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); + rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && - (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched))) { + (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || + time_after(jiffies, rcu_state.jiffies_resched))) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ + } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) { + WRITE_ONCE(*ruqp, true); } /* - * If more than halfway to RCU CPU stall-warning time, do a - * resched_cpu() to try to loosen things up a bit. Also check to - * see if the CPU is getting hammered with interrupts, but only - * once per grace period, just to keep the IPIs down to a dull roar. + * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * The above code handles this, but only for straight cond_resched(). + * And some in-kernel loops check need_resched() before calling + * cond_resched(), which defeats the above code for CPUs that are + * running in-kernel with scheduling-clock interrupts disabled. + * So hit them over the head with the resched_cpu() hammer! */ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { + if (tick_nohz_full_cpu(rdp->cpu) && + time_after(jiffies, + READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { resched_cpu(rdp->cpu); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); + } + + /* + * If more than halfway to RCU CPU stall-warning time, invoke + * resched_cpu() more frequently to try to loosen things up a bit. + * Also check to see if the CPU is getting hammered with interrupts, + * but only once per grace period, just to keep the IPIs down to + * a dull roar. + */ + if (time_after(jiffies, rcu_state.jiffies_resched)) { + if (time_after(jiffies, + READ_ONCE(rdp->last_fqs_resched) + jtsq)) { + resched_cpu(rdp->cpu); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); + } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { @@ -1272,17 +1129,17 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(struct rcu_state *rsp) +static void record_gp_stall_check_time(void) { unsigned long j = jiffies; unsigned long j1; - rsp->gp_start = j; + rcu_state.gp_start = j; j1 = rcu_jiffies_till_stall_check(); /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ - rsp->jiffies_resched = j + j1 / 2; - rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } /* @@ -1298,25 +1155,23 @@ static const char *gp_state_getname(short gs) /* * Complain about starvation of grace-period kthread. */ -static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +static void rcu_check_gp_kthread_starvation(void) { - unsigned long gpa; + struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies; - gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) { + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rsp->name, j - gpa, - (long)rcu_seq_current(&rsp->gp_seq), - rsp->gp_flags, - gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0, - rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); - if (rsp->gp_kthread) { + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + rcu_state.gp_flags, + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(rsp->gp_kthread); - wake_up_process(rsp->gp_kthread); + sched_show_task(gpk); + wake_up_process(gpk); } } } @@ -1327,13 +1182,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +static void rcu_dump_cpu_stacks(void) { int cpu; unsigned long flags; struct rcu_node *rnp; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) @@ -1347,19 +1202,20 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) * If too much time has passed in the current grace period, and if * so configured, go kick the relevant kthreads. */ -static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +static void rcu_stall_kick_kthreads(void) { unsigned long j; if (!rcu_kick_kthreads) return; - j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread && - (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rsp->gp_kthread); - WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); } } @@ -1369,18 +1225,18 @@ static void panic_on_rcu_stall(void) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq) { int cpu; unsigned long flags; unsigned long gpa; unsigned long j; int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); if (rcu_cpu_stall_suppress) return; @@ -1389,15 +1245,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(rsp, cpu); + print_cpu_stall_info(cpu); ndetected++; } } @@ -1406,52 +1262,52 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rcu_seq_current(&rsp->gp_seq), totqlen); + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { - rcu_dump_cpu_stacks(rsp); + rcu_dump_cpu_stacks(); /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_print_detail_task_stall(); } else { - if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rsp->name, j - gpa, j, gpa, - jiffies_till_next_fqs, - rcu_get_root(rsp)->qsmask); + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) - WRITE_ONCE(rsp->jiffies_stall, + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - rcu_check_gp_kthread_starvation(rsp); + rcu_check_gp_kthread_starvation(); panic_on_rcu_stall(); - force_quiescent_state(rsp); /* Kick them all. */ + force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(struct rcu_state *rsp) +static void print_cpu_stall(void) { int cpu; unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); if (rcu_cpu_stall_suppress) return; @@ -1460,27 +1316,27 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); print_cpu_stall_info_begin(); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info(smp_processor_id()); raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rsp->gp_start, - (long)rcu_seq_current(&rsp->gp_seq), totqlen); + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - rcu_check_gp_kthread_starvation(rsp); + rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(rsp); + rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) - WRITE_ONCE(rsp->jiffies_stall, + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1493,10 +1349,11 @@ static void print_cpu_stall(struct rcu_state *rsp) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - resched_cpu(smp_processor_id()); + set_tsk_need_resched(current); + set_preempt_need_resched(); } -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +static void check_cpu_stall(struct rcu_data *rdp) { unsigned long gs1; unsigned long gs2; @@ -1507,54 +1364,55 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress(rsp)) + !rcu_gp_in_progress()) return; - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); j = jiffies; /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, - * then rsp->gp_start, and finally another copy of rsp->gp_seq. - * These values are updated in the opposite order with memory - * barriers (or equivalent) during grace-period initialization - * and cleanup. Now, a false positive can occur if we get an new - * value of rsp->gp_start and a old value of rsp->jiffies_stall. - * But given the memory barriers, the only way that this can happen - * is if one grace period ends and another starts between these - * two fetches. This is detected by comparing the second fetch - * of rsp->gp_seq with the previous fetch from rsp->gp_seq. + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. * - * Given this check, comparisons of jiffies, rsp->jiffies_stall, - * and rsp->gp_start suffice to forestall false positives. + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. */ - gs1 = READ_ONCE(rsp->gp_seq); + gs1 = READ_ONCE(rcu_state.gp_seq); smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rsp->jiffies_stall); + js = READ_ONCE(rcu_state.jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rsp->gp_start); + gps = READ_ONCE(rcu_state.gp_start); smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rsp->gp_seq); + gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress(rsp) && + if (rcu_gp_in_progress() && (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rsp->jiffies_stall, js, jn) == js) { + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rsp); + print_cpu_stall(); - } else if (rcu_gp_in_progress(rsp) && + } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rsp->jiffies_stall, js, jn) == js) { + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gs2); + print_other_cpu_stall(gs2); } } @@ -1569,17 +1427,14 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_cpu_stall_reset(void) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, + trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, rnp->level, rnp->grplo, rnp->grphi, s); } @@ -1603,7 +1458,6 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { bool ret = false; - struct rcu_state *rsp = rdp->rsp; struct rcu_node *rnp; /* @@ -1647,18 +1501,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } /* If GP already in progress, just leave, otherwise start one. */ - if (rcu_gp_in_progress(rsp)) { + if (rcu_gp_in_progress()) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); - rsp->gp_req_activity = jiffies; - if (!rsp->gp_kthread) { + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); + rcu_state.gp_req_activity = jiffies; + if (!rcu_state.gp_kthread) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1675,10 +1529,10 @@ unlock_out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_node *rnp) { bool needmore; - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); if (!needmore) @@ -1689,19 +1543,18 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* - * Awaken the grace-period kthread for the specified flavor of RCU. - * Don't do a self-awaken, and don't bother awakening when there is - * nothing for the grace-period kthread to do (as in several CPUs - * raced to awaken, and we lost), and finally don't try to awaken - * a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken, and don't + * bother awakening when there is nothing for the grace-period kthread + * to do (as in several CPUs raced to awaken, and we lost), and finally + * don't try to awaken a kthread that has not yet been created. */ -static void rcu_gp_kthread_wake(struct rcu_state *rsp) +static void rcu_gp_kthread_wake(void) { - if (current == rsp->gp_kthread || - !READ_ONCE(rsp->gp_flags) || - !rsp->gp_kthread) + if (current == rcu_state.gp_kthread || + !READ_ONCE(rcu_state.gp_flags) || + !rcu_state.gp_kthread) return; - swake_up_one(&rsp->gp_wq); + swake_up_one(&rcu_state.gp_wq); } /* @@ -1716,8 +1569,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) * * The caller must hold rnp->lock with interrupts disabled. */ -static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long gp_seq_req; bool ret = false; @@ -1738,15 +1590,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq); if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } @@ -1757,25 +1609,24 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * that a new grace-period request be made, invokes rcu_accelerate_cbs() * while holding the leaf rcu_node structure's ->lock. */ -static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, - struct rcu_node *rnp, +static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; bool needwake; lockdep_assert_irqs_disabled(); - c = rcu_seq_snap(&rsp->gp_seq); + c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; } raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } /* @@ -1788,8 +1639,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, * * The caller must hold rnp->lock with interrupts disabled. */ -static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); @@ -1804,7 +1654,7 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ - return rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rnp, rdp); } /* @@ -1813,8 +1663,7 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * structure corresponding to the current CPU, and must have irqs disabled. * Returns true if the grace-period kthread needs to be awakened. */ -static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; bool need_gp; @@ -1827,10 +1676,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { - ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); + ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { - ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ + ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1841,10 +1690,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); + trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); } @@ -1856,7 +1704,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, return ret; } -static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_data *rdp) { unsigned long flags; bool needwake; @@ -1870,16 +1718,16 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } - needwake = __note_gp_changes(rsp, rnp, rdp); + needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } -static void rcu_gp_slow(struct rcu_state *rsp, int delay) +static void rcu_gp_slow(int delay) { if (delay > 0 && - !(rcu_seq_ctr(rsp->gp_seq) % + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } @@ -1887,24 +1735,24 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(void) { unsigned long flags; unsigned long oldmask; unsigned long mask; struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - if (!READ_ONCE(rsp->gp_flags)) { + if (!READ_ONCE(rcu_state.gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq_rcu_node(rnp); return false; } - WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ + WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */ - if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { + if (WARN_ON_ONCE(rcu_gp_in_progress())) { /* * Grace period already in progress, don't start another. * Not supposed to be able to happen. @@ -1914,10 +1762,10 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* Advance to a new grace period and initialize state. */ - record_gp_stall_check_time(rsp); + record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ - rcu_seq_start(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); + rcu_seq_start(&rcu_state.gp_seq); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1926,15 +1774,15 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ - rsp->gp_state = RCU_GP_ONOFF; - rcu_for_each_leaf_node(rsp, rnp) { - spin_lock(&rsp->ofl_lock); + rcu_state.gp_state = RCU_GP_ONOFF; + rcu_for_each_leaf_node(rnp) { + raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); - spin_unlock(&rsp->ofl_lock); + raw_spin_unlock(&rcu_state.ofl_lock); continue; } @@ -1970,45 +1818,45 @@ static bool rcu_gp_init(struct rcu_state *rsp) } raw_spin_unlock_irq_rcu_node(rnp); - spin_unlock(&rsp->ofl_lock); + raw_spin_unlock(&rcu_state.ofl_lock); } - rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ + rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first order, - * starting from the root rcu_node structure, relying on the layout - * of the tree within the rsp->node[] array. Note that other CPUs - * will access only the leaves of the hierarchy, thus seeing that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure, relying on the + * layout of the tree within the rcu_state.node[] array. Note that + * other CPUs will access only the leaves of the hierarchy, thus + * seeing that no grace period is in progress, at least until the + * corresponding leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rsp->gp_state = RCU_GP_INIT; - rcu_for_each_node_breadth_first(rsp, rnp) { - rcu_gp_slow(rsp, gp_init_delay); + rcu_state.gp_state = RCU_GP_INIT; + rcu_for_each_node_breadth_first(rnp) { + rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rsp, rnp); + rdp = this_cpu_ptr(&rcu_data); + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); + WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq); if (rnp == rdp->mynode) - (void)__note_gp_changes(rsp, rnp, rdp); + (void)__note_gp_changes(rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, + trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); /* Quiescent states for tasks on any now-offline CPUs. */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); else raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); } return true; @@ -2018,12 +1866,12 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ -static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +static bool rcu_gp_fqs_check_wake(int *gfp) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); /* Someone like call_rcu() requested a force-quiescent-state scan. */ - *gfp = READ_ONCE(rsp->gp_flags); + *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; @@ -2037,45 +1885,110 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) /* * Do one round of quiescent-state forcing. */ -static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) +static void rcu_gp_fqs(bool first_time) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); - WRITE_ONCE(rsp->gp_activity, jiffies); - rsp->n_force_qs++; + WRITE_ONCE(rcu_state.gp_activity, jiffies); + rcu_state.n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); + force_qs_rnp(dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ - if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq_rcu_node(rnp); - WRITE_ONCE(rsp->gp_flags, - READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq_rcu_node(rnp); } } /* + * Loop doing repeated quiescent-state forcing until the grace period ends. + */ +static void rcu_gp_fqs_loop(void) +{ + bool first_gp_fqs; + int gf; + unsigned long j; + int ret; + struct rcu_node *rnp = rcu_get_root(); + + first_gp_fqs = true; + j = READ_ONCE(jiffies_till_first_fqs); + ret = 0; + for (;;) { + if (!ret) { + rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, + jiffies + 3 * j); + } + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqswait")); + rcu_state.gp_state = RCU_GP_WAIT_FQS; + ret = swait_event_idle_timeout_exclusive( + rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_state.gp_state = RCU_GP_DOING_FQS; + /* Locking provides needed memory barriers. */ + /* If grace period done, leave loop. */ + if (!READ_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + /* If time for quiescent-state forcing, do it. */ + if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + (gf & RCU_GP_FLAG_FQS)) { + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqsstart")); + rcu_gp_fqs(first_gp_fqs); + first_gp_fqs = false; + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqsend")); + cond_resched_tasks_rcu_qs(); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = READ_ONCE(jiffies_till_next_fqs); + } else { + /* Deal with stray signal. */ + cond_resched_tasks_rcu_qs(); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WARN_ON(signal_pending(current)); + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqswaitsig")); + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rcu_state.jiffies_force_qs)) + j = 1; + else + j = rcu_state.jiffies_force_qs - j; + } + } +} + +/* * Clean up after the old grace period. */ -static void rcu_gp_cleanup(struct rcu_state *rsp) +static void rcu_gp_cleanup(void) { unsigned long gp_duration; bool needgp = false; unsigned long new_gp_seq; struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); struct swait_queue_head *sq; - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; + gp_duration = jiffies - rcu_state.gp_start; + if (gp_duration > rcu_state.gp_max) + rcu_state.gp_max = gp_duration; /* * We know the grace period is complete, but to everyone else @@ -2096,48 +2009,50 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * the rcu_node structures before the beginning of the next grace * period is recorded in any of the rcu_node structures. */ - new_gp_seq = rsp->gp_seq; + new_gp_seq = rcu_state.gp_seq; rcu_seq_end(&new_gp_seq); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rsp, rnp, 10); + dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) - needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; + needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; + needgp = rcu_future_gp_cleanup(rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - rcu_gp_slow(rsp, gp_cleanup_delay); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + rcu_gp_slow(gp_cleanup_delay); } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ + rnp = rcu_get_root(); + raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ /* Declare grace period done. */ - rcu_seq_end(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); - rsp->gp_state = RCU_GP_IDLE; + rcu_seq_end(&rcu_state.gp_seq); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); + rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } /* Advance CBs to reduce false positives below. */ - if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - rsp->gp_req_activity = jiffies; - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), + if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); + rcu_state.gp_req_activity = jiffies; + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("newreq")); } else { - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); + WRITE_ONCE(rcu_state.gp_flags, + rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); } @@ -2145,116 +2060,60 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* * Body of kthread that handles grace periods. */ -static int __noreturn rcu_gp_kthread(void *arg) +static int __noreturn rcu_gp_kthread(void *unused) { - bool first_gp_fqs; - int gf; - unsigned long j; - int ret; - struct rcu_state *rsp = arg; - struct rcu_node *rnp = rcu_get_root(rsp); - rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. */ for (;;) { - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("reqwait")); - rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); - rsp->gp_state = RCU_GP_DONE_GPS; + rcu_state.gp_state = RCU_GP_WAIT_GPS; + swait_event_idle_exclusive(rcu_state.gp_wq, + READ_ONCE(rcu_state.gp_flags) & + RCU_GP_FLAG_INIT); + rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ - if (rcu_gp_init(rsp)) + if (rcu_gp_init()) break; cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ - first_gp_fqs = true; - j = jiffies_till_first_fqs; - ret = 0; - for (;;) { - if (!ret) { - rsp->jiffies_force_qs = jiffies + j; - WRITE_ONCE(rsp->jiffies_kick_kthreads, - jiffies + 3 * j); - } - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqswait")); - rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, - rcu_gp_fqs_check_wake(rsp, &gf), j); - rsp->gp_state = RCU_GP_DOING_FQS; - /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ - if (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) - break; - /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || - (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqsstart")); - rcu_gp_fqs(rsp, first_gp_fqs); - first_gp_fqs = false; - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqsend")); - cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - ret = 0; /* Force full wait till next FQS. */ - j = jiffies_till_next_fqs; - } else { - /* Deal with stray signal. */ - cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqswaitsig")); - ret = 1; /* Keep old FQS timing. */ - j = jiffies; - if (time_after(jiffies, rsp->jiffies_force_qs)) - j = 1; - else - j = rsp->jiffies_force_qs - j; - } - } + rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rsp->gp_state = RCU_GP_CLEANUP; - rcu_gp_cleanup(rsp); - rsp->gp_state = RCU_GP_CLEANED; + rcu_state.gp_state = RCU_GP_CLEANUP; + rcu_gp_cleanup(); + rcu_state.gp_state = RCU_GP_CLEANED; } } /* - * Report a full set of quiescent states to the specified rcu_state data - * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period - * kthread if another grace period is required. Whether we wake - * the grace-period kthread or it awakens itself for the next round - * of quiescent-state forcing, that kthread will clean up after the - * just-completed grace period. Note that the caller must hold rnp->lock, - * which is released before return. + * Report a full set of quiescent states to the rcu_state data structure. + * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if + * another grace period is required. Whether we wake the grace-period + * kthread or it awakens itself for the next round of quiescent-state + * forcing, that kthread will clean up after the just-completed grace + * period. Note that the caller must hold rnp->lock, which is released + * before return. */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +static void rcu_report_qs_rsp(unsigned long flags) + __releases(rcu_get_root()->lock) { - raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - rcu_gp_kthread_wake(rsp); + raw_lockdep_assert_held_rcu_node(rcu_get_root()); + WARN_ON_ONCE(!rcu_gp_in_progress()); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); + rcu_gp_kthread_wake(); } /* @@ -2271,9 +2130,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * disabled. This allows propagating quiescent state due to resumed tasks * during grace-period initialization. */ -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long gps, unsigned long flags) +static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, + unsigned long gps, unsigned long flags) __releases(rnp->lock) { unsigned long oldmask = 0; @@ -2296,7 +2154,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, + trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); @@ -2326,19 +2184,18 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * state for this grace period. Invoke rcu_report_qs_rsp() * to clean up and start the next grace period if one is needed. */ - rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ + rcu_report_qs_rsp(flags); /* releases rnp->lock. */ } /* * Record a quiescent state for all tasks that were previously queued * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with + * RCU grace period. The caller must hold the corresponding rnp->lock with * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ static void __maybe_unused -rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2346,8 +2203,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || - WARN_ON_ONCE(rsp != rcu_state_p) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2361,7 +2217,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, * Only one rcu_node structure in the tree, so don't * try to report up to its nonexistent parent! */ - rcu_report_qs_rsp(rsp, flags); + rcu_report_qs_rsp(flags); return; } @@ -2370,7 +2226,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ - rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); + rcu_report_qs_rnp(mask, rnp_p, gps, flags); } /* @@ -2378,7 +2234,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) +rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2397,7 +2253,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2411,12 +2266,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } } @@ -2427,10 +2282,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * quiescent state for this grace period, and record that fact if so. */ static void -rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +rcu_check_quiescent_state(struct rcu_data *rdp) { /* Check for grace-period ends and beginnings. */ - note_gp_changes(rsp, rdp); + note_gp_changes(rdp); /* * Does this CPU still need to do its part for current grace period? @@ -2450,24 +2305,26 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp); + rcu_report_qs_rdp(rdp->cpu, rdp); } /* - * Trace the fact that this CPU is going offline. + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +int rcutree_dying_cpu(unsigned int cpu) { RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; + return 0; RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) - trace_rcu_grace_period(rsp->name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); + return 0; } /* @@ -2521,23 +2378,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) * There can only be one CPU hotplug operation at a time, so no need for * explicit locking. */ -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) +int rcutree_dead_cpu(unsigned int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; + return 0; /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); + /* Do any needed no-CB deferred wakeups from this CPU. */ + do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); + return 0; } /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_data *rdp) { unsigned long flags; struct rcu_head *rhp; @@ -2546,10 +2406,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { - trace_rcu_batch_start(rsp->name, + trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); - trace_rcu_batch_end(rsp->name, 0, + trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2564,7 +2424,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + trace_rcu_batch_start(rcu_state.name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); @@ -2573,7 +2434,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rsp->name, rhp)) + if (__rcu_reclaim(rcu_state.name, rhp)) rcu_cblist_dequeued_lazy(&rcl); /* * Stop only if limit reached and CPU has something to do. @@ -2587,7 +2448,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); count = -rcl.len; - trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), + trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); /* Update counts and requeue any remaining callbacks. */ @@ -2603,7 +2464,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->n_force_qs_snap = rcu_state.n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; @@ -2631,37 +2492,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); - increment_cpu_stall_ticks(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. - * - * No memory barrier is required here because both - * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local - * variables that other CPUs neither access nor modify, - * at least not while the corresponding CPU is online. - */ - - rcu_sched_qs(); - rcu_bh_qs(); - rcu_note_voluntary_context_switch(current); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so note it. - */ - - rcu_bh_qs(); + raw_cpu_inc(rcu_data.ticks_this_gp); + /* The load-acquire pairs with the store-release setting to true. */ + if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { + /* Idle and userspace execution already are quiescent states. */ + if (!rcu_is_cpu_rrupt_from_idle() && !user) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } + __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_preempt_check_callbacks(); + rcu_flavor_check_callbacks(user); if (rcu_pending()) invoke_rcu_core(); @@ -2675,20 +2516,19 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) +static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; unsigned long mask; struct rcu_node *rnp; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (rcu_state_p == &rcu_sched_state || - rsp != rcu_state_p || + if (!IS_ENABLED(CONFIG_PREEMPT) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2705,13 +2545,13 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu))) + if (f(per_cpu_ptr(&rcu_data, cpu))) mask |= bit; } } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2723,7 +2563,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp) +static void force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2731,9 +2571,9 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = __this_cpu_read(rsp->rda->mynode); + rnp = __this_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { - ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); @@ -2741,18 +2581,19 @@ static void force_quiescent_state(struct rcu_state *rsp) return; rnp_old = rnp; } - /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ + /* rnp_old == rcu_get_root(), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); - if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } - WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } /* @@ -2760,30 +2601,29 @@ static void force_quiescent_state(struct rcu_state *rsp) * RCU to come out of its idle mode. */ static void -rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) { const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); static atomic_t warned = ATOMIC_INIT(0); - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) return; raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; - if (rcu_gp_in_progress(rsp) || + if (rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -2793,21 +2633,21 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; - if (rcu_gp_in_progress(rsp) || + if (rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rsp->gp_req_activity + gpssdelay) || - time_before(j, rsp->gp_activity + gpssdelay) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rsp->gp_seq), + __func__, (long)READ_ONCE(rcu_state.gp_seq), (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rsp->gp_req_activity, j - rsp->gp_activity, - rsp->gp_flags, rsp->gp_state, rsp->name, - rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); + j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, + rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) raw_spin_unlock_rcu_node(rnp_root); @@ -2815,69 +2655,65 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * This does the RCU core processing work for the specified rcu_state - * and rcu_data structures. This may be called only from the CPU to - * whom the rdp belongs. + * This does the RCU core processing work for the specified rcu_data + * structures. This may be called only from the CPU to whom the rdp + * belongs. */ -static void -__rcu_process_callbacks(struct rcu_state *rsp) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + if (cpu_is_offline(smp_processor_id())) + return; + trace_rcu_utilization(TPS("Start RCU core")); WARN_ON_ONCE(!rdp->beenonline); + /* Report any deferred quiescent states if preemption enabled. */ + if (!(preempt_count() & PREEMPT_MASK)) { + rcu_preempt_deferred_qs(current); + } else if (rcu_preempt_need_deferred_qs(current)) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } + /* Update RCU state based on any recent quiescent states. */ - rcu_check_quiescent_state(rsp, rdp); + rcu_check_quiescent_state(rdp); /* No grace period and unregistered callbacks? */ - if (!rcu_gp_in_progress(rsp) && + if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + rcu_accelerate_cbs_unlocked(rnp, rdp); local_irq_restore(flags); } - rcu_check_gp_start_stall(rsp, rnp, rdp); + rcu_check_gp_start_stall(rnp, rdp); /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rsp, rdp); + invoke_rcu_callbacks(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); -} - -/* - * Do RCU core processing for the current CPU. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -{ - struct rcu_state *rsp; - - if (cpu_is_offline(smp_processor_id())) - return; - trace_rcu_utilization(TPS("Start RCU core")); - for_each_rcu_flavor(rsp) - __rcu_process_callbacks(rsp); trace_rcu_utilization(TPS("End RCU core")); } /* - * Schedule RCU callback invocation. If the specified type of RCU - * does not support RCU priority boosting, just do a direct call, - * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with softirqs disabled, the - * rcu_cpu_kthread_task cannot disappear out from under us. + * Schedule RCU callback invocation. If the running implementation of RCU + * does not support RCU priority boosting, just do a direct call, otherwise + * wake up the per-CPU kernel kthread. Note that because we are running + * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. */ -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +static void invoke_rcu_callbacks(struct rcu_data *rdp) { if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rsp->boost)) { - rcu_do_batch(rsp, rdp); + if (likely(!rcu_state.boost)) { + rcu_do_batch(rdp); return; } invoke_rcu_callbacks_kthread(); @@ -2892,8 +2728,8 @@ static void invoke_rcu_core(void) /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, - struct rcu_head *head, unsigned long flags) +static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + unsigned long flags) { /* * If called from an extended quiescent state, invoke the RCU @@ -2917,18 +2753,18 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - note_gp_changes(rsp, rdp); + note_gp_changes(rdp); /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); + if (!rcu_gp_in_progress()) { + rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && + if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(rsp); - rdp->n_force_qs_snap = rsp->n_force_qs; + force_quiescent_state(); + rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } @@ -2944,12 +2780,11 @@ static void rcu_leak_callback(struct rcu_head *rhp) /* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, - struct rcu_state *rsp, int cpu, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -2971,14 +2806,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, head->func = func; head->next = NULL; local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (likely(rdp->mynode)) { /* Post-boot, so this should be for a no-CBs CPU. */ offline = !__call_rcu_nocb(rdp, head, lazy, flags); @@ -3001,72 +2836,60 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rcu_idle_count_callbacks_posted(); if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + trace_rcu_kfree_callback(rcu_state.name, head, + (unsigned long)func, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, + trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ - __call_rcu_core(rsp, rdp, head, flags); + __call_rcu_core(rdp, head, flags); local_irq_restore(flags); } /** - * call_rcu_sched() - Queue an RCU for invocation after sched grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_sched() assumes - * that the read-side critical sections end on enabling of preemption - * or on voluntary preemption. - * RCU read-side critical sections are delimited by: - * - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - * - anything that disables preemption. - * - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/** - * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by: - * - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_bh_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and + * may be nested. In addition, regions of code across which interrupts, + * preemption, or softirqs have been disabled also serve as RCU read-side + * critical sections. This includes hardware interrupt handlers, softirq + * handlers, and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + __call_rcu(head, func, -1, 0); +} +EXPORT_SYMBOL_GPL(call_rcu); /* * Queue an RCU callback for lazy invocation after a grace period. @@ -3075,110 +2898,12 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * callbacks in the list of pending callbacks. Until then, this * function may only be called from __kfree_rcu(). */ -void kfree_call_rcu(struct rcu_head *head, - rcu_callback_t func) +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, rcu_state_p, -1, 1); + __call_rcu(head, func, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); -/* - * Because a context switch is a grace period for RCU-sched and RCU-bh, - * any blocking grace-period wait automatically implies a grace period - * if there is only one CPU online at any point time during execution - * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds - * some overhead: RCU still operates correctly. - */ -static int rcu_blocking_is_gp(void) -{ - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; -} - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * non-threaded hardware-interrupt handlers, in progress on entry will - * have completed before this primitive returns. However, this does not - * guarantee that softirq handlers will have completed, since in some - * kernels, these handlers can run in process context, and can block. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_sched() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-sched read-side critical section whose beginning - * preceded the call to synchronize_sched(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_sched() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_sched() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_sched(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_sched(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_sched_expedited(); - else - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. - */ -void synchronize_rcu_bh(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_bh_expedited(); - else - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3193,7 +2918,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state_p->gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3213,70 +2938,30 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) + if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) synchronize_rcu(); else smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -/** - * get_state_synchronize_sched - Snapshot current RCU-sched state - * - * Returns a cookie that is used by a later call to cond_synchronize_sched() - * to determine whether or not a full grace period has elapsed in the - * meantime. - */ -unsigned long get_state_synchronize_sched(void) -{ - /* - * Any prior manipulation of RCU-protected data must happen - * before the load from ->gp_seq. - */ - smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_sched_state.gp_seq); -} -EXPORT_SYMBOL_GPL(get_state_synchronize_sched); - -/** - * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period - * - * @oldstate: return value from earlier call to get_state_synchronize_sched() - * - * If a full RCU-sched grace period has elapsed since the earlier call to - * get_state_synchronize_sched(), just return. Otherwise, invoke - * synchronize_sched() to wait for a full grace period. - * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. - */ -void cond_synchronize_sched(unsigned long oldstate) -{ - if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) - synchronize_sched(); - else - smp_mb(); /* Ensure GP ends before subsequent accesses. */ -} -EXPORT_SYMBOL_GPL(cond_synchronize_sched); - /* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, for the specified type of RCU, returning 1 if so. - * The checks are in order of increasing expense: checks that can be - * carried out against CPU-local state are performed first. However, - * we must check for CPU stalls first, else we might not get a chance. + * Check to see if there is any immediate RCU-related work to be done by + * the current CPU, returning 1 if so and zero otherwise. The checks are + * in order of increasing expense: checks that can be carried out against + * CPU-local state are performed first. However, we must check for CPU + * stalls first, else we might not get a chance. */ -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +static int rcu_pending(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rsp, rdp); + check_cpu_stall(rdp); /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu(rsp)) + if (rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ @@ -3288,7 +2973,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (!rcu_gp_in_progress(rsp) && + if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3307,21 +2992,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -static int rcu_pending(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) - return 1; - return 0; -} - -/* * Return true if the specified CPU has any callback. If all_lazy is * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) @@ -3331,17 +3001,12 @@ static bool rcu_cpu_has_callbacks(bool *all_lazy) bool al = true; bool hc = false; struct rcu_data *rdp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_empty(&rdp->cblist)) - continue; + rdp = this_cpu_ptr(&rcu_data); + if (!rcu_segcblist_empty(&rdp->cblist)) { hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) al = false; - break; - } } if (all_lazy) *all_lazy = al; @@ -3349,81 +3014,80 @@ static bool rcu_cpu_has_callbacks(bool *all_lazy) } /* - * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ -static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, - int cpu, unsigned long done) +static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) { - trace_rcu_barrier(rsp->name, s, cpu, - atomic_read(&rsp->barrier_cpu_count), done); + trace_rcu_barrier(rcu_state.name, s, cpu, + atomic_read(&rcu_state.barrier_cpu_count), done); } /* - * RCU callback function for _rcu_barrier(). If we are last, wake - * up the task executing _rcu_barrier(). + * RCU callback function for rcu_barrier(). If we are last, wake + * up the task executing rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { - struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); - struct rcu_state *rsp = rdp->rsp; - - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, TPS("LastCB"), -1, - rsp->barrier_sequence); - complete(&rsp->barrier_completion); + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { + rcu_barrier_trace(TPS("LastCB"), -1, + rcu_state.barrier_sequence); + complete(&rcu_state.barrier_completion); } else { - _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); } } /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *type) +static void rcu_barrier_func(void *unused) { - struct rcu_state *rsp = type; - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); - _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); + rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { - atomic_inc(&rsp->barrier_cpu_count); + atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, + rcu_state.barrier_sequence); } } -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. */ -static void _rcu_barrier(struct rcu_state *rsp) +void rcu_barrier(void) { int cpu; struct rcu_data *rdp; - unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); - _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); + rcu_barrier_trace(TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rsp->barrier_mutex); + mutex_lock(&rcu_state.barrier_mutex); /* Did someone else do our work for us? */ - if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, - rsp->barrier_sequence); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + rcu_barrier_trace(TPS("EarlyExit"), -1, + rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ - mutex_unlock(&rsp->barrier_mutex); + mutex_unlock(&rcu_state.barrier_mutex); return; } /* Mark the start of the barrier operation. */ - rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); + rcu_seq_start(&rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3431,8 +3095,8 @@ static void _rcu_barrier(struct rcu_state *rsp) * (or preemption of this task). Exclude CPU-hotplug operations * to ensure that no offline CPU has callbacks queued. */ - init_completion(&rsp->barrier_completion); - atomic_set(&rsp->barrier_cpu_count, 1); + init_completion(&rcu_state.barrier_completion); + atomic_set(&rcu_state.barrier_cpu_count, 1); get_online_cpus(); /* @@ -3443,26 +3107,26 @@ static void _rcu_barrier(struct rcu_state *rsp) for_each_possible_cpu(cpu) { if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_is_nocb_cpu(cpu)) { - if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, - rsp->barrier_sequence); + if (!rcu_nocb_cpu_needs_barrier(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCB"), cpu, + rcu_state.barrier_sequence); } else { - _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("OnlineNoCB"), cpu, + rcu_state.barrier_sequence); smp_mb__before_atomic(); - atomic_inc(&rsp->barrier_cpu_count); + atomic_inc(&rcu_state.barrier_cpu_count); __call_rcu(&rdp->barrier_head, - rcu_barrier_callback, rsp, cpu, 0); + rcu_barrier_callback, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, - rsp->barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); + rcu_barrier_trace(TPS("OnlineQ"), cpu, + rcu_state.barrier_sequence); + smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); } else { - _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("OnlineNQ"), cpu, + rcu_state.barrier_sequence); } } put_online_cpus(); @@ -3471,37 +3135,20 @@ static void _rcu_barrier(struct rcu_state *rsp) * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rsp->barrier_completion); + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rsp->barrier_completion); + wait_for_completion(&rcu_state.barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); - rcu_seq_end(&rsp->barrier_sequence); + rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); + rcu_seq_end(&rcu_state.barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rsp->barrier_mutex); -} - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(&rcu_sched_state); + mutex_unlock(&rcu_state.barrier_mutex); } -EXPORT_SYMBOL_GPL(rcu_barrier_sched); +EXPORT_SYMBOL_GPL(rcu_barrier); /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the @@ -3535,46 +3182,46 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init -rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +rcu_boot_init_percpu_data(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); - rdp->rcu_ofl_gp_seq = rsp->gp_seq; + WARN_ON_ONCE(rdp->dynticks_nesting != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; - rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; - rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); } /* - * Initialize a CPU's per-CPU RCU data. Note that only one online or + * Invoked early in the CPU-online process, when pretty much all services + * are available. The incoming CPU is not present. + * + * Initializes a CPU's per-CPU RCU data. Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this * CPU cannot possibly have any RCU callbacks in flight yet. */ -static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +int rcutree_prepare_cpu(unsigned int cpu) { unsigned long flags; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rcu_get_root(); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ + rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ @@ -3589,25 +3236,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gp_seq = rnp->gp_seq; rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Invoked early in the CPU-online process, when pretty much all - * services are available. The incoming CPU is not present. - */ -int rcutree_prepare_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp); - rcu_prepare_kthreads(cpu); rcu_spawn_all_nocb_kthreads(cpu); @@ -3619,7 +3252,7 @@ int rcutree_prepare_cpu(unsigned int cpu) */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } @@ -3633,15 +3266,12 @@ int rcutree_online_cpu(unsigned int cpu) unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask |= rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) @@ -3660,15 +3290,12 @@ int rcutree_offline_cpu(unsigned int cpu) unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) @@ -3676,32 +3303,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -/* - * Near the end of the offline process. We do only tracing here. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - return 0; -} - -/* - * The outgoing CPU is gone and we are running elsewhere. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dead_cpu(cpu, rsp); - do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - static DEFINE_PER_CPU(int, rcu_cpu_started); /* @@ -3723,137 +3324,113 @@ void rcu_cpu_starting(unsigned int cpu) unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; if (per_cpu(rcu_cpu_started, cpu)) return; per_cpu(rcu_cpu_started, cpu) = 1; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; - oldmask = rnp->expmaskinitnext; - rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); - /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ - rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); - rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ - /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); - } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; + rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinitnext - * bit masks. + * The outgoing function has no further need of RCU, so remove it from + * the rcu_node tree's ->qsmaskinitnext bit masks. + * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +void rcu_report_dead(unsigned int cpu) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* QS for any half-done expedited grace period. */ + preempt_disable(); + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + preempt_enable(); + rcu_preempt_deferred_qs(current); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - spin_lock(&rsp->ofl_lock); + raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); - rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); + rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - spin_unlock(&rsp->ofl_lock); -} - -/* - * The outgoing function has no further need of RCU, so remove it from - * the list of CPUs that RCU must track. - * - * Note that this function is special in that it is invoked directly - * from the outgoing CPU rather than from the cpuhp_step mechanism. - * This is because this function must be invoked at a precise location. - */ -void rcu_report_dead(unsigned int cpu) -{ - struct rcu_state *rsp; - - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_idle_cpu(cpu, rsp); + raw_spin_unlock(&rcu_state.ofl_lock); per_cpu(rcu_cpu_started, cpu) = 0; } -/* Migrate the dead CPU's callbacks to the current CPU. */ -static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +/* + * The outgoing CPU has just passed through the dying-idle state, and we + * are being invoked from the CPU that was IPIed to continue the offline + * operation. Migrate the outgoing CPU's callbacks to the current CPU. + */ +void rcutree_migrate_callbacks(int cpu) { unsigned long flags; struct rcu_data *my_rdp; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp_root = rcu_get_root(); bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ local_irq_save(flags); - my_rdp = this_cpu_ptr(rsp->rda); + my_rdp = this_cpu_ptr(&rcu_data); if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { local_irq_restore(flags); return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || - rcu_advance_cbs(rsp, rnp_root, my_rdp); + needwake = rcu_advance_cbs(rnp_root, rdp) || + rcu_advance_cbs(rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } - -/* - * The outgoing CPU has just passed through the dying-idle state, - * and we are being invoked from the CPU that was IPIed to continue the - * offline operation. We need to migrate the outgoing CPU's callbacks. - */ -void rcutree_migrate_callbacks(int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_migrate_callbacks(cpu, rsp); -} #endif /* @@ -3881,14 +3458,13 @@ static int rcu_pm_notify(struct notifier_block *self, } /* - * Spawn the kthreads that handle each RCU flavor's grace periods. + * Spawn the kthreads that handle RCU's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; int kthread_prio_in = kthread_prio; struct rcu_node *rnp; - struct rcu_state *rsp; struct sched_param sp; struct task_struct *t; @@ -3908,19 +3484,17 @@ static int __init rcu_spawn_gp_kthread(void) kthread_prio, kthread_prio_in); rcu_scheduler_fully_active = 1; - for_each_rcu_flavor(rsp) { - t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); - BUG_ON(IS_ERR(t)); - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rsp->gp_kthread = t; - if (kthread_prio) { - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - wake_up_process(t); + t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + wake_up_process(t); rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); return 0; @@ -3947,9 +3521,9 @@ void rcu_scheduler_starting(void) } /* - * Helper function for rcu_init() that initializes one rcu_state structure. + * Helper function for rcu_init() that initializes the rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(void) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; @@ -3971,14 +3545,15 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_state.level[i] = + rcu_state.level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; - rnp = rsp->level[i]; + rnp = rcu_state.level[i]; for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), @@ -3986,9 +3561,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gp_seq = rsp->gp_seq; - rnp->gp_seq_needed = rsp->gp_seq; - rnp->completedqs = rsp->gp_seq; + rnp->gp_seq = rcu_state.gp_seq; + rnp->gp_seq_needed = rcu_state.gp_seq; + rnp->completedqs = rcu_state.gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -4001,8 +3576,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->parent = NULL; } else { rnp->grpnum = j % levelspread[i - 1]; - rnp->grpmask = 1UL << rnp->grpnum; - rnp->parent = rsp->level[i - 1] + + rnp->grpmask = BIT(rnp->grpnum); + rnp->parent = rcu_state.level[i - 1] + j / levelspread[i - 1]; } rnp->level = i; @@ -4016,16 +3591,15 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } - init_swait_queue_head(&rsp->gp_wq); - init_swait_queue_head(&rsp->expedited_wq); - rnp = rcu_first_leaf_node(rsp); + init_swait_queue_head(&rcu_state.gp_wq); + init_swait_queue_head(&rcu_state.expedited_wq); + rnp = rcu_first_leaf_node(); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - per_cpu_ptr(rsp->rda, i)->mynode = rnp; - rcu_boot_init_percpu_data(i, rsp); + per_cpu_ptr(&rcu_data, i)->mynode = rnp; + rcu_boot_init_percpu_data(i); } - list_add(&rsp->flavors, &rcu_struct_flavors); } /* @@ -4051,6 +3625,8 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; + if (jiffies_till_sched_qs == ULONG_MAX) + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -4109,16 +3685,16 @@ static void __init rcu_init_geometry(void) /* * Dump out the structure of the rcu_node combining tree associated - * with the rcu_state structure referenced by rsp. + * with the rcu_state structure. */ -static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) +static void __init rcu_dump_rcu_node_tree(void) { int level = 0; struct rcu_node *rnp; pr_info("rcu_node tree layout dump\n"); pr_info(" "); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (rnp->level != level) { pr_cont("\n"); pr_info(" "); @@ -4140,11 +3716,9 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state); - rcu_init_one(&rcu_sched_state); + rcu_init_one(); if (dump_tree) - rcu_dump_rcu_node_tree(&rcu_sched_state); - __rcu_init_preempt(); + rcu_dump_rcu_node_tree(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* @@ -4164,6 +3738,7 @@ void __init rcu_init(void) WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); + srcu_init(); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4e74df768c57..703e19ff532d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -34,34 +34,9 @@ #include "rcu_segcblist.h" -/* - * Dynticks per-CPU state. - */ -struct rcu_dynticks { - long dynticks_nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ - bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ - unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ - bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; - /* # times non-lazy CBs posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* idle-period nonlazy_posted snapshot. */ - unsigned long last_accelerate; - /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; - /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -}; - /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { smp_call_func_t rew_func; - struct rcu_state *rew_rsp; unsigned long rew_s; struct work_struct rew_work; }; @@ -170,7 +145,7 @@ struct rcu_node { * are indexed relative to this interval rather than the global CPU ID space. * This generates the bit for a CPU in node-local masks. */ -#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) +#define leaf_node_cpu_bit(rnp, cpu) (BIT((cpu) - (rnp)->grplo)) /* * Union to allow "aggregate OR" operation on the need for a quiescent @@ -189,12 +164,11 @@ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ - unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ - /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + bool deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -213,23 +187,27 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - - /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ - unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long cond_resched_completed; - /* Grace period that needs help */ - /* from cond_resched(). */ - - /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ - struct rcu_head barrier_head; + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ - struct rcu_head oom_head; + bool all_lazy; /* Are all CPU's CBs lazy? */ + unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* Nonlazy_posted snapshot. */ + unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ + unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + + /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 6) Callback offloading. */ + /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -256,7 +234,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -266,9 +244,9 @@ struct rcu_data { short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ + unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ int cpu; - struct rcu_state *rsp; }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ @@ -314,8 +292,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -334,7 +310,7 @@ struct rcu_state { atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long barrier_sequence; /* ++ at start and end of */ - /* _rcu_barrier(). */ + /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ struct mutex exp_mutex; /* Serialize expedited GP. */ @@ -366,9 +342,8 @@ struct rcu_state { /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - struct list_head flavors; /* List of RCU flavors. */ - spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ }; @@ -388,7 +363,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -#ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", @@ -400,13 +374,29 @@ static const char * const gp_state_names[] = { "RCU_GP_CLEANUP", "RCU_GP_CLEANED", }; -#endif /* #ifndef RCU_TREE_NONCORE */ - -extern struct list_head rcu_struct_flavors; -/* Sequence through rcu_state structures for each RCU flavor. */ -#define for_each_rcu_flavor(rsp) \ - list_for_each_entry((rsp), &rcu_struct_flavors, flavors) +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp_<sname>_varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. + */ +#ifdef CONFIG_PREEMPT_RCU +#define RCU_ABBR 'p' +#define RCU_NAME_RAW "rcu_preempt" +#else /* #ifdef CONFIG_PREEMPT_RCU */ +#define RCU_ABBR 's' +#define RCU_NAME_RAW "rcu_sched" +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifndef CONFIG_TRACING +#define RCU_NAME RCU_NAME_RAW +#else /* #ifdef CONFIG_TRACING */ +static char rcu_name[] = RCU_NAME_RAW; +static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; +#define RCU_NAME rcu_name +#endif /* #else #ifdef CONFIG_TRACING */ /* * RCU implementation internal declarations: @@ -419,7 +409,7 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ -int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +int rcu_dynticks_snap(struct rcu_data *rdp); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -428,45 +418,37 @@ DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DECLARE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -#ifndef RCU_TREE_NONCORE - /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(bool preempt); +static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp); +static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, - struct rcu_node *rnp); -static void rcu_preempt_check_callbacks(void); +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_flavor_check_callbacks(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); -static void __init __rcu_init_preempt(void); -static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, - int ncheck); +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); -#ifdef CONFIG_RCU_BOOST -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp); -#endif /* #ifdef CONFIG_RCU_BOOST */ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); +static bool rcu_preempt_need_deferred_qs(struct task_struct *t); +static void rcu_preempt_deferred_qs(struct task_struct *t); static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info(int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static void increment_cpu_stall_ticks(void); -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); +static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); @@ -481,11 +463,11 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_all_nocb_kthreads(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU -static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); +static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); -static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); @@ -496,5 +478,3 @@ void srcu_offline_cpu(unsigned int cpu); void srcu_online_cpu(unsigned int cpu) { } void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ - -#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0b2c2ad69629..8d18c1014e2b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -25,39 +25,39 @@ /* * Record the start of an expedited grace period. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +static void rcu_exp_gp_seq_start(void) { - rcu_seq_start(&rsp->expedited_sequence); + rcu_seq_start(&rcu_state.expedited_sequence); } /* * Return then value that expedited-grace-period counter will have * at the end of the current grace period. */ -static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) { - return rcu_seq_endval(&rsp->expedited_sequence); + return rcu_seq_endval(&rcu_state.expedited_sequence); } /* * Record the end of an expedited grace period. */ -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +static void rcu_exp_gp_seq_end(void) { - rcu_seq_end(&rsp->expedited_sequence); + rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } /* * Take a snapshot of the expedited-grace-period counter. */ -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +static unsigned long rcu_exp_gp_seq_snap(void) { unsigned long s; smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + s = rcu_seq_snap(&rcu_state.expedited_sequence); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("snap")); return s; } @@ -66,9 +66,9 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) * if a full expedited grace period has elapsed since that snapshot * was taken. */ -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +static bool rcu_exp_gp_seq_done(unsigned long s) { - return rcu_seq_done(&rsp->expedited_sequence, s); + return rcu_seq_done(&rcu_state.expedited_sequence, s); } /* @@ -78,26 +78,26 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) * ever been online. This means that this function normally takes its * no-work-to-do fastpath. */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +static void sync_exp_reset_tree_hotplug(void) { bool done; unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */ + int ncpus = smp_load_acquire(&rcu_state.ncpus); /* Order vs. locking. */ struct rcu_node *rnp; struct rcu_node *rnp_up; /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) + if (likely(ncpus == rcu_state.ncpus_snap)) return; - rsp->ncpus_snap = ncpus; + rcu_state.ncpus_snap = ncpus; /* * Each pass through the following loop propagates newly onlined * CPUs for the current rcu_node structure up the rcu_node tree. */ - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -135,13 +135,13 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * Reset the ->expmask values in the rcu_node tree in preparation for * a new expedited grace period. */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +static void __maybe_unused sync_exp_reset_tree(void) { unsigned long flags; struct rcu_node *rnp; - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { + sync_exp_reset_tree_hotplug(); + rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; @@ -194,7 +194,7 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) * * Caller must hold the specified rcu_node structure's ->lock. */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, +static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) __releases(rnp->lock) { @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up_one(&rsp->expedited_wq); + swake_up_one(&rcu_state.expedited_wq); } break; } @@ -229,20 +229,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) +static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) { unsigned long flags; raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); + __rcu_report_exp_rnp(rnp, wake, flags); } /* * Report expedited quiescent state for multiple CPUs, all covered by the * specified leaf rcu_node structure. */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { unsigned long flags; @@ -253,23 +252,23 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, return; } rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ + __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } /* * Report expedited quiescent state for specified rcu_data (CPU). */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) +static void rcu_report_exp_rdp(struct rcu_data *rdp) { - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); + WRITE_ONCE(rdp->deferred_qs, false); + rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) +/* Common code for work-done checking. */ +static bool sync_exp_work_done(unsigned long s) { - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + if (rcu_exp_gp_seq_done(s)) { + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ return true; @@ -284,28 +283,28 @@ static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) * with the mutex held, indicating that the caller must actually do the * expedited grace period. */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(unsigned long s) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); /* Low-contention fastpath. */ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && (rnp == rnp_root || ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - mutex_trylock(&rsp->exp_mutex)) + mutex_trylock(&rcu_state.exp_mutex)) goto fastpath; /* * Each pass through the following loop works its way up * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping + * otherwise falls through to acquire ->exp_mutex. The mapping * from CPU to rcu_node structure can be inexact, as it is just * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, s)) + if (sync_exp_work_done(s)) return true; /* Work not done, either wait here or go up. */ @@ -314,68 +313,29 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) /* Someone else doing GP, so wait for them. */ spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, s)); + sync_exp_work_done(s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); + trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, + rnp->grplo, rnp->grphi, TPS("nxtlvl")); } - mutex_lock(&rsp->exp_mutex); + mutex_lock(&rcu_state.exp_mutex); fastpath: - if (sync_exp_work_done(rsp, s)) { - mutex_unlock(&rsp->exp_mutex); + if (sync_exp_work_done(s)) { + mutex_unlock(&rcu_state.exp_mutex); return true; } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + rcu_exp_gp_seq_start(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("start")); return false; } -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - /* * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. @@ -391,7 +351,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) struct rcu_exp_work *rewp = container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - struct rcu_state *rsp = rewp->rew_rsp; func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -400,15 +359,14 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int snap; if (raw_smp_processor_id() == cpu || !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(rdtp); + snap = rcu_dynticks_snap(rdp); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -429,17 +387,16 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { + if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, rsp, 0); + ret = smp_call_function_single(cpu, func, NULL, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -450,7 +407,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -462,33 +419,31 @@ retry_ipi: /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_cpus(smp_call_func_t func) { int cpu; struct rcu_node *rnp; - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); + sync_exp_reset_tree(); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("select")); /* Schedule work for each leaf rcu_node structure. */ - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ rnp->rew.rew_func = func; - rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || - rcu_is_last_leaf_node(rsp, rnp)) { + rcu_is_last_leaf_node(rnp)) { /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; @@ -505,12 +460,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, } /* Wait for workqueue jobs (if any) to complete. */ - rcu_for_each_leaf_node(rsp, rnp) + rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) flush_work(&rnp->rew.rew_work); } -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +static void synchronize_sched_expedited_wait(void) { int cpu; unsigned long jiffies_stall; @@ -518,16 +473,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long mask; int ndetected; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); int ret; - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; for (;;) { ret = swait_event_timeout_exclusive( - rsp->expedited_wq, + rcu_state.expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) @@ -537,9 +492,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); + rcu_state.name); ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { ndetected += rcu_print_task_exp_stall(rnp); for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp; @@ -548,7 +503,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) if (!(rnp->expmask & mask)) continue; ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], @@ -556,11 +511,11 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, + jiffies - jiffies_start, rcu_state.expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ if (sync_rcu_preempt_exp_done_unlocked(rnp)) @@ -572,7 +527,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_cont("\n"); } - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) @@ -590,21 +545,21 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) * grace period. Also update all the ->exp_seq_rq counters as needed * in order to avoid counter-wrap problems. */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + synchronize_sched_expedited_wait(); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); /* * Switch over to wakeup mode, allowing the next GP, but -only- the * next GP, to proceed. */ - mutex_lock(&rsp->exp_wake_mutex); + mutex_lock(&rcu_state.exp_wake_mutex); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ @@ -613,24 +568,23 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); + mutex_unlock(&rcu_state.exp_wake_mutex); } /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(struct rcu_state *rsp, - smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, func); + sync_rcu_exp_select_cpus(func); /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + rcu_exp_wait_wake(s); } /* @@ -641,15 +595,14 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s); + rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); } /* - * Given an rcu_state pointer and a smp_call_function() handler, kick - * off the specified flavor of expedited grace period. + * Given a smp_call_function() handler, kick off the specified + * implementation of expedited grace period. */ -static void _synchronize_rcu_expedited(struct rcu_state *rsp, - smp_call_func_t func) +static void _synchronize_rcu_expedited(smp_call_func_t func) { struct rcu_data *rdp; struct rcu_exp_work rew; @@ -658,71 +611,37 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(rsp->call); + wait_rcu_gp(call_rcu); return; } /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(rsp, func, s); + rcu_exp_sel_wait_wake(func, s); } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_func = func; - rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - rnp = rcu_get_root(rsp); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, s)); + sync_exp_work_done(s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ - mutex_unlock(&rsp->exp_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - struct rcu_state *rsp = &rcu_sched_state; - - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched_expedited() in RCU read-side critical section"); - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); + mutex_unlock(&rcu_state.exp_mutex); } -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #ifdef CONFIG_PREEMPT_RCU @@ -733,34 +652,78 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *info) +static void sync_rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_state *rsp = info; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; struct task_struct *t = current; /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. + * First, the common case of not being in an RCU read-side + * critical section. If also enabled or idle, immediately + * report the quiescent state, otherwise defer. */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; + if (!t->rcu_read_lock_nesting) { + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + rcu_dynticks_curr_cpu_in_eqs()) { + rcu_report_exp_rdp(rdp); + } else { + rdp->deferred_qs = true; + set_tsk_need_resched(t); + set_preempt_need_resched(); + } return; } /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. + * Second, the less-common case of being in an RCU read-side + * critical section. In this case we can count on a future + * rcu_read_unlock(). However, this rcu_read_unlock() might + * execute on some other CPU, but in that case there will be + * a future context switch. Either way, if the expedited + * grace period is still waiting on this CPU, set ->deferred_qs + * so that the eventual quiescent state will be reported. + * Note that there is a large group of race conditions that + * can have caused this quiescent state to already have been + * reported, so we really do need to check ->expmask. + */ + if (t->rcu_read_lock_nesting > 0) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmask & rdp->grpmask) + rdp->deferred_qs = true; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + /* + * The final and least likely case is where the interrupted + * code was just about to or just finished exiting the RCU-preempt + * read-side critical section, and no, we can't tell which. + * So either way, set ->deferred_qs to flag later code that + * a quiescent state is required. + * + * If the CPU is fully enabled (or if some buggy RCU-preempt + * read-side critical section is being used from idle), just + * invoke rcu_preempt_defer_qs() to immediately report the + * quiescent state. We cannot use rcu_read_unlock_special() + * because we are in an interrupt handler, which will cause that + * function to take an early exit without doing anything. + * + * Otherwise, force a context switch after the CPU enables everything. */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); + rdp->deferred_qs = true; + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { + rcu_preempt_deferred_qs(t); + } else { + set_tsk_need_resched(t); + set_preempt_need_resched(); + } +} + +/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ } /** @@ -780,11 +743,11 @@ static void sync_rcu_exp_handler(void *info) * you are using synchronize_rcu_expedited() in a loop, please restructure * your code to batch your updates, and then Use a single synchronize_rcu() * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). */ void synchronize_rcu_expedited(void) { - struct rcu_state *rsp = rcu_state_p; - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -792,19 +755,82 @@ void synchronize_rcu_expedited(void) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; - _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); + _synchronize_rcu_expedited(sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *unused) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + return; + } + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + WARN_ON_ONCE(ret); +} + /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. + * Because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ void synchronize_rcu_expedited(void) { - synchronize_sched_expedited(); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + _synchronize_rcu_expedited(sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a97c20ea9bce..05915e536336 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -38,8 +38,7 @@ #include "../locking/rtmutex_common.h" /* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. + * Control variables for per-CPU and per-rcu_node kthreads. */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -106,6 +105,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (jiffies_till_sched_qs != ULONG_MAX) + pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs); if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) @@ -123,12 +124,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_PREEMPT_RCU -RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *const rcu_state_p = &rcu_preempt_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; - -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); +static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake); static void rcu_read_unlock_special(struct task_struct *t); /* @@ -284,13 +280,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && - t->rcu_read_unlock_special.b.exp_need_qs) { - t->rcu_read_unlock_special.b.exp_need_qs = false; - rcu_report_exp_rdp(rdp->rsp, rdp, true); - } else { - WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); - } + if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + rcu_report_exp_rdp(rdp); + else + WARN_ON_ONCE(rdp->deferred_qs); } /* @@ -306,15 +299,15 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * * Callers to this function must disable preemption. */ -static void rcu_preempt_qs(void) +static void rcu_qs(void) { - RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); + if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gp_seq), + __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); + barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } } @@ -332,19 +325,20 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(bool preempt) +void rcu_note_context_switch(bool preempt) { struct task_struct *t = current; - struct rcu_data *rdp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; @@ -357,7 +351,7 @@ static void rcu_preempt_note_context_switch(bool preempt) */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - trace_rcu_preempt_task(rdp->rsp->name, + trace_rcu_preempt_task(rcu_state.name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gp_seq @@ -371,6 +365,9 @@ static void rcu_preempt_note_context_switch(bool preempt) * behalf of preempted instance of __rcu_read_unlock(). */ rcu_read_unlock_special(t); + rcu_preempt_deferred_qs(t); + } else { + rcu_preempt_deferred_qs(t); } /* @@ -382,8 +379,13 @@ static void rcu_preempt_note_context_switch(bool preempt) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs(); + rcu_qs(); + if (rdp->deferred_qs) + rcu_report_exp_rdp(rdp); + trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); /* * Check for preempted RCU readers blocking the current grace period @@ -464,74 +466,56 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) } /* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. + * Report deferred quiescent states. The deferral time can + * be quite short, for example, in the case of the call from + * rcu_read_unlock_special(). */ -static void rcu_read_unlock_special(struct task_struct *t) +static void +rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; bool empty_norm; bool empty_exp_now; - unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; - /* NMI handlers cannot block and cannot safely manipulate state. */ - if (in_nmi()) - return; - - local_irq_save(flags); - /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; + rdp = this_cpu_ptr(&rcu_data); + if (!special.s && !rdp->deferred_qs) { + local_irq_restore(flags); + return; + } if (special.b.need_qs) { - rcu_preempt_qs(); + rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s) { + if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { local_irq_restore(flags); return; } } /* - * Respond to a request for an expedited grace period, but only if - * we were not preempted, meaning that we were running on the same - * CPU throughout. If we were preempted, the exp_need_qs flag - * would have been cleared at the time of the first preemption, - * and the quiescent state would be reported when we were dequeued. + * Respond to a request by an expedited grace period for a + * quiescent state from this CPU. Note that requests from + * tasks are handled when removing the task from the + * blocked-tasks list below. */ - if (special.b.exp_need_qs) { - WARN_ON_ONCE(special.b.blocked); - t->rcu_read_unlock_special.b.exp_need_qs = false; - rdp = this_cpu_ptr(rcu_state_p->rda); - rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (rdp->deferred_qs) { + rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } } - /* Hardware IRQ handlers cannot block, complain if they get here. */ - if (in_irq() || in_serving_softirq()) { - lockdep_rcu_suspicious(__FILE__, __LINE__, - "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", - t->rcu_read_unlock_special.s, - t->rcu_read_unlock_special.b.blocked, - t->rcu_read_unlock_special.b.exp_need_qs, - t->rcu_read_unlock_special.b.need_qs); - local_irq_restore(flags); - return; - } - /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { t->rcu_read_unlock_special.b.blocked = false; @@ -582,7 +566,7 @@ static void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); + rcu_report_unblock_qs_rnp(rnp, flags); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -596,13 +580,79 @@ static void rcu_read_unlock_special(struct task_struct *t) * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(rcu_state_p, rnp, true); + rcu_report_exp_rnp(rnp, true); } else { local_irq_restore(flags); } } /* + * Is a deferred quiescent-state pending, and are we also not in + * an RCU read-side critical section? It is the caller's responsibility + * to ensure it is otherwise safe to report any deferred quiescent + * states. The reason for this is that it is safe to report a + * quiescent state during context switch even though preemption + * is disabled. This function cannot be expected to understand these + * nuances, so the caller must handle them. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return (this_cpu_ptr(&rcu_data)->deferred_qs || + READ_ONCE(t->rcu_read_unlock_special.s)) && + t->rcu_read_lock_nesting <= 0; +} + +/* + * Report a deferred quiescent state if needed and safe to do so. + * As with rcu_preempt_need_deferred_qs(), "safe" involves only + * not being in an RCU read-side critical section. The caller must + * evaluate safety in terms of interrupt, softirq, and preemption + * disabling. + */ +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + unsigned long flags; + bool couldrecurse = t->rcu_read_lock_nesting >= 0; + + if (!rcu_preempt_need_deferred_qs(t)) + return; + if (couldrecurse) + t->rcu_read_lock_nesting -= INT_MIN; + local_irq_save(flags); + rcu_preempt_deferred_qs_irqrestore(t, flags); + if (couldrecurse) + t->rcu_read_lock_nesting += INT_MIN; +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + unsigned long flags; + bool preempt_bh_were_disabled = + !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + bool irqs_were_disabled; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + irqs_were_disabled = irqs_disabled_flags(flags); + if ((preempt_bh_were_disabled || irqs_were_disabled) && + t->rcu_read_unlock_special.b.blocked) { + /* Need to defer quiescent state until everything is enabled. */ + raise_softirq_irqoff(RCU_SOFTIRQ); + local_irq_restore(flags); + return; + } + rcu_preempt_deferred_qs_irqrestore(t, flags); +} + +/* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. */ @@ -633,12 +683,12 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Dump detailed information for all tasks blocking the current RCU * grace period. */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) +static void rcu_print_detail_task_stall(void) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rsp, rnp) + rcu_for_each_leaf_node(rnp) rcu_print_detail_task_stall_rnp(rnp); } @@ -706,14 +756,13 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void -rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rsp, rnp, 10); + dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; @@ -732,62 +781,38 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) * * Caller must disable hard irqs. */ -static void rcu_preempt_check_callbacks(void) +static void rcu_flavor_check_callbacks(int user) { - struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; - if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(); + if (user || rcu_is_cpu_rrupt_from_idle()) { + rcu_note_voluntary_context_switch(current); + } + if (t->rcu_read_lock_nesting > 0 || + (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { + /* No QS, force context switch if deferred. */ + if (rcu_preempt_need_deferred_qs(t)) { + set_tsk_need_resched(t); + set_preempt_need_resched(); + } + } else if (rcu_preempt_need_deferred_qs(t)) { + rcu_preempt_deferred_qs(t); /* Report deferred QS. */ + return; + } else if (!t->rcu_read_lock_nesting) { + rcu_qs(); /* Report immediate QS. */ return; } + + /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + __this_cpu_read(rcu_data.core_needs_qs) && + __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && - time_after(jiffies, rsp->gp_start + HZ)) + time_after(jiffies, rcu_state.gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } /** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, rcu_state_p, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** * synchronize_rcu - wait until a grace period has elapsed. * * Control will return to the caller some time after a full grace @@ -797,14 +822,28 @@ EXPORT_SYMBOL_GPL(call_rcu); * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. * - * See the description of synchronize_sched() for more detailed - * information on memory-ordering guarantees. However, please note - * that -only- the memory-ordering guarantees apply. For example, - * synchronize_rcu() is -not- guaranteed to wait on things like code - * protected by preempt_disable(), instead, synchronize_rcu() is -only- - * guaranteed to wait on RCU read-side critical sections, that is, sections - * of code protected by rcu_read_lock(). + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). */ void synchronize_rcu(void) { @@ -821,28 +860,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - * - * Note that this primitive does not necessarily wait for an RCU grace period - * to complete. For example, if there are no RCU callbacks queued anywhere - * in the system, then rcu_barrier() is within its rights to return - * immediately, without waiting for anything, much less an RCU grace period. - */ -void rcu_barrier(void) -{ - _rcu_barrier(rcu_state_p); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/* - * Initialize preemptible RCU's state structures. - */ -static void __init __rcu_init_preempt(void) -{ - rcu_init_one(rcu_state_p); -} - /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, @@ -859,6 +876,7 @@ void exit_rcu(void) barrier(); t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); + rcu_preempt_deferred_qs(current); } /* @@ -866,7 +884,7 @@ void exit_rcu(void) * specified number of elements. */ static void -dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { int cpu; int i; @@ -893,7 +911,7 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) } pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", cpu, ".o"[onl], @@ -904,8 +922,6 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *const rcu_state_p = &rcu_sched_state; - /* * Tell them what RCU they are running. */ @@ -916,14 +932,85 @@ static void __init rcu_bootup_announce(void) } /* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. + * Note a quiescent state for PREEMPT=n. Because we do not need to know + * how many quiescent states passed, just if there was at least one since + * the start of the grace period, this just sets a flag. The caller must + * have disabled preemption. */ -static void rcu_preempt_note_context_switch(bool preempt) +static void rcu_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!"); + if (!__this_cpu_read(rcu_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); + __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* + * Register an urgently needed quiescent state. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs, which will in + * some cases serve for expedited as well as normal grace periods. + * Either way, register a lightweight quiescent state. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * + */ +void rcu_all_qs(void) +{ + unsigned long flags; + + if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_data.rcu_urgent_qs, false); + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } + rcu_qs(); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + +/* + * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + */ +void rcu_note_context_switch(bool preempt) +{ + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + trace_rcu_utilization(TPS("Start context switch")); + rcu_qs(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_data.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) + rcu_momentary_dyntick_idle(); + if (!preempt) + rcu_tasks_qs(current); +out: + trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ +} +EXPORT_SYMBOL_GPL(rcu_note_context_switch); + +/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -941,10 +1028,20 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) } /* + * Because there is no preemptible RCU, there can be no deferred quiescent + * states. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return false; +} +static void rcu_preempt_deferred_qs(struct task_struct *t) { } + +/* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) +static void rcu_print_detail_task_stall(void) { } @@ -972,36 +1069,54 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void -rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } /* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule RCU core processing. + * + * This function must be called from hardirq context. It is normally + * invoked from the scheduling-clock interrupt. */ -static void rcu_preempt_check_callbacks(void) +static void rcu_flavor_check_callbacks(int user) { -} + if (user || rcu_is_cpu_rrupt_from_idle()) { -/* - * Because preemptible RCU does not exist, rcu_barrier() is just - * another name for rcu_barrier_sched(). - */ -void rcu_barrier(void) -{ - rcu_barrier_sched(); + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so note it. + * + * No memory barrier is required here because rcu_qs() + * references only CPU-local variables that other CPUs + * neither access nor modify, at least not while the + * corresponding CPU is online. + */ + + rcu_qs(); + } } -EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Because preemptible RCU does not exist, it need not be initialized. - */ -static void __init __rcu_init_preempt(void) +/* PREEMPT=n implementation of synchronize_rcu(). */ +void synchronize_rcu(void) { + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); } +EXPORT_SYMBOL_GPL(synchronize_rcu); /* * Because preemptible RCU does not exist, tasks cannot possibly exit @@ -1015,7 +1130,7 @@ void exit_rcu(void) * Dump the guaranteed-empty blocked-tasks state. Trust but verify. */ static void -dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } @@ -1212,21 +1327,20 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. */ -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) +static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { - int rnp_index = rnp - &rsp->node[0]; + int rnp_index = rnp - rcu_get_root(); unsigned long flags; struct sched_param sp; struct task_struct *t; - if (rcu_state_p != rsp) + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) return 0; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; - rsp->boost = 1; + rcu_state.boost = 1; if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, @@ -1244,9 +1358,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); + rcu_do_batch(this_cpu_ptr(&rcu_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) @@ -1268,9 +1380,9 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) } /* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * RCU softirq used in flavors and configurations of RCU that do not - * support RCU priority boosting. + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. */ static void rcu_cpu_kthread(unsigned int cpu) { @@ -1353,18 +1465,18 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); + rcu_for_each_leaf_node(rnp) + (void)rcu_spawn_one_boost_kthread(rnp); } static void rcu_prepare_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); + (void)rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1411,8 +1523,8 @@ static void rcu_prepare_kthreads(int cpu) * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs - * any flavor of RCU. + * Because we not have RCU_FAST_NO_HZ, just check whether or not this + * CPU has RCU callbacks queued. */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { @@ -1478,41 +1590,36 @@ static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); /* - * Try to advance callbacks for all flavors of RCU on the current CPU, but - * only if it has been awhile since the last time we did so. Afterwards, - * if there are any callbacks ready for immediate invocation, return true. + * Try to advance callbacks on the current CPU, but only if it has been + * awhile since the last time we did so. Afterwards, if there are any + * callbacks ready for immediate invocation, return true. */ static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - struct rcu_state *rsp; /* Exit early if we advanced recently. */ - if (jiffies == rdtp->last_advance_all) + if (jiffies == rdp->last_advance_all) return false; - rdtp->last_advance_all = jiffies; + rdp->last_advance_all = jiffies; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; + rnp = rdp->mynode; - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rsp, rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - } + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || + unlikely(READ_ONCE(rdp->gpwrap))) && + rcu_segcblist_pend_cbs(&rdp->cblist)) + note_gp_changes(rdp); + + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + cbs_ready = true; return cbs_ready; } @@ -1526,16 +1633,16 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); unsigned long dj; lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. */ - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + rdp->nonlazy_posted_snap = rdp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { *nextevt = KTIME_MAX; return 0; } @@ -1546,10 +1653,10 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) invoke_rcu_core(); return 1; } - rdtp->last_accelerate = jiffies; + rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (!rdtp->all_lazy) { + if (!rdp->all_lazy) { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { @@ -1572,10 +1679,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) static void rcu_prepare_for_idle(void) { bool needwake; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - struct rcu_state *rsp; int tne; lockdep_assert_irqs_disabled(); @@ -1584,10 +1689,10 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); - if (tne != rdtp->tick_nohz_enabled_snap) { + if (tne != rdp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ - rdtp->tick_nohz_enabled_snap = tne; + rdp->tick_nohz_enabled_snap = tne; return; } if (!tne) @@ -1598,10 +1703,10 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdtp->all_lazy && - rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { - rdtp->all_lazy = false; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + if (rdp->all_lazy && + rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + rdp->all_lazy = false; + rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1610,19 +1715,16 @@ static void rcu_prepare_for_idle(void) * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. */ - if (rdtp->last_accelerate == jiffies) + if (rdp->last_accelerate == jiffies) return; - rdtp->last_accelerate = jiffies; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - if (!rcu_segcblist_pend_cbs(&rdp->cblist)) - continue; + rdp->last_accelerate = jiffies; + if (rcu_segcblist_pend_cbs(&rdp->cblist)) { rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } } @@ -1650,104 +1752,23 @@ static void rcu_cleanup_after_idle(void) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); -} - -/* - * Data for flushing lazy RCU callbacks at OOM time. - */ -static atomic_t oom_callback_count; -static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); - -/* - * RCU OOM callback -- decrement the outstanding count and deliver the - * wake-up if we are the last one. - */ -static void rcu_oom_callback(struct rcu_head *rhp) -{ - if (atomic_dec_and_test(&oom_callback_count)) - wake_up(&oom_callback_wq); -} - -/* - * Post an rcu_oom_notify callback on the current CPU if it has at - * least one lazy callback. This will unnecessarily post callbacks - * to CPUs that already have a non-lazy callback at the end of their - * callback list, but this is an infrequent operation, so accept some - * extra overhead to keep things simple. - */ -static void rcu_oom_notify_cpu(void *unused) -{ - struct rcu_state *rsp; - struct rcu_data *rdp; - - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { - atomic_inc(&oom_callback_count); - rsp->call(&rdp->oom_head, rcu_oom_callback); - } - } -} - -/* - * If low on memory, ensure that each CPU has a non-lazy callback. - * This will wake up CPUs that have only lazy callbacks, in turn - * ensuring that they free up the corresponding memory in a timely manner. - * Because an uncertain amount of memory will be freed in some uncertain - * timeframe, we do not claim to have freed anything. - */ -static int rcu_oom_notify(struct notifier_block *self, - unsigned long notused, void *nfreed) -{ - int cpu; - - /* Wait for callbacks from earlier instance to complete. */ - wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); - smp_mb(); /* Ensure callback reuse happens after callback invocation. */ - - /* - * Prevent premature wakeup: ensure that all increments happen - * before there is a chance of the counter reaching zero. - */ - atomic_set(&oom_callback_count, 1); - - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_tasks_rcu_qs(); - } - - /* Unconditionally decrement: no need to wake ourselves up. */ - atomic_dec(&oom_callback_count); - - return NOTIFY_OK; + __this_cpu_add(rcu_data.nonlazy_posted, 1); } -static struct notifier_block rcu_oom_nb = { - .notifier_call = rcu_oom_notify -}; - -static int __init rcu_register_oom_notifier(void) -{ - register_oom_notifier(&rcu_oom_nb); - return 0; -} -early_initcall(rcu_register_oom_notifier); - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + rdp->last_accelerate & 0xffff, jiffies & 0xffff, ulong2long(nlpd), - rdtp->all_lazy ? 'L' : '.', - rdtp->tick_nohz_enabled_snap ? '.' : 'D'); + rdp->all_lazy ? 'L' : '.', + rdp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1768,21 +1789,19 @@ static void print_cpu_stall_info_begin(void) /* * Print out diagnostic information for the specified stalled CPU. * - * If the specified CPU is aware of the current RCU grace period - * (flavor specified by rsp), then print the number of scheduling - * clock interrupts the CPU has taken during the time that it has - * been aware. Otherwise, print the number of RCU grace periods - * that this CPU is ignorant of, for example, "1" if the CPU was - * aware of the previous grace period. + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. * * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. */ -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +static void print_cpu_stall_info(int cpu) { unsigned long delta; char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = rdp->dynticks; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -1792,7 +1811,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); if (ticks_value) { ticks_title = "GPs behind"; } else { @@ -1810,10 +1829,10 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(rdtp) & 0xfff, - rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, fast_no_hz); } @@ -1823,20 +1842,12 @@ static void print_cpu_stall_info_end(void) pr_err("\t"); } -/* Zero ->ticks_this_gp for all flavors of RCU. */ +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); -} - -/* Increment ->ticks_this_gp for all flavors of RCU. */ -static void increment_cpu_stall_ticks(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - raw_cpu_inc(rsp->rda->ticks_this_gp); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); } #ifdef CONFIG_RCU_NOCB_CPU @@ -1958,17 +1969,17 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } /* - * Does the specified CPU need an RCU callback for the specified flavor + * Does the specified CPU need an RCU callback for this invocation * of rcu_barrier()? */ -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +static bool rcu_nocb_cpu_needs_barrier(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); unsigned long ret; #ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; @@ -1979,7 +1990,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) * There needs to be a barrier before this function is called, * but associated with a prior determination that no more * callbacks would be posted. In the worst case, the first - * barrier in _rcu_barrier() suffices (but the caller cannot + * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller * getting the concurrency design right!). There must also be * a barrier between the following load an posting of a callback @@ -2037,7 +2048,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = READ_ONCE(rdp->nocb_kthread); if (rcu_nocb_poll || !t) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; } @@ -2046,7 +2057,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ wake_nocb_leader(rdp, false); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, @@ -2057,7 +2068,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* ... or if many callbacks queued. */ if (!irqs_disabled_flags(flags)) { wake_nocb_leader(rdp, true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeOvf")); } else { wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, @@ -2065,7 +2076,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } return; } @@ -2087,12 +2098,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rdp->rsp->name, rhp, + trace_rcu_kfree_callback(rcu_state.name, rhp, (unsigned long)rhp->func, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); else - trace_rcu_callback(rdp->rsp->name, rhp, + trace_rcu_callback(rcu_state.name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); @@ -2142,7 +2153,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); - c = rcu_seq_snap(&rdp->rsp->gp_seq); + c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { local_irq_restore(flags); } else { @@ -2150,7 +2161,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + rcu_gp_kthread_wake(); } /* @@ -2187,7 +2198,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); @@ -2197,7 +2208,7 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); } /* @@ -2224,7 +2235,7 @@ wait_again: if (rcu_nocb_poll) { schedule_timeout_interruptible(1); } else { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("WokeEmpty")); } goto wait_again; @@ -2269,7 +2280,7 @@ wait_again: static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { @@ -2277,7 +2288,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) return; } WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2312,10 +2323,10 @@ static int rcu_nocb_kthread(void *arg) rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, + trace_rcu_batch_start(rcu_state.name, atomic_long_read(&rdp->nocb_q_count_lazy), atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; @@ -2323,23 +2334,23 @@ static int rcu_nocb_kthread(void *arg) next = list->next; /* Wait for enqueuing to complete, if needed. */ while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WaitQueue")); schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeQueue")); next = list->next; } debug_rcu_head_unqueue(list); local_bh_disable(); - if (__rcu_reclaim(rdp->rsp->name, list)) + if (__rcu_reclaim(rcu_state.name, list)) cl++; c++; local_bh_enable(); cond_resched_tasks_rcu_qs(); list = next; } - trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); smp_mb__before_atomic(); /* _add after CB invocation. */ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); @@ -2367,7 +2378,7 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2393,7 +2404,6 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; - struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) @@ -2427,11 +2437,9 @@ void __init rcu_init_nohz(void) if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - for_each_rcu_flavor(rsp) { - for_each_cpu(cpu, rcu_nocb_mask) - init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); - rcu_organize_nocb_kthreads(rsp); - } + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); + rcu_organize_nocb_kthreads(); } /* Initialize per-rcu_data variables for no-CBs CPUs. */ @@ -2446,16 +2454,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are - * brought online out of order, this can require re-organizing the - * leader-follower relationships. + * rcuo kthread, spawn it. If the CPUs are brought online out of order, + * this can require re-organizing the leader-follower relationships. */ -static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) +static void rcu_spawn_one_nocb_kthread(int cpu) { struct rcu_data *rdp; struct rcu_data *rdp_last; struct rcu_data *rdp_old_leader; - struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); struct task_struct *t; /* @@ -2485,9 +2492,9 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) rdp_spawn->nocb_next_follower = rdp_old_leader; } - /* Spawn the kthread for this CPU and RCU flavor. */ + /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp_spawn, - "rcuo%c/%d", rsp->abbr, cpu); + "rcuo%c/%d", rcu_state.abbr, cpu); BUG_ON(IS_ERR(t)); WRITE_ONCE(rdp_spawn->nocb_kthread, t); } @@ -2498,11 +2505,8 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) */ static void rcu_spawn_all_nocb_kthreads(int cpu) { - struct rcu_state *rsp; - if (rcu_scheduler_fully_active) - for_each_rcu_flavor(rsp) - rcu_spawn_one_nocb_kthread(rsp, cpu); + rcu_spawn_one_nocb_kthread(cpu); } /* @@ -2526,7 +2530,7 @@ module_param(rcu_nocb_leader_stride, int, 0444); /* * Initialize leader-follower relationships for all no-CBs CPU. */ -static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) +static void __init rcu_organize_nocb_kthreads(void) { int cpu; int ls = rcu_nocb_leader_stride; @@ -2548,7 +2552,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New leader, set up for followers & next leader. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; @@ -2585,7 +2589,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +static bool rcu_nocb_cpu_needs_barrier(int cpu) { WARN_ON_ONCE(1); /* Should be dead code. */ return false; @@ -2654,12 +2658,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) * This code relies on the fact that all NO_HZ_FULL CPUs are also * CONFIG_RCU_NOCB_CPU CPUs. */ -static bool rcu_nohz_full_cpu(struct rcu_state *rsp) +static bool rcu_nohz_full_cpu(void) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && - (!rcu_gp_in_progress(rsp) || - ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ))) + (!rcu_gp_in_progress() || + ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 39cb23d22109..f203b94f6b5b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -203,11 +203,7 @@ void rcu_test_sync_prims(void) if (!IS_ENABLED(CONFIG_PROVE_RCU)) return; synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); } #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) @@ -298,7 +294,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. * - * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ int rcu_read_lock_bh_held(void) @@ -336,7 +332,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, int i; int j; - /* Initialize and register callbacks for each flavor specified. */ + /* Initialize and register callbacks for each crcu_array element. */ for (i = 0; i < n; i++) { if (checktiny && (crcu_array[i] == call_rcu || @@ -472,6 +468,7 @@ int rcu_jiffies_till_stall_check(void) } return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; } +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); void rcu_sysrq_start(void) { @@ -701,19 +698,19 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_sched() + * transitions to complete. Invoking synchronize_rcu() * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_sched(), + * interrupts disabled. Without this synchronize_rcu(), * a read-side critical section that started before the * grace period might be incorrectly seen as having started * after the grace period. * - * This synchronize_sched() also dispenses with the + * This synchronize_rcu() also dispenses with the * need for a memory barrier on the first store to * ->rcu_tasks_holdout, as it forces the store to happen * after the beginning of the grace period. */ - synchronize_sched(); + synchronize_rcu(); /* * There were callbacks, so we need to wait for an @@ -740,7 +737,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This does only part of the job, ensuring that all * tasks that were previously exiting reach the point * where they have disabled preemption, allowing the - * later synchronize_sched() to finish the job. + * later synchronize_rcu() to finish the job. */ synchronize_srcu(&tasks_rcu_exit_srcu); @@ -790,20 +787,20 @@ static int __noreturn rcu_tasks_kthread(void *arg) * cause their RCU-tasks read-side critical sections to * extend past the end of the grace period. However, * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_sched() + * interrupts disabled, we can use synchronize_rcu() * to force the needed ordering on all such CPUs. * - * This synchronize_sched() also confines all + * This synchronize_rcu() also confines all * ->rcu_tasks_holdout accesses to be within the grace * period, avoiding the need for memory barriers for * ->rcu_tasks_holdout accesses. * - * In addition, this synchronize_sched() waits for exiting + * In addition, this synchronize_rcu() waits for exiting * tasks to complete their final preempt_disable() region * of execution, cleaning up after the synchronize_srcu() * above. */ - synchronize_sched(); + synchronize_rcu(); /* Invoke the callbacks. */ while (list) { @@ -870,15 +867,10 @@ static void __init rcu_tasks_bootup_oddness(void) #ifdef CONFIG_PROVE_RCU /* - * Early boot self test parameters, one for each flavor + * Early boot self test parameters. */ static bool rcu_self_test; -static bool rcu_self_test_bh; -static bool rcu_self_test_sched; - module_param(rcu_self_test, bool, 0444); -module_param(rcu_self_test_bh, bool, 0444); -module_param(rcu_self_test_sched, bool, 0444); static int rcu_self_test_counter; @@ -888,25 +880,16 @@ static void test_callback(struct rcu_head *r) pr_info("RCU test callback executed %d\n", rcu_self_test_counter); } +DEFINE_STATIC_SRCU(early_srcu); + static void early_boot_test_call_rcu(void) { static struct rcu_head head; + static struct rcu_head shead; call_rcu(&head, test_callback); -} - -static void early_boot_test_call_rcu_bh(void) -{ - static struct rcu_head head; - - call_rcu_bh(&head, test_callback); -} - -static void early_boot_test_call_rcu_sched(void) -{ - static struct rcu_head head; - - call_rcu_sched(&head, test_callback); + if (IS_ENABLED(CONFIG_SRCU)) + call_srcu(&early_srcu, &shead, test_callback); } void rcu_early_boot_tests(void) @@ -915,10 +898,6 @@ void rcu_early_boot_tests(void) if (rcu_self_test) early_boot_test_call_rcu(); - if (rcu_self_test_bh) - early_boot_test_call_rcu_bh(); - if (rcu_self_test_sched) - early_boot_test_call_rcu_sched(); rcu_test_sync_prims(); } @@ -930,16 +909,11 @@ static int rcu_verify_early_boot_tests(void) if (rcu_self_test) { early_boot_test_counter++; rcu_barrier(); + if (IS_ENABLED(CONFIG_SRCU)) { + early_boot_test_counter++; + srcu_barrier(&early_srcu); + } } - if (rcu_self_test_bh) { - early_boot_test_counter++; - rcu_barrier_bh(); - } - if (rcu_self_test_sched) { - early_boot_test_counter++; - rcu_barrier_sched(); - } - if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); ret = -1; diff --git a/kernel/reboot.c b/kernel/reboot.c index 8fb44dec9ad7..e1b79b6a2735 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -49,6 +49,7 @@ int reboot_force; */ void (*pm_power_off_prepare)(void); +EXPORT_SYMBOL_GPL(pm_power_off_prepare); /** * emergency_restart - reboot the system diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..fe0223121883 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it... */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif + s64 __maybe_unused steal = 0, irq_delta = 0; + #ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; @@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif @@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; + p->se.runnable_weight = load->weight; return; } @@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; + p->se.runnable_weight = load->weight; } } @@ -1167,7 +1168,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); @@ -2915,10 +2916,10 @@ unsigned long nr_iowait(void) } /* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. */ unsigned long nr_iowait_cpu(int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ out: return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..ee271bb661cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1392,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; @@ -1446,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1454,8 +1464,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1463,36 +1471,16 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { @@ -1514,6 +1502,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1553,6 +1556,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, } /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + +/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1579,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1595,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1625,16 +1638,22 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + + /* * In the overloaded case, try and keep the load balanced. */ load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1729,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1805,20 +1825,17 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2613,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2609,6 +2659,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -3647,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3676,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -3925,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- ie. we'll be penalized. */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); } @@ -4400,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) /* * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us + * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is + * not running add to the tail so that later runqueues don't get starved. */ - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_b->distribute_running) + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + else + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); /* * If we're the first throttled task, make sure the bandwidth @@ -4546,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * in us over-using our runtime if it is all used during this loop, but * only by limited amounts in that extreme case. */ - while (throttled && cfs_b->runtime > 0) { + while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; + cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); raw_spin_lock(&cfs_b->lock); + cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4664,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); + if (cfs_b->distribute_running) { + raw_spin_unlock(&cfs_b->lock); + return; + } + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock(&cfs_b->lock); return; @@ -4673,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) runtime = cfs_b->runtime; expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); if (!runtime) @@ -4683,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4791,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; + cfs_b->distribute_running = 0; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -6188,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; @@ -6198,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6275,7 +6372,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6328,6 +6425,8 @@ static void migrate_task_rq_fair(struct task_struct *p) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -6615,9 +6714,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6823,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6853,6 +6962,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7233,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif @@ -7396,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7417,6 +7521,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7525,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7545,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7575,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7588,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7689,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7709,6 +7830,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7741,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7783,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ @@ -7818,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7837,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) + return false; + + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: @@ -7908,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) @@ -7986,8 +8132,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -8137,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8172,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8238,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8275,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8322,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. + * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* @@ -8391,6 +8573,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9033,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } @@ -9402,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); @@ -9564,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..858589b83377 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..90fb5bc12ad4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; @@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { @@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index d2894db28955..7e56b489ff32 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else static inline int diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b00a816a4b3..b8c007713b3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -345,6 +345,8 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; + + bool distribute_running; #endif }; @@ -714,8 +716,12 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ - bool overload; + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -782,6 +788,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; @@ -841,6 +848,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; @@ -854,8 +863,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -#define HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -1184,6 +1192,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; @@ -1193,6 +1202,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ @@ -1392,7 +1402,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = 0; #undef SCHED_FEAT -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ @@ -1522,7 +1532,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -1692,8 +1702,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = true; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } @@ -2213,7 +2223,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 505a41c42b96..9d74371e4aad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -7,8 +7,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ -cpumask_var_t sched_domains_tmpmask; -cpumask_var_t sched_domains_tmpmask2; +static cpumask_var_t sched_domains_tmpmask; +static cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } @@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; @@ -1604,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1633,6 +1641,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve } /* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. + */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + +/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -1644,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) { + dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) @@ -1704,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); diff --git a/kernel/signal.c b/kernel/signal.c index 5843c541fda9..e4aad0e90882 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3460,7 +3460,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) } static int -do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) +do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, + size_t min_ss_size) { struct task_struct *t = current; @@ -3490,7 +3491,7 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) ss_size = 0; ss_sp = NULL; } else { - if (unlikely(ss_size < MINSIGSTKSZ)) + if (unlikely(ss_size < min_ss_size)) return -ENOMEM; } @@ -3508,7 +3509,8 @@ SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, - current_user_stack_pointer()); + current_user_stack_pointer(), + MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; @@ -3519,7 +3521,8 @@ int restore_altstack(const stack_t __user *uss) stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; - (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); + (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), + MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } @@ -3553,7 +3556,8 @@ static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, - compat_user_stack_pointer()); + compat_user_stack_pointer(), + COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); diff --git a/kernel/softirq.c b/kernel/softirq.c index 6f584861d329..7a0720a20003 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -301,7 +301,8 @@ restart: pending >>= softirq_bit; } - rcu_bh_qs(); + if (__this_cpu_read(ksoftirqd) == current) + rcu_softirq_qs(); local_irq_disable(); pending = local_softirq_pending(); diff --git a/kernel/torture.c b/kernel/torture.c index 1ac24a826589..17d91f5fba2a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -573,7 +573,7 @@ static int stutter; * Block until the stutter interval ends. This must be called periodically * by all running kthreads that need to be subject to stuttering. */ -void stutter_wait(const char *title) +bool stutter_wait(const char *title) { int spt; @@ -590,6 +590,7 @@ void stutter_wait(const char *title) } torture_shutdown_absorb(title); } + return !!spt; } EXPORT_SYMBOL_GPL(stutter_wait); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index f704390db9fc..d8765c952fab 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -5,12 +5,12 @@ * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org> */ +#include <linux/trace_clock.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/ktime.h> #include <linux/module.h> #include <linux/printk.h> #include <linux/string.h> @@ -25,13 +25,13 @@ MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default ir static void busy_wait(ulong time) { - ktime_t start, end; - start = ktime_get(); + u64 start, end; + start = trace_clock_local(); do { - end = ktime_get(); + end = trace_clock_local(); if (kthread_should_stop()) break; - } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); + } while ((end - start) < (time * 1000)); } static int preemptirq_delay_run(void *data) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 85f6b01431c7..d239004aaf29 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -738,16 +738,30 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(char *field_type, - char *field_name) +static struct synth_field *parse_synth_field(int argc, char **argv, + int *consumed) { struct synth_field *field; + const char *prefix = NULL; + char *field_type = argv[0], *field_name; int len, ret = 0; char *array; if (field_type[0] == ';') field_type++; + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + len = strlen(field_name); if (field_name[len - 1] == ';') field_name[len - 1] = '\0'; @@ -760,11 +774,15 @@ static struct synth_field *parse_synth_field(char *field_type, array = strchr(field_name, '['); if (array) len += strlen(array); + if (prefix) + len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); if (!field->type) { ret = -ENOMEM; goto free; } + if (prefix) + strcat(field->type, prefix); strcat(field->type, field_type); if (array) { strcat(field->type, array); @@ -1009,7 +1027,7 @@ static int create_synth_event(int argc, char **argv) struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; bool delete_event = false; - int i, n_fields = 0, ret = 0; + int i, consumed = 0, n_fields = 0, ret = 0; char *name; mutex_lock(&synth_event_mutex); @@ -1061,16 +1079,16 @@ static int create_synth_event(int argc, char **argv) goto err; } - field = parse_synth_field(argv[i], argv[i + 1]); + field = parse_synth_field(argc - i, &argv[i], &consumed); if (IS_ERR(field)) { ret = PTR_ERR(field); goto err; } - fields[n_fields] = field; - i++; n_fields++; + fields[n_fields++] = field; + i += consumed - 1; } - if (i < argc) { + if (i < argc && strcmp(argv[i], ";") != 0) { ret = -EINVAL; goto err; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index bf2c06ef9afc..a3be42304485 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -28,8 +28,8 @@ #include <linux/sched/task.h> #include <linux/static_key.h> -extern struct tracepoint * const __start___tracepoints_ptrs[]; -extern struct tracepoint * const __stop___tracepoints_ptrs[]; +extern tracepoint_ptr_t __start___tracepoints_ptrs[]; +extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; DEFINE_SRCU(tracepoint_srcu); EXPORT_SYMBOL_GPL(tracepoint_srcu); @@ -371,25 +371,17 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, +static void for_each_tracepoint_range( + tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { + tracepoint_ptr_t *iter; + if (!begin) return; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { - const int *iter; - - for (iter = (const int *)begin; iter < (const int *)end; iter++) - fct(offset_to_ptr(iter), priv); - } else { - struct tracepoint * const *iter; - - for (iter = begin; iter < end; iter++) - fct(*iter, priv); - } + for (iter = begin; iter < end; iter++) + fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES |