diff options
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 324 |
1 files changed, 272 insertions, 52 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..d91c9156fab2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include <uapi/linux/sched/types.h> #include <linux/prefetch.h> #include <linux/delay.h> -#include <linux/stop_machine.h> #include <linux/random.h> #include <linux/trace_events.h> #include <linux/suspend.h> @@ -55,6 +54,7 @@ #include <linux/oom.h> #include <linux/smpboot.h> #include <linux/jiffies.h> +#include <linux/slab.h> #include <linux/sched/isolation.h> #include <linux/sched/clock.h> #include "../time/tick-internal.h" @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); @@ -529,16 +529,6 @@ static struct rcu_node *rcu_get_root(void) } /* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - -/* * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -800,8 +791,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); @@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { @@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2174,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { @@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); @@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 + +/** + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_head *head; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; + spinlock_t lock; + struct delayed_work monitor_work; + bool monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + +/* + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. + */ +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; + + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; + spin_lock_irqsave(&krcp->lock, flags); + head = krwp->head_free; + krwp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + + next = head->next; + // Potentially optimize with kfree_bulk in future. + debug_rcu_head_unqueue(head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. */ + kfree((void *)head - offset); + } + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (!krwp) + return false; + + krwp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + krcp->monitor_todo = false; + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * Queue a request for lazy invocation of kfree() after a grace period. + * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !krcp->monitor_todo) { + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } + +unlock_return: + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || krcp->monitor_todo) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + krcp->monitor_todo = true; + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to @@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); @@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); |