diff options
Diffstat (limited to 'kernel')
55 files changed, 1450 insertions, 615 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index dc5c77544fd6..17ea6d4a9a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 80983df92cd4..1f37f15117e5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { struct sk_buff *skb; - DECLARE_WAITQUEUE(wait, current); flush_hold_queue(); @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) audit_printk_skb(skb); continue; } - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } @@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); audit_log_task_info(ab, current); - audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", + audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e242e3a9864a..80f29e015570 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) chunk->owners[i].index = i; } fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + chunk->mark.mask = FS_IN_IGNORED; return chunk; } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 45427239f375..0daf7f6ae7df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ -obj-y := core.o syscall.o verifier.o - +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o ifdef CONFIG_TEST_BPF -obj-y += test_stub.o +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0c30c59b317..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp) schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 801f5f3b9307..9f81818f2941 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(&old->regs[i], &cur->regs[i], sizeof(old->regs[0])) != 0) { if (old->regs[i].type == NOT_INIT || - old->regs[i].type == UNKNOWN_VALUE) + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) continue; return false; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/cpu.c b/kernel/cpu.c index 356450f09c1f..5d220234b3ca 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). */ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -84,6 +86,16 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +static void apply_puts_pending(int max) +{ + int delta; + + if (atomic_read(&cpu_hotplug.puts_pending) >= max) { + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } +} + void get_online_cpus(void) { might_sleep(); @@ -91,6 +103,7 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); + apply_puts_pending(65536); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); } @@ -103,6 +116,7 @@ bool try_get_online_cpus(void) if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); + apply_puts_pending(65536); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); return true; @@ -113,7 +127,11 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -155,6 +173,7 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + apply_puts_pending(1); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087b..723cfc9d0ad7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) goto out; } + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. + */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + ret = 0; out: rcu_read_unlock(); @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, goto out_unlock; cgroup_taskset_for_each(task, tset) { - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - ret = -EINVAL; - if (task->flags & PF_NO_SETAFFINITY) + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1425d07018de..3e19d3ebc29c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1562,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group if (!task) { /* - * Per cpu events are removed via an smp call and - * the removal is always successful. + * Per cpu events are removed via an smp call. The removal can + * fail if the CPU is currently offline, but in that case we + * already called __perf_remove_from_context from + * perf_event_exit_cpu. */ cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; @@ -4458,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle, } } -static void perf_sample_regs_user(struct perf_regs_user *regs_user, +static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (!user_mode(regs)) { @@ -4469,11 +4471,22 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user, } if (regs) { - regs_user->regs = regs; regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; } } +static void perf_sample_regs_intr(struct perf_regs *regs_intr, + struct pt_regs *regs) +{ + regs_intr->regs = regs; + regs_intr->abi = perf_reg_abi(current); +} + + /* * Get remaining task size from user stack pointer. * @@ -4855,6 +4868,23 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); + if (sample_type & PERF_SAMPLE_REGS_INTR) { + u64 abi = data->regs_intr.abi; + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_intr; + + perf_output_sample_regs(handle, + data->regs_intr.regs, + mask); + } + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4920,12 +4950,13 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + perf_sample_regs_user(&data->regs_user, regs); + if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); - perf_sample_regs_user(&data->regs_user, regs); - if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); @@ -4941,15 +4972,11 @@ void perf_prepare_sample(struct perf_event_header *header, * in case new sample type is added, because we could eat * up the rest of the sample size. */ - struct perf_regs_user *uregs = &data->regs_user; u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); - if (!uregs->abi) - perf_sample_regs_user(uregs, regs); - stack_size = perf_sample_ustack_size(stack_size, header->size, - uregs->regs); + data->regs_user.regs); /* * If there is something to dump, add space for the dump @@ -4962,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_intr(&data->regs_intr, regs); + + if (data->regs_intr.regs) { + u64 mask = event->attr.sample_regs_intr; + + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6071,11 +6113,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -6085,8 +6122,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6204,8 +6239,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -6431,8 +6464,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6511,8 +6542,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6542,7 +6571,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* @@ -7162,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + ret = perf_reg_validate(attr->sample_regs_intr); out: return ret; @@ -8130,7 +8161,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = false }; + struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; perf_pmu_rotate_stop(ctx->pmu); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1d0af8a2c646..ed8f2cde34c5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void) if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff953..232c4bc8bcc9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); + sched_annotate_sleep(); retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; @@ -1210,6 +1213,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage diff --git a/kernel/fork.c b/kernel/fork.c index 9b7d746d6d62..9ca84189cfc2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); + /* + * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * without an RCU grace period, see __lock_task_sighand(). + */ kmem_cache_free(sighand_cachep, sighand); } } - /* * Initialize POSIX timer handling for a thread group. */ diff --git a/kernel/futex.c b/kernel/futex.c index f3a3a071283c..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key) futex_get_mm(key); /* implies MB (B) */ break; default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { @@ -641,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1521,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. - */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1619,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1634,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1710,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1727,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index cf66c5c8458e..3b7408759bdf 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..80f7a6d00519 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (wait != UMH_WAIT_PROC) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } @@ -258,26 +281,6 @@ static int call_helper(void *data) return ____call_usermodehelper(data); } -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) kmod_thread_locker = NULL; } - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } - sub_info->complete = &done; + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dadbf88c22c4..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -378,8 +378,14 @@ done: * reschedule now, before we try-lock the mutex. This avoids getting * scheduled out right after we obtained the mutex. */ - if (need_resched()) + if (need_resched()) { + /* + * We _should_ have TASK_RUNNING here, but just in case + * we do not, make it so, otherwise we might get stuck. + */ + __set_current_state(TASK_RUNNING); schedule_preempt_disabled(); + } return false; } diff --git a/kernel/module.c b/kernel/module.c index 88cec1ddb1e3..e52a8739361a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3097,6 +3097,32 @@ static int may_init_module(void) } /* + * Can't use wait_event_interruptible() because our condition + * 'finished_loading()' contains a blocking primitive itself (mutex_lock). + */ +static int wait_finished_loading(struct module *mod) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + add_wait_queue(&module_wq, &wait); + for (;;) { + if (finished_loading(mod->name)) + break; + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&module_wq, &wait); + + return ret; +} + +/* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu * memory exhaustion. @@ -3116,8 +3142,8 @@ again: || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); + + err = wait_finished_loading(mod); if (err) goto out_unlocked; goto again; diff --git a/kernel/panic.c b/kernel/panic.c index d09dc5c32c67..cf80672b7924 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -244,6 +244,7 @@ static const struct tnt tnts[] = { * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. + * 'L' - A soft lockup has previously occurred. * * The string is overwritten by the next call to print_tainted(). */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..1f35a3478f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4ca9a33ff620..c347e3ce3a55 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -146,7 +146,7 @@ static int platform_suspend_prepare(suspend_state_t state) static int platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ? + return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? freeze_ops->prepare() : 0; } @@ -164,7 +164,7 @@ static void platform_resume_noirq(suspend_state_t state) static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops->restore) + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) freeze_ops->restore(); } diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..e6fae503d1bc 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,6 +1,6 @@ obj-y += update.o srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ff1a6de62f17..07bb02eda844 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +void rcu_early_boot_tests(void); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 240fa9094f83..4d559baf06e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg) cur_ops->cb_barrier(); stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); + vfree(rhp); torture_kthread_stopping("rcu_torture_cbflood"); return 0; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c0623fc47125..0db5649f8817 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -247,7 +247,7 @@ void rcu_bh_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) @@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_init(void) +void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + rcu_early_boot_tests(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e47223095..7680fc275036 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ -DEFINE_PER_CPU(struct rcu_data, sname##_data) +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled preemption. */ -void rcu_note_context_switch(int cpu) +void rcu_note_context_switch(void) { trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(cpu); + rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); @@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); -static int rcu_pending(int cpu); +static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. @@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, - bool user) +static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { @@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } - rcu_prepare_for_idle(smp_processor_id()); + rcu_prepare_for_idle(); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(rdtp, oldval, user); + rcu_eqs_enter_common(oldval, user); } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } @@ -589,7 +576,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -639,8 +626,8 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, true); - rcu_sysidle_enter(rdtp, 1); + rcu_eqs_enter_common(oldval, true); + rcu_sysidle_enter(1); local_irq_restore(flags); } @@ -651,16 +638,17 @@ void rcu_irq_exit(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, - int user) +static void rcu_eqs_exit_common(long long oldval, int user) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(oldval, user); } } @@ -712,7 +700,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -763,8 +751,8 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, true); - rcu_sysidle_exit(rdtp, 1); + rcu_eqs_exit_common(oldval, true); + rcu_sysidle_exit(1); local_irq_restore(flags); } @@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); @@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(); } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) + rcu_preempt_check_callbacks(); + if (rcu_pending()) invoke_rcu_core(); if (user) rcu_note_voluntary_context_switch(current); @@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { + cpumask_var_t cm; + bool cma = false; + int cpu; long firstsnap, s, snap; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; @@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ + cma = zalloc_cpumask_var(&cm, GFP_KERNEL); + if (cma) { + cpumask_copy(cm, cpu_online_mask); + cpumask_clear_cpu(raw_smp_processor_id(), cm); + for_each_cpu(cpu, cm) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + cpumask_clear_cpu(cpu, cm); + } + if (cpumask_weight(cm) == 0) + goto all_cpus_idle; + } + /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cpu_online_mask, + while (try_stop_cpus(cma ? cm : cpu_online_mask, synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); @@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); + free_cpumask_var(cm); return; } @@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } @@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); + free_cpumask_var(cm); return; } @@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void) /* CPU hotplug operation in flight, use normal GP. */ wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } snap = atomic_long_read(&rsp->expedited_start); @@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void) } atomic_long_inc(&rsp->expedited_stoppedcpus); +all_cpus_idle: + free_cpumask_var(cm); + /* * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still @@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -static int rcu_pending(int cpu) +static int rcu_pending(void) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) return 1; return 0; } @@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!rdp->nxtlist) continue; hc = true; @@ -3299,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); @@ -3480,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self, case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) + for_each_rcu_flavor(rsp) { rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } break; default: break; @@ -3761,6 +3781,8 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + + rcu_early_boot_tests(); } #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d03764652d91..8e7b1843896e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -139,7 +139,7 @@ struct rcu_node { unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ - /* complete (only for TREE_PREEMPT_RCU). */ + /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); +static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -579,14 +579,15 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, #endif /* #ifdef CONFIG_RCU_BOOST */ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); +static void rcu_cleanup_after_idle(void); +static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); @@ -605,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_enter(int irq); +static void rcu_sysidle_exit(int irq); static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd4599344..3ec85cb5d544 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -30,14 +30,24 @@ #include <linux/smpboot.h> #include "../time/tick-internal.h" -#define RCU_KTHREAD_PRIO 1 - #ifdef CONFIG_RCU_BOOST + #include "../locking/rtmutex_common.h" -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else -#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO -#endif + +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE pr_info("\tRCU torture testing starts during boot.\n"); #endif -#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); -#endif #if defined(CONFIG_RCU_CPU_STALL_INFO) pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif @@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#endif } -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; @@ -156,7 +166,7 @@ static void rcu_preempt_qs(void) * * Caller must disable preemption. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu) !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t) } } -#ifdef CONFIG_RCU_CPU_STALL_VERBOSE - /* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. @@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - #ifdef CONFIG_RCU_CPU_STALL_INFO static void rcu_print_task_stall_begin(struct rcu_node *rnp) @@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * * Caller must disable hard irqs. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; @@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu) return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending && - !per_cpu(rcu_preempt_data, cpu).passed_quiesce) + __this_cpu_read(rcu_preempt_data.qs_pending) && + !__this_cpu_read(rcu_preempt_data.passed_quiesce)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -919,7 +919,7 @@ void exit_rcu(void) __rcu_read_unlock(); } -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *rcu_state_p = &rcu_sched_state; @@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { } @@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * Because preemptible RCU does not exist, it never has any callbacks * to check. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { } @@ -1070,7 +1070,7 @@ void exit_rcu(void) { } -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_BOOST_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } @@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu, NULL); + return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up * after it. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { } @@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu) * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { } @@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *dj) +int rcu_needs_cpu(unsigned long *dj) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { *dj = ULONG_MAX; return 0; } @@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) * * The caller must have disabled interrupts. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; int tne; @@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu) /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu, NULL)) + if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* @@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu) return; rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; @@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu) * any grace periods that elapsed while the CPU was idle, and if any * callbacks are now ready to invoke, initiate invocation. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -2050,6 +2050,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } /* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2546,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) rdp->nocb_leader = rdp_spawn; if (rdp_last && rdp != rdp_spawn) rdp_last->nocb_next_follower = rdp; - rdp_last = rdp; - rdp = rdp->nocb_next_follower; - rdp_last->nocb_next_follower = NULL; + if (rdp == rdp_spawn) { + rdp = rdp->nocb_next_follower; + } else { + rdp_last = rdp; + rdp = rdp->nocb_next_follower; + rdp_last->nocb_next_follower = NULL; + } } while (rdp); rdp_spawn->nocb_next_follower = rdp_old_leader; } @@ -2642,6 +2673,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } @@ -2728,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */ * to detect full-system idle states, not RCU quiescent states and grace * periods. The caller must have disabled interrupts. */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { unsigned long j; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) @@ -2799,8 +2837,10 @@ void rcu_sysidle_force_exit(void) * usermode execution does -not- count as idle here! The caller must * have disabled interrupts. */ -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -3094,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { } -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ef8ba58694e..e0d31a345ee6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); - /* FIXME: Add housekeeping affinity. */ + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current); /* * Each pass through the following loop makes one check for @@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void) } #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_PROVE_RCU + +/* + * Early boot self test parameters, one for each flavor + */ +static bool rcu_self_test; +static bool rcu_self_test_bh; +static bool rcu_self_test_sched; + +module_param(rcu_self_test, bool, 0444); +module_param(rcu_self_test_bh, bool, 0444); +module_param(rcu_self_test_sched, bool, 0444); + +static int rcu_self_test_counter; + +static void test_callback(struct rcu_head *r) +{ + rcu_self_test_counter++; + pr_info("RCU test callback executed %d\n", rcu_self_test_counter); +} + +static void early_boot_test_call_rcu(void) +{ + static struct rcu_head head; + + call_rcu(&head, test_callback); +} + +static void early_boot_test_call_rcu_bh(void) +{ + static struct rcu_head head; + + call_rcu_bh(&head, test_callback); +} + +static void early_boot_test_call_rcu_sched(void) +{ + static struct rcu_head head; + + call_rcu_sched(&head, test_callback); +} + +void rcu_early_boot_tests(void) +{ + pr_info("Running RCU self tests\n"); + + if (rcu_self_test) + early_boot_test_call_rcu(); + if (rcu_self_test_bh) + early_boot_test_call_rcu_bh(); + if (rcu_self_test_sched) + early_boot_test_call_rcu_sched(); +} + +static int rcu_verify_early_boot_tests(void) +{ + int ret = 0; + int early_boot_test_counter = 0; + + if (rcu_self_test) { + early_boot_test_counter++; + rcu_barrier(); + } + if (rcu_self_test_bh) { + early_boot_test_counter++; + rcu_barrier_bh(); + } + if (rcu_self_test_sched) { + early_boot_test_counter++; + rcu_barrier_sched(); + } + + if (rcu_self_test_counter != early_boot_test_counter) { + WARN_ON(1); + ret = -1; + } + + return ret; +} +late_initcall(rcu_verify_early_boot_tests); +#else +void rcu_early_boot_tests(void) {} +#endif /* CONFIG_PROVE_RCU */ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44999505e1bf..bb398c0c5f08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1407,7 +1411,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!is_idle_task(rq->curr)) - return; + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); @@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu) /* Else cpu is not in idle, do nothing here */ raw_spin_unlock_irqrestore(&rq->lock, flags); } + +out: + rcu_read_unlock(); } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; - INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); + preempt_enable(); if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev, context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2475,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (task_current(rq, p) && task_on_rq_queued(p)) { - update_rq_clock(rq); - ns = rq_clock_task(rq) - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -2521,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns = 0; + u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* @@ -2540,7 +2489,16 @@ unsigned long long task_sched_runtime(struct task_struct *p) #endif rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &flags); return ns; @@ -2802,7 +2760,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); prev = rq->curr; schedule_debug(prev); @@ -2855,15 +2813,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); @@ -2903,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void) * or we have been woken up remotely but the IPI has not yet arrived, * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != IN_USER, but that will trigger + * too frequently to make sense yet. */ - user_exit(); + enum ctx_state prev_state = exception_enter(); schedule(); - user_enter(); + exception_exit(prev_state); } #endif @@ -2951,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -4637,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP /* * move_queued_task - move a queued task to new rq. @@ -6087,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6259,7 +6332,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6274,6 +6347,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6327,6 +6450,10 @@ static void sched_init_numa(void) if (!sched_debug()) break; } + + if (!level) + return; + /* * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). @@ -6406,6 +6533,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -7158,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + if (WARN_ONCE(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change)) + __set_current_state(TASK_RUNNING); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ @@ -7189,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7403,8 +7552,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgrp_id, - lockdep_is_held(&tsk->sighand->siglock)), + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); tsk->sched_task_group = tg; @@ -7833,6 +7986,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8205,6 +8363,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..020039bd1326 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); void cpudl_cleanup(struct cpudl *cp); -#else -#define cpudl_set(cp, cpu, dl) do { } while (0) -#define cpudl_init() do { } while (0) #endif /* CONFIG_SMP */ #endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#else -#define cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) #endif #endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1b..e5db8c6feebd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); @@ -532,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -625,7 +628,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) @@ -847,8 +850,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted @@ -914,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -999,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { hrtick_start(rq, p->dl.runtime); } +#else /* !CONFIG_SCHED_HRTICK */ +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ +} #endif static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1052,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) /* Running task will never be pushed. */ dequeue_pushable_dl_task(rq, p); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); -#endif set_post_schedule(rq); @@ -1074,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) start_hrtick_dl(rq, p); -#endif } static void task_fork_dl(struct task_struct *p) @@ -1314,6 +1328,7 @@ static int push_dl_task(struct rq *rq) { struct task_struct *next_task; struct rq *later_rq; + int ret = 0; if (!rq->dl.overloaded) return 0; @@ -1359,7 +1374,6 @@ retry: * The task is still there. We don't try * again, some other cpu will pull it when ready. */ - dequeue_pushable_dl_task(rq, next_task); goto out; } @@ -1375,6 +1389,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + ret = 1; resched_curr(later_rq); @@ -1383,7 +1398,7 @@ retry: out: put_task_struct(next_task); - return 1; + return ret; } static void push_dl_tasks(struct rq *rq) @@ -1489,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1498,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). @@ -1518,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1567,22 +1603,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. */ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1603,12 +1665,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } @@ -1678,4 +1745,15 @@ const struct sched_class dl_sched_class = { .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, + + .update_curr = update_curr_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f20..92cc52001e74 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -329,6 +335,7 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); print_rq(m, rq, cpu); spin_unlock_irqrestore(&sched_debug_lock, flags); @@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf3e708..df2cdf77f899 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -828,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -867,7 +873,6 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; - struct list_head task_list; struct rcu_head rcu; nodemask_t active_nodes; @@ -895,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -914,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -930,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -942,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) + return 0; + + total_faults = p->numa_group->total_faults; + + if (!total_faults) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1083,6 +1174,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1162,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); + + /* + * Because we have preemption enabled we can get migrated around and + * end try selecting ourselves (current == env->p) as a swap candidate. + */ + if (cur == env->p) + goto unlock; /* * "imp" is the fault differential for the source task between the @@ -1185,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1200,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1303,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1331,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); @@ -1408,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1520,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1557,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1584,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1605,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. */ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1639,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); spin_unlock_irq(group_lock); - max_nid = max_group_nid; + max_nid = preferred_group_nid(p, max_group_nid); } if (max_faults) { @@ -1682,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * @@ -1691,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; - list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); } @@ -1750,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; - list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; grp->nr_tasks++; @@ -1776,27 +1996,23 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; - list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1819,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. - */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1876,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -4446,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else { + } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -4524,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); @@ -5166,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5205,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -6149,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) + sds->local_stat.group_has_free_capacity) { sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + sgs->group_type = group_classify(sg, sgs); + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -7938,6 +8143,8 @@ const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .update_curr = update_curr_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 67ad4e7f506a..c65dac8c97cd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task return 0; } +static void update_curr_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, + .update_curr = update_curr_idle, }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d024e6ce30ba..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (p->nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1351,16 +1348,22 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule @@ -2128,6 +2131,8 @@ const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + + .update_curr = update_curr_rt, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24156c8434d1..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,6 +176,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -1127,6 +1164,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1135,6 +1177,8 @@ struct sched_class { unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); + void (*update_curr) (struct rq *rq); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif @@ -1502,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 67426e529f59..79ffec45a6ac 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) return 0; } +static void update_curr_stop(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU stop tasks: */ @@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, + .update_curr = update_curr_stop, }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. + */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..19e35135fc60 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, local_irq_restore(*flags); break; } - + /* + * This sighand can be already freed and even reused, but + * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * initializes ->siglock: this slab can't go away, it has + * the same object type, ->siglock can't be reinitialized. + * + * We need to ensure that tsk->sighand is still the same + * after we take the lock, we can race with de_thread() or + * __exit_signal(). In the latter case the next iteration + * must see ->sighand == NULL. + */ spin_lock(&sighand->siglock); if (likely(sighand == tsk->sighand)) { rcu_read_unlock(); @@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) int error = -ESRCH; struct task_struct *p; - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. - */ - goto retry; - } - rcu_read_unlock(); + for (;;) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); + if (p) + error = group_send_sig_info(sig, info, p); + rcu_read_unlock(); + if (likely(!p || error != -ESRCH)) + return error; - return error; + /* + * The task was unhashed in between, try again. If it + * is dead, pid_task() will return NULL, if we race with + * de_thread() it will find the new leader. + */ + } } int kill_proc_info(int sig, struct siginfo *info, pid_t pid) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 0699add19164..501baa9ac1be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); local_irq_enable(); cond_resched(); return; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d9fe74..15f2511a1b7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 492b986195d5..a16b67859e2a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime; break; } return 0; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b5741fc4110..1f4356037a7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, last_jiffies = jiffies; } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || + if (rcu_needs_cpu(&rcu_delta_jiffies) || arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3260ffdb368f..2d3f5c504939 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) void update_process_times(int user_tick) { struct task_struct *p = current; - int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb186b9ddf51..31c90fec4158 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) * when we are adding another op to the rec or removing the * current one. Thus, if the op is being added, we can * ignore it because it hasn't attached itself to the rec - * yet. That means we just need to find the op that has a - * trampoline and is not beeing added. + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. */ do_for_each_ftrace_op(op, ftrace_ops_list) { @@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) if (op->flags & FTRACE_OPS_FL_ADDING) continue; + /* - * If the ops is not being added and has a trampoline, - * then it must be the one that we want! + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. */ - if (hash_contains_ip(ip, op->func_hash)) - return op; - - /* If the ops is being modified, it may be in the old hash. */ if ((op->flags & FTRACE_OPS_FL_MODIFYING) && hash_contains_ip(ip, &op->old_hash)) return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; } while_for_each_ftrace_op(op); @@ -2293,10 +2307,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } -static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3340,7 +3357,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3365,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3477,13 +3495,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(old_hash); + if (!ret) free_ftrace_hash_rcu(old_hash); else count = ret; - __enable_ftrace_function_probe(); - out_unlock: mutex_unlock(&ftrace_lock); out: @@ -3764,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3813,7 +3833,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops); + ftrace_ops_update_code(ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4058,7 +4078,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops); + ftrace_ops_update_code(iter->ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..a56e07c8d15b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } finish_wait(&work->waiters, &wait); - return 0; + + return ret; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..92f4a6cee172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if (signal_pending(current)) - return -EINTR; } return 1; @@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&trace_types_lock); if (ret) { size = ret; goto out_unlock; } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } goto again; } size = 0; @@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - ssize_t ret; + ssize_t ret = 0; mutex_lock(&trace_types_lock); @@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) + if (!ref) { + ret = -ENOMEM; break; + } ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { + ret = -ENOMEM; kfree(ref); break; } @@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { + if (ret) + goto out; + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; goto out; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, true); mutex_lock(&trace_types_lock); if (ret) goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + goto again; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4dc8b79c5f75..29228c4d5696 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; |