diff options
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 111 |
1 files changed, 80 insertions, 31 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 028520702717..8f45cdb6463b 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_FULL: return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static u32 psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set, + bool wake_clock) { struct psi_group_cpu *groupc; + u32 state_mask = 0; unsigned int t, m; enum psi_states s; - u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu, if (!(m & (1 << t))) continue; if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } groupc->tasks[t]--; @@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); - return state_mask; + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) return &psi_system; } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set) { - int cpu = task_cpu(task); - struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; - - if (!task->pid) - return; - if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, + task->pid, task->comm, task_cpu(task), task->psi_flags, clear, set); psi_bug = 1; } task->psi_flags &= ~clear; task->psi_flags |= set; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + bool wake_clock = true; + void *iter = NULL; + + if (!task->pid) + return; + + psi_flags_change(task, clear, set); /* * Periodic aggregation shuts off if there is a period of no @@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set) wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; - while ((group = iterate_groups(task, &iter))) { - u32 state_mask = psi_group_change(group, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set, wake_clock); +} + +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + struct psi_group *group, *common = NULL; + int cpu = task_cpu(prev); + void *iter; + + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * When moving state between tasks, the group that + * contains them both does not change: we can stop + * updating the tree once we reach the first common + * ancestor. Iterate @next's ancestors until we + * encounter @prev's state. + */ + iter = NULL; + while ((group = iterate_groups(next, &iter))) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + common = group; + break; + } + + psi_group_change(group, cpu, 0, TSK_ONCPU, true); + } + } + + /* + * If this is a voluntary sleep, dequeue will have taken care + * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We + * only need to deal with it during preemption. + */ + if (sleep) + return; - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + if (prev->pid) { + psi_flags_change(prev, TSK_ONCPU, 0); - if (wake_clock && !delayed_work_pending(&group->avgs_work)) - schedule_delayed_work(&group->avgs_work, PSI_FREQ); + iter = NULL; + while ((group = iterate_groups(prev, &iter)) && group != common) + psi_group_change(group, cpu, TSK_ONCPU, 0, true); } } @@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) + if (task_on_rq_queued(task)) { task_flags = TSK_RUNNING; - else if (task->in_iowait) + if (task_current(rq, task)) + task_flags |= TSK_ONCPU; + } else if (task->in_iowait) task_flags = TSK_IOWAIT; - if (task->flags & PF_MEMSTALL) + if (task->in_memstall) task_flags |= TSK_MEMSTALL; if (task_flags) |