diff options
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 617 |
1 files changed, 556 insertions, 61 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -156,16 +163,21 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; + expires = group->avg_next_update; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } + + mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) |