summaryrefslogtreecommitdiffstats
path: root/block/blk-cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--block/blk-cgroup.c284
1 files changed, 279 insertions, 5 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb85cb87c40f..694595b29b8f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
+static bool blkcg_debug_stats = false;
+
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
[BLKG_RWSTAT_WRITE] = "Write",
[BLKG_RWSTAT_SYNC] = "Sync",
[BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
};
const char *dname = blkg_dev_name(pd->blkg);
u64 v;
@@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
(unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
return v;
}
@@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
const char *dname;
+ char *buf;
struct blkg_rwstat rwstat;
- u64 rbytes, wbytes, rios, wios;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ size_t size = seq_get_buf(sf, &buf), off = 0;
+ int i;
+ bool has_stats = false;
dname = blkg_dev_name(blkg);
if (!dname)
continue;
+ /*
+ * Hooray string manipulation, count is the size written NOT
+ * INCLUDING THE \0, so size is now count+1 less than what we
+ * had before, but we want to start writing the next bit from
+ * the \0 so we only add count to buf.
+ */
+ off += scnprintf(buf+off, size-off, "%s ", dname);
+
spin_lock_irq(blkg->q->queue_lock);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_bytes));
rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_ios));
rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
spin_unlock_irq(blkg->q->queue_lock);
- if (rbytes || wbytes || rios || wios)
- seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
- dname, rbytes, wbytes, rios, wios);
+ if (rbytes || wbytes || rios || wios) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
+
+ if (!blkcg_debug_stats)
+ goto next;
+
+ if (atomic_read(&blkg->use_delay)) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ (unsigned long long)atomic64_read(&blkg->delay_nsec));
+ }
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ size_t written;
+
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
+
+ written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
+ if (written)
+ has_stats = true;
+ off += written;
+ }
+next:
+ if (has_stats) {
+ off += scnprintf(buf+off, size-off, "\n");
+ seq_commit(sf, off);
+ }
}
rcu_read_unlock();
@@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
+ ret = blk_iolatency_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ return ret;
+ }
+
ret = blk_throtl_init(q);
if (ret) {
spin_lock_irq(q->queue_lock);
@@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&blkcg_pol_mutex);
}
+static void blkcg_exit(struct task_struct *tsk)
+{
+ if (tsk->throttle_queue)
+ blk_put_queue(tsk->throttle_queue);
+ tsk->throttle_queue = NULL;
+}
+
struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
@@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
.dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
.legacy_name = "blkio",
+ .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
/*
* This ensures that, if available, memcg is automatically enabled
@@ -1547,3 +1615,209 @@ out_unlock:
mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+ u64 old = atomic64_read(&blkg->delay_start);
+
+ /*
+ * We only want to scale down every second. The idea here is that we
+ * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+ * time window. We only want to throttle tasks for recent delay that
+ * has occurred, in 1 second time windows since that's the maximum
+ * things can be throttled. We save the current delay window in
+ * blkg->last_delay so we know what amount is still left to be charged
+ * to the blkg from this point onward. blkg->last_use keeps track of
+ * the use_delay counter. The idea is if we're unthrottling the blkg we
+ * are ok with whatever is happening now, and we can take away more of
+ * the accumulated delay as we've already throttled enough that
+ * everybody is happy with their IO latencies.
+ */
+ if (time_before64(old + NSEC_PER_SEC, now) &&
+ atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+ u64 cur = atomic64_read(&blkg->delay_nsec);
+ u64 sub = min_t(u64, blkg->last_delay, now - old);
+ int cur_use = atomic_read(&blkg->use_delay);
+
+ /*
+ * We've been unthrottled, subtract a larger chunk of our
+ * accumulated delay.
+ */
+ if (cur_use < blkg->last_use)
+ sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+ /*
+ * This shouldn't happen, but handle it anyway. Our delay_nsec
+ * should only ever be growing except here where we subtract out
+ * min(last_delay, 1 second), but lord knows bugs happen and I'd
+ * rather not end up with negative numbers.
+ */
+ if (unlikely(cur < sub)) {
+ atomic64_set(&blkg->delay_nsec, 0);
+ blkg->last_delay = 0;
+ } else {
+ atomic64_sub(sub, &blkg->delay_nsec);
+ blkg->last_delay = cur - sub;
+ }
+ blkg->last_use = cur_use;
+ }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ u64 exp;
+ u64 delay_nsec = 0;
+ int tok;
+
+ while (blkg->parent) {
+ if (atomic_read(&blkg->use_delay)) {
+ blkcg_scale_delay(blkg, now);
+ delay_nsec = max_t(u64, delay_nsec,
+ atomic64_read(&blkg->delay_nsec));
+ }
+ blkg = blkg->parent;
+ }
+
+ if (!delay_nsec)
+ return;
+
+ /*
+ * Let's not sleep for all eternity if we've amassed a huge delay.
+ * Swapping or metadata IO can accumulate 10's of seconds worth of
+ * delay, and we want userspace to be able to do _something_ so cap the
+ * delays at 1 second. If there's 10's of seconds worth of delay then
+ * the tasks will be delayed for 1 second for every syscall.
+ */
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+ /*
+ * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+ * that hasn't landed upstream yet. Once that stuff is in place we need
+ * to do a psi_memstall_enter/leave if memdelay is set.
+ */
+
+ exp = ktime_add_ns(now, delay_nsec);
+ tok = io_schedule_prepare();
+ do {
+ __set_current_state(TASK_KILLABLE);
+ if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+ break;
+ } while (!fatal_signal_pending(current));
+ io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+ struct request_queue *q = current->throttle_queue;
+ struct cgroup_subsys_state *css;
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ bool use_memdelay = current->use_memdelay;
+
+ if (!q)
+ return;
+
+ current->throttle_queue = NULL;
+ current->use_memdelay = false;
+
+ rcu_read_lock();
+ css = kthread_blkcg();
+ if (css)
+ blkcg = css_to_blkcg(css);
+ else
+ blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+ if (!blkcg)
+ goto out;
+ blkg = blkg_lookup(blkcg, q);
+ if (!blkg)
+ goto out;
+ blkg = blkg_try_get(blkg);
+ if (!blkg)
+ goto out;
+ rcu_read_unlock();
+
+ blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+ blkg_put(blkg);
+ blk_put_queue(q);
+ return;
+out:
+ rcu_read_unlock();
+ blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This set's the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+ if (unlikely(current->flags & PF_KTHREAD))
+ return;
+
+ if (!blk_get_queue(q))
+ return;
+
+ if (current->throttle_queue)
+ blk_put_queue(current->throttle_queue);
+ current->throttle_queue = q;
+ if (use_memdelay)
+ current->use_memdelay = use_memdelay;
+ set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+ blkcg_scale_delay(blkg, now);
+ atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
+module_param(blkcg_debug_stats, bool, 0644);
+MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");