summary refs log tree commit diff stats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r-- include/linux/backing-dev.h 26
-rw-r--r-- include/linux/blk-cgroup.h 340
-rw-r--r-- include/linux/cgroup_subsys.h 2
-rw-r--r-- include/linux/kernfs.h 4
-rw-r--r-- include/trace/events/writeback.h 180
5 files changed, 397 insertions, 155 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
* %current's blkcg equals the effective blkcg of its memcg. No
* need to use the relatively expensive cgroup_get_e_css().
*/
- if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
return wb;
return NULL;
}
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
}
struct wb_iter {
- int start_blkcg_id;
+ int start_memcg_id;
struct radix_tree_iter tree_iter;
void **slot;
};
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
WARN_ON_ONCE(!rcu_read_lock_held());
- if (iter->start_blkcg_id >= 0) {
- iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
- iter->start_blkcg_id = -1;
+ if (iter->start_memcg_id >= 0) {
+ iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+ iter->start_memcg_id = -1;
} else {
iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
}
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
struct backing_dev_info *bdi,
- int start_blkcg_id)
+ int start_memcg_id)
{
- iter->start_blkcg_id = start_blkcg_id;
+ iter->start_memcg_id = start_memcg_id;
- if (start_blkcg_id)
+ if (start_memcg_id)
return __wb_iter_next(iter, bdi);
else
return &bdi->wb;
}
/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
* @wb_cur: cursor struct bdi_writeback pointer
* @bdi: bdi to walk wb's of
* @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
*
* Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
* to be used as temp storage during iteration. rcu_read_lock() must be
* held throughout iteration.
*/
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
- for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
+ for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
(wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
#else /* CONFIG_CGROUP_WRITEBACK */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a4cd1641e9e2..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
*/
#include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
#include <linux/atomic.h>
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX
@@ -45,7 +48,7 @@ struct blkcg {
struct blkcg_gq *blkg_hint;
struct hlist_head blkg_list;
- struct blkcg_policy_data *pd[BLKCG_MAX_POLS];
+ struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
#endif
};
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive. Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
struct blkg_stat {
- struct u64_stats_sync syncp;
- uint64_t cnt;
+ struct percpu_counter cpu_cnt;
+ atomic64_t aux_cnt;
};
struct blkg_rwstat {
- struct u64_stats_sync syncp;
- uint64_t cnt[BLKG_RWSTAT_NR];
+ struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
+ atomic64_t aux_cnt[BLKG_RWSTAT_NR];
};
/*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
* request_queue (q). This is used by blkcg policies which need to track
* information per blkcg - q pair.
*
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size. blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
*/
struct blkg_policy_data {
/* the blkg and policy id this per-policy data belongs to */
struct blkcg_gq *blkg;
int plid;
-
- /* used during policy activation */
- struct list_head alloc_node;
};
/*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
*/
struct blkcg_policy_data {
- /* the policy id this per-policy data belongs to */
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
int plid;
};
@@ -123,40 +127,50 @@ struct blkcg_gq {
/* is this blkg online? protected by both blkcg and q locks */
bool online;
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
+
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
struct rcu_head rcu_head;
};
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
struct blkcg_policy {
int plid;
- /* policy specific private data size */
- size_t pd_size;
- /* policy specific per-blkcg data size */
- size_t cpd_size;
/* cgroup files for the policy */
- struct cftype *cftypes;
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
/* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
blkcg_pol_init_cpd_fn *cpd_init_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+ blkcg_pol_bind_cpd_fn *cpd_bind_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
blkcg_pol_init_pd_fn *pd_init_fn;
blkcg_pol_online_pd_fn *pd_online_fn;
blkcg_pol_offline_pd_fn *pd_offline_fn;
- blkcg_pol_exit_pd_fn *pd_exit_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
};
extern struct blkcg blkcg_root;
extern struct cgroup_subsys_state * const blkcg_root_css;
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q);
int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
+const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
- int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol, int off);
struct blkg_conf_ctx {
struct gendisk *disk;
struct blkcg_gq *blkg;
- u64 v;
+ char *body;
};
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- const char *input, struct blkg_conf_ctx *ctx);
+ char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
- return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+ return css_to_blkcg(task_css(tsk, io_cgrp_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
static inline struct cgroup_subsys_state *
task_get_blkcg_css(struct task_struct *task)
{
- return task_get_css(task, blkio_cgrp_id);
+ return task_get_css(task, io_cgrp_id);
}
/**
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
}
/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state. If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q,
+ bool update_hint)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference(blkcg->blkg_hint);
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair. This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(blk_queue_bypass(q)))
+ return NULL;
+ return __blkg_lookup(blkcg, q, false);
+}
+
+/**
* blkg_to_pdata - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
struct blkcg_policy *pol)
{
- return blkcg ? blkcg->pd[pol->plid] : NULL;
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
}
/**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
return pd ? pd->blkg : NULL;
}
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
/**
* blkg_path - format cgroup path of blkg
* @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
- bool update_hint);
-
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
* or if either the blkcg or queue is going away. Fall back to
* root_rl in such cases.
*/
- blkg = blkg_lookup_create(blkcg, q);
- if (IS_ERR(blkg))
+ blkg = blkg_lookup(blkcg, q);
+ if (unlikely(!blkg))
goto root_rl;
blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
*/
static inline void blk_put_rl(struct request_list *rl)
{
- /* root_rl may not have blkg set */
- if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+ if (rl->blkg->blkcg != &blkcg_root)
blkg_put(rl->blkg);
}
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
{
- u64_stats_init(&stat->syncp);
+ int ret;
+
+ ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+ if (ret)
+ return ret;
+
+ atomic64_set(&stat->aux_cnt, 0);
+ return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+ percpu_counter_destroy(&stat->cpu_cnt);
}
/**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
* @stat: target blkg_stat
* @val: value to add
*
- * Add @val to @stat. The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat. The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
*/
static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
{
- u64_stats_update_begin(&stat->syncp);
- stat->cnt += val;
- u64_stats_update_end(&stat->syncp);
+ __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
}
/**
* blkg_stat_read - read the current value of a blkg_stat
* @stat: blkg_stat to read
- *
- * Read the current value of @stat. This function can be called without
- * synchroniztion and takes care of u64 atomicity.
*/
static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
{
- unsigned int start;
- uint64_t v;
-
- do {
- start = u64_stats_fetch_begin_irq(&stat->syncp);
- v = stat->cnt;
- } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
- return v;
+ return percpu_counter_sum_positive(&stat->cpu_cnt);
}
/**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
*/
static inline void blkg_stat_reset(struct blkg_stat *stat)
{
- stat->cnt = 0;
+ percpu_counter_set(&stat->cpu_cnt, 0);
+ atomic64_set(&stat->aux_cnt, 0);
}
/**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
* @to: the destination blkg_stat
* @from: the source
*
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
*/
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+ struct blkg_stat *from)
{
- blkg_stat_add(to, blkg_stat_read(from));
+ atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+ &to->aux_cnt);
}
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
- u64_stats_init(&rwstat->syncp);
+ int i, ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+ if (ret) {
+ while (--i >= 0)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ return ret;
+ }
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+ return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
}
/**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
int rw, uint64_t val)
{
- u64_stats_update_begin(&rwstat->syncp);
+ struct percpu_counter *cnt;
if (rw & REQ_WRITE)
- rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
- rwstat->cnt[BLKG_RWSTAT_READ] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+ __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
if (rw & REQ_SYNC)
- rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
else
- rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
- u64_stats_update_end(&rwstat->syncp);
+ __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
}
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
*/
static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
{
- unsigned int start;
- struct blkg_rwstat tmp;
-
- do {
- start = u64_stats_fetch_begin_irq(&rwstat->syncp);
- tmp = *rwstat;
- } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+ struct blkg_rwstat result;
+ int i;
- return tmp;
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_set(&result.aux_cnt[i],
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+ return result;
}
/**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
{
struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
- return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+ return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
}
/**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
*/
static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
{
- memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
}
/**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
* @to: the destination blkg_rwstat
* @from: the source
*
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
*/
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
- struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+ struct blkg_rwstat *from)
{
struct blkg_rwstat v = blkg_rwstat_read(from);
int i;
- u64_stats_update_begin(&to->syncp);
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- to->cnt[i] += v.cnt[i];
- u64_stats_update_end(&to->syncp);
+ atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+ atomic64_read(&from->aux_cnt[i]),
+ &to->aux_cnt[i]);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+ struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+ struct bio *bio)
+{
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ bool throtl = false;
+
+ rcu_read_lock();
+ blkcg = bio_blkcg(bio);
+
+ blkg = blkg_lookup(blkcg, q);
+ if (unlikely(!blkg)) {
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_create(blkcg, q);
+ if (IS_ERR(blkg))
+ blkg = NULL;
+ spin_unlock_irq(q->queue_lock);
+ }
+
+ throtl = blk_throtl_bio(q, blkg, bio);
+
+ if (!throtl) {
+ blkg = blkg ?: q->root_blkg;
+ blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+ bio->bi_iter.bi_size);
+ blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+ }
+
+ rcu_read_unlock();
+ return !throtl;
}
#else /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+ struct bio *bio) { return true; }
+
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1f36945fd23d..1a96fdaa33d5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
#endif
#if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
#endif
#if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
}
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{ return -ENOSYS; }
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen)
{ return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a7aa607a4c55..fff846b512e6 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
TP_ARGS(inode, flags)
);
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+ return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+ struct cgroup *cgrp = wb->memcg_css->cgroup;
+ char *path;
+
+ path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
+ WARN_ON_ONCE(path != buf);
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+ if (wbc->wb)
+ return __trace_wb_cgroup_size(wbc->wb);
+ else
+ return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+ struct writeback_control *wbc)
+{
+ if (wbc->wb)
+ __trace_wb_assign_cgroup(buf, wbc->wb);
+ else
+ strcpy(buf, "/");
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+ return 2;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+ strcpy(buf, "/");
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+ return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+ struct writeback_control *wbc)
+{
+ strcpy(buf, "/");
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+#endif /* CREATE_TRACE_POINTS */
+
DECLARE_EVENT_CLASS(writeback_write_inode_template,
TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
__array(char, name, 32)
__field(unsigned long, ino)
__field(int, sync_mode)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->sync_mode = wbc->sync_mode;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
- TP_printk("bdi %s: ino=%lu sync_mode=%d",
+ TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
__entry->name,
__entry->ino,
- __entry->sync_mode
+ __entry->sync_mode,
+ __get_str(cgroup)
)
);
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
);
DECLARE_EVENT_CLASS(writeback_work_class,
- TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
- TP_ARGS(bdi, work),
+ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+ TP_ARGS(wb, work),
TP_STRUCT__entry(
__array(char, name, 32)
__field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__field(int, range_cyclic)
__field(int, for_background)
__field(int, reason)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
strncpy(__entry->name,
- bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+ wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
__entry->nr_pages = work->nr_pages;
__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
__entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->range_cyclic = work->range_cyclic;
__entry->for_background = work->for_background;
__entry->reason = work->reason;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
- "kupdate=%d range_cyclic=%d background=%d reason=%s",
+ "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
__entry->name,
MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
__entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->for_kupdate,
__entry->range_cyclic,
__entry->for_background,
- __print_symbolic(__entry->reason, WB_WORK_REASON)
+ __print_symbolic(__entry->reason, WB_WORK_REASON),
+ __get_str(cgroup)
)
);
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
- TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
- TP_ARGS(bdi, work))
+ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+ TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
);
DECLARE_EVENT_CLASS(writeback_class,
- TP_PROTO(struct backing_dev_info *bdi),
- TP_ARGS(bdi),
+ TP_PROTO(struct bdi_writeback *wb),
+ TP_ARGS(wb),
TP_STRUCT__entry(
__array(char, name, 32)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
- TP_printk("bdi %s",
- __entry->name
+ TP_printk("bdi %s: cgroup=%s",
+ __entry->name,
+ __get_str(cgroup)
)
);
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
- TP_PROTO(struct backing_dev_info *bdi), \
- TP_ARGS(bdi))
+ TP_PROTO(struct bdi_writeback *wb), \
+ TP_ARGS(wb))
DEFINE_WRITEBACK_EVENT(writeback_nowork);
DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+ TP_PROTO(struct backing_dev_info *bdi),
+ TP_ARGS(bdi),
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ ),
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(bdi->dev), 32);
+ ),
+ TP_printk("bdi %s",
+ __entry->name
+ )
+);
DECLARE_EVENT_CLASS(wbc_class,
TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
__field(int, range_cyclic)
__field(long, range_start)
__field(long, range_end)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->range_cyclic = wbc->range_cyclic;
__entry->range_start = (long)wbc->range_start;
__entry->range_end = (long)wbc->range_end;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
"bgrd=%d reclm=%d cyclic=%d "
- "start=0x%lx end=0x%lx",
+ "start=0x%lx end=0x%lx cgroup=%s",
__entry->name,
__entry->nr_to_write,
__entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->for_reclaim,
__entry->range_cyclic,
__entry->range_start,
- __entry->range_end)
+ __entry->range_end,
+ __get_str(cgroup)
+ )
)
#define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
__field(long, age)
__field(int, moved)
__field(int, reason)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
(jiffies - *older_than_this) * 1000 / HZ : -1;
__entry->moved = moved;
__entry->reason = work->reason;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
- TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+ TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
__entry->name,
__entry->older, /* older_than_this in jiffies */
__entry->age, /* older_than_this in relative milliseconds */
__entry->moved,
- __print_symbolic(__entry->reason, WB_WORK_REASON)
+ __print_symbolic(__entry->reason, WB_WORK_REASON),
+ __get_str(cgroup)
)
);
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
TRACE_EVENT(bdi_dirty_ratelimit,
- TP_PROTO(struct backing_dev_info *bdi,
+ TP_PROTO(struct bdi_writeback *wb,
unsigned long dirty_rate,
unsigned long task_ratelimit),
- TP_ARGS(bdi, dirty_rate, task_ratelimit),
+ TP_ARGS(wb, dirty_rate, task_ratelimit),
TP_STRUCT__entry(
__array(char, bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
__field(unsigned long, dirty_ratelimit)
__field(unsigned long, task_ratelimit)
__field(unsigned long, balanced_dirty_ratelimit)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
- strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
- __entry->write_bw = KBps(bdi->wb.write_bandwidth);
- __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth);
+ strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+ __entry->write_bw = KBps(wb->write_bandwidth);
+ __entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
__entry->dirty_rate = KBps(dirty_rate);
- __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+ __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->balanced_dirty_ratelimit =
- KBps(bdi->wb.balanced_dirty_ratelimit);
+ KBps(wb->balanced_dirty_ratelimit);
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
TP_printk("bdi %s: "
"write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
- "balanced_dirty_ratelimit=%lu",
+ "balanced_dirty_ratelimit=%lu cgroup=%s",
__entry->bdi,
__entry->write_bw, /* write bandwidth */
__entry->avg_write_bw, /* avg write bandwidth */
__entry->dirty_rate, /* bdi dirty rate */
__entry->dirty_ratelimit, /* base ratelimit */
__entry->task_ratelimit, /* ratelimit with position control */
- __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+ __get_str(cgroup)
)
);
TRACE_EVENT(balance_dirty_pages,
- TP_PROTO(struct backing_dev_info *bdi,
+ TP_PROTO(struct bdi_writeback *wb,
unsigned long thresh,
unsigned long bg_thresh,
unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
long pause,
unsigned long start_time),
- TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+ TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
dirty_ratelimit, task_ratelimit,
dirtied, period, pause, start_time),
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
__field( long, pause)
__field(unsigned long, period)
__field( long, think)
+ __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
),
TP_fast_assign(
unsigned long freerun = (thresh + bg_thresh) / 2;
- strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+ strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
__entry->limit = global_wb_domain.dirty_limit;
__entry->setpoint = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
__entry->period = period * 1000 / HZ;
__entry->pause = pause * 1000 / HZ;
__entry->paused = (jiffies - start_time) * 1000 / HZ;
+ __trace_wb_assign_cgroup(__get_str(cgroup), wb);
),
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
"bdi_setpoint=%lu bdi_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u "
- "paused=%lu pause=%ld period=%lu think=%ld",
+ "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
__entry->bdi,
__entry->limit,
__entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
__entry->paused, /* ms */
__entry->pause, /* ms */
__entry->period, /* ms */
- __entry->think /* ms */
+ __entry->think, /* ms */
+ __get_str(cgroup)
)
);
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
__field(unsigned long, ino)
__field(unsigned long, state)
__field(unsigned long, dirtied_when)
+ __dynamic_array(char, cgroup,
+ __trace_wb_cgroup_size(inode_to_wb(inode)))
),
TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
__entry->ino = inode->i_ino;
__entry->state = inode->i_state;
__entry->dirtied_when = inode->dirtied_when;
+ __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
),
- TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+ TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
__entry->name,
__entry->ino,
show_inode_state(__entry->state),
__entry->dirtied_when,
- (jiffies - __entry->dirtied_when) / HZ
+ (jiffies - __entry->dirtied_when) / HZ,
+ __get_str(cgroup)
)
);
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
__field(unsigned long, writeback_index)
__field(long, nr_to_write)
__field(unsigned long, wrote)
+ __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
),
TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
__entry->writeback_index = inode->i_mapping->writeback_index;
__entry->nr_to_write = nr_to_write;
__entry->wrote = nr_to_write - wbc->nr_to_write;
+ __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
),
TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
- "index=%lu to_write=%ld wrote=%lu",
+ "index=%lu to_write=%ld wrote=%lu cgroup=%s",
__entry->name,
__entry->ino,
show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
(jiffies - __entry->dirtied_when) / HZ,
__entry->writeback_index,
__entry->nr_to_write,
- __entry->wrote
+ __entry->wrote,
+ __get_str(cgroup)
)
);