Diffstat (limited to 'include')
-rw-r--r--   include/linux/backing-dev.h         26
-rw-r--r--   include/linux/blk-cgroup.h          340
-rw-r--r--   include/linux/cgroup_subsys.h         2
-rw-r--r--   include/linux/kernfs.h                4
-rw-r--r--   include/trace/events/writeback.h    180
5 files changed, 397 insertions, 155 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } @@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) } struct wb_iter { - int start_blkcg_id; + int start_memcg_id; struct radix_tree_iter tree_iter; void **slot; }; @@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, WARN_ON_ONCE(!rcu_read_lock_held()); - if (iter->start_blkcg_id >= 0) { - iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); - iter->start_blkcg_id = -1; + if (iter->start_memcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); + iter->start_memcg_id = -1; } else { iter->slot = radix_tree_next_slot(iter->slot, titer, 0); } @@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, struct backing_dev_info *bdi, - int start_blkcg_id) + int start_memcg_id) { - iter->start_blkcg_id = start_blkcg_id; + iter->start_memcg_id = start_memcg_id; - if (start_blkcg_id) + if (start_memcg_id) return __wb_iter_next(iter, bdi); else return &bdi->wb; } /** - * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order * @wb_cur: cursor struct bdi_writeback pointer * @bdi: bdi to walk wb's of * @iter: pointer to struct wb_iter to be used as iteration buffer - * @start_blkcg_id: blkcg ID to start iteration from + * @start_memcg_id: memcg ID to start iteration from * * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending - * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter * to be used as temp storage during iteration. rcu_read_lock() must be * held throughout iteration. 
*/ -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ - for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a4cd1641e9e2..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> +#include <linux/percpu_counter.h> #include <linux/seq_file.h> #include <linux/radix-tree.h> #include <linux/blkdev.h> #include <linux/atomic.h> +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -45,7 +48,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK @@ -53,14 +56,19 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations. + */ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -68,32 +76,28 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. 
*/ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,40 +127,50 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; - /* policy specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), @@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); -u64 blkg_stat_recursive_sum(struct 
blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); @@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** @@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) } /** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + +/** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest @@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? 
cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest @@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant @@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -394,8 +462,7 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } @@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. */ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. + * Add @from's count including the aux one to @to's aux count. 
*/ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { - u64_stats_init(&rwstat->syncp); + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} + +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. + * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + struct blkg_rwstat result; + int i; - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. 
*/ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + + rcu_read_unlock(); + return !throtl; } #else /* CONFIG_BLK_CGROUP */ @@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1f36945fd23d..1a96fdaa33d5 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -27,7 +27,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +size_t kernfs_path_len(struct kernfs_node *kn); char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } +static inline size_t kernfs_path_len(struct kernfs_node *kn) +{ return 0; } + static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a7aa607a4c55..fff846b512e6 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_ARGS(inode, flags) ); +#ifdef CREATE_TRACE_POINTS +#ifdef CONFIG_CGROUP_WRITEBACK + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return 
kernfs_path_len(wb->memcg_css->cgroup->kn) + 1; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + struct cgroup *cgrp = wb->memcg_css->cgroup; + char *path; + + path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1); + WARN_ON_ONCE(path != buf); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + if (wbc->wb) + return __trace_wb_cgroup_size(wbc->wb); + else + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + if (wbc->wb) + __trace_wb_assign_cgroup(buf, wbc->wb); + else + strcpy(buf, "/"); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return 2; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + strcpy(buf, "/"); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + strcpy(buf, "/"); +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ +#endif /* CREATE_TRACE_POINTS */ + DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), @@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __array(char, name, 32) __field(unsigned long, ino) __field(int, sync_mode) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s", __entry->name, __entry->ino, - __entry->sync_mode + __entry->sync_mode, + __get_str(cgroup) ) ); @@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, ); DECLARE_EVENT_CLASS(writeback_work_class, - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), - TP_ARGS(bdi, work), + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), + TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) @@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ - TP_ARGS(bdi, work)) + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ + TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written, ); DECLARE_EVENT_CLASS(writeback_class, - TP_PROTO(struct backing_dev_info *bdi), - TP_ARGS(bdi), + TP_PROTO(struct bdi_writeback *wb), + TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), - TP_printk("bdi %s", - __entry->name + TP_printk("bdi %s: cgroup=%s", + __entry->name, + __get_str(cgroup) ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ - TP_PROTO(struct backing_dev_info *bdi), \ - TP_ARGS(bdi)) + TP_PROTO(struct bdi_writeback *wb), \ + TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_bdi_register); + +TRACE_EVENT(writeback_bdi_register, + TP_PROTO(struct backing_dev_info *bdi), + TP_ARGS(bdi), + TP_STRUCT__entry( + __array(char, name, 32) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + ), + TP_printk("bdi %s", + __entry->name + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx", + "start=0x%lx end=0x%lx cgroup=%s", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, - __entry->range_end) + __entry->range_end, + __get_str(cgroup) + ) ) #define DEFINE_WBC_EVENT(name) \ @@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) + 
__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io, (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); @@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state, TRACE_EVENT(bdi_dirty_ratelimit, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), - TP_ARGS(bdi, dirty_rate, task_ratelimit), + TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) @@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->write_bw = KBps(bdi->wb.write_bandwidth); - __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + __entry->write_bw = KBps(wb->write_bandwidth); + __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); - __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); + __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = - KBps(bdi->wb.balanced_dirty_ratelimit); + KBps(wb->balanced_dirty_ratelimit); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu", + "balanced_dirty_ratelimit=%lu cgroup=%s", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ - __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ + __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ + __get_str(cgroup) ) ); TRACE_EVENT(balance_dirty_pages, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, @@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = 
(global_wb_domain.dirty_limit + @@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), @@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld", + "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s", __entry->bdi, __entry->limit, __entry->setpoint, @@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ - __entry->think /* ms */ + __entry->think, /* ms */ + __get_str(cgroup) ) ); @@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __field(unsigned long, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) + __dynamic_array(char, cgroup, + __trace_wb_cgroup_size(inode_to_wb(inode))) ), TP_fast_assign( @@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; + __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, - (jiffies - __entry->dirtied_when) / HZ + (jiffies - __entry->dirtied_when) / HZ, + __get_str(cgroup) ) ); @@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu", + "index=%lu to_write=%ld wrote=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, - __entry->wrote + __entry->wrote, + __get_str(cgroup) ) ); |
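The backing-dev.h hunk above renames the wb iteration cursor from blkcg IDs to memcg IDs, since the per-cgroup writeback structures are keyed by memcg ID in the bdi's radix tree. Below is a minimal sketch of walking a bdi's writeback structures with the renamed interface; only the iteration pattern comes from the header, the surrounding function is hypothetical.

```c
#include <linux/backing-dev.h>

/*
 * Illustrative only: visit every bdi_writeback of @bdi, root wb first,
 * then the cgroup wbs in ascending memcg ID order.
 */
static void foo_visit_wbs(struct backing_dev_info *bdi)
{
	struct bdi_writeback *wb;
	struct wb_iter iter;

	rcu_read_lock();		/* must be held across the whole walk */
	bdi_for_each_wb(wb, bdi, &iter, 0) {
		/* e.g. inspect wb->avg_write_bandwidth here */
	}
	rcu_read_unlock();
}
```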
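In blk-cgroup.h, per-blkg policy data is no longer sized via pd_size and allocated by the core; each policy now supplies pd_alloc_fn()/pd_free_fn() and embeds struct blkg_policy_data at the start of its own structure. A hedged sketch of what a policy might look like against the new callbacks — the foo_* names and the serviced counter are invented for illustration:

```c
#include <linux/blk-cgroup.h>
#include <linux/slab.h>

struct foo_grp {
	struct blkg_policy_data pd;	/* must be the first member */
	struct blkg_rwstat serviced;	/* hypothetical per-blkg counter */
};

static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
{
	struct foo_grp *fg = kzalloc_node(sizeof(*fg), gfp, node);

	if (!fg)
		return NULL;
	if (blkg_rwstat_init(&fg->serviced, gfp)) {	/* can fail now */
		kfree(fg);
		return NULL;
	}
	return &fg->pd;
}

static void foo_pd_free(struct blkg_policy_data *pd)
{
	struct foo_grp *fg = container_of(pd, struct foo_grp, pd);

	blkg_rwstat_exit(&fg->serviced);
	kfree(fg);
}

static struct blkcg_policy blkcg_policy_foo = {
	.pd_alloc_fn	= foo_pd_alloc,
	.pd_free_fn	= foo_pd_free,
	/* .dfl_cftypes / .legacy_cftypes, .pd_init_fn, etc. as needed */
};
```

Registration itself is unchanged; the diff only splits allocation, initialization and freeing into separate per-policy callbacks instead of a single pd_size plus pd_init/exit_fn pair.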
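Per-blkcg (as opposed to per-blkg) policy data follows the same pattern: cpd_size is gone, blkcg_policy_data gains a blkcg back-pointer, and policies supply cpd_alloc/init/free_fn(). A sketch under the same hypothetical foo_ naming:

```c
struct foo_blkcg {
	struct blkcg_policy_data cpd;	/* must be the first member */
	unsigned int weight;		/* hypothetical per-blkcg setting */
};

static struct blkcg_policy_data *foo_cpd_alloc(gfp_t gfp)
{
	struct foo_blkcg *fb = kzalloc(sizeof(*fb), gfp);

	return fb ? &fb->cpd : NULL;
}

static void foo_cpd_init(struct blkcg_policy_data *cpd)
{
	struct foo_blkcg *fb = container_of(cpd, struct foo_blkcg, cpd);

	/* cpd_to_blkcg(cpd) now resolves the owning blkcg if needed */
	fb->weight = 100;
}

static void foo_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(container_of(cpd, struct foo_blkcg, cpd));
}
```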
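blkg_stat and blkg_rwstat switch from u64_stats_sync protected counters to percpu_counters with a separate atomic aux count, so initialization can fail and must be paired with an exit call. Updating and reading then look roughly like the sketch below; the fields and the helper are hypothetical, the calls are the ones defined in the header, and bio->bi_rw is assumed to carry the REQ_* flags in this kernel generation.

```c
/* Assumed to run from a policy's bio issue/completion path. */
static void foo_account(struct blkg_stat *ios, struct blkg_rwstat *bytes,
			struct bio *bio)
{
	/*
	 * Cheap percpu add; the caller must ensure IRQs on the same CPU
	 * don't re-enter this for the same counter.
	 */
	blkg_stat_add(ios, 1);

	/* bucketed into READ/WRITE and SYNC/ASYNC from the request flags */
	blkg_rwstat_add(bytes, bio->bi_rw, bio->bi_iter.bi_size);
}

static u64 foo_read_bytes(struct blkg_rwstat *bytes)
{
	/* sums all CPUs; READ + WRITE buckets only */
	return blkg_rwstat_total(bytes);
}
```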
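blkg_lookup() moves into the header as an inline fast path that checks the per-blkcg hint and falls back to blkg_lookup_slowpath() on a miss; it must run under rcu_read_lock() and returns NULL while the queue is bypassing. A hedged usage sketch — the wrapper function is invented, while task_blkcg(), blkg_lookup() and blkg_get() come from the header:

```c
static struct blkcg_gq *foo_lookup_current_blkg(struct request_queue *q)
{
	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkg = blkg_lookup(task_blkcg(current), q);
	if (blkg)
		blkg_get(blkg);	/* pin it beyond the RCU section */
	rcu_read_unlock();

	return blkg;		/* may be NULL: bypassing or not created yet */
}
```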
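Finally, blkcg_bio_issue_check() folds throttling and the new per-blkg stat_bytes/stat_ios accounting into a single call made at bio issue time; with CONFIG_BLK_CGROUP off it compiles away to true. A caller in the submission path would presumably use it along these lines (the surrounding function is a placeholder for wherever the check is hooked in):

```c
static bool foo_issue_bio(struct request_queue *q, struct bio *bio)
{
	/*
	 * false means blk-throttle has taken the bio and it must not be
	 * issued now; true means it has been accounted and may proceed.
	 */
	if (!blkcg_bio_issue_check(q, bio))
		return false;

	/* ... hand the bio on to the normal submission path ... */
	return true;
}
```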