summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2019-11-07 11:18:03 -0800
committerJens Axboe <axboe@kernel.dk>2019-11-07 12:28:13 -0700
commitf73316482977ac401ac37245c9df48079d4e11f3 (patch)
tree1177e1189b87661f46900c6404047da82364dd93 /include
parent8a80d5d6638b7d58480a83aef49d587de63d4cbb (diff)
downloadlinux-f73316482977ac401ac37245c9df48079d4e11f3.tar.bz2
blk-cgroup: reimplement basic IO stats using cgroup rstat
blk-cgroup has been using blkg_rwstat to track basic IO stats. Unfortunately, reading recursive stats scales badly as itinvolves walking all descendants. On systems with a huge number of cgroups (dead or alive), this can lead to substantial CPU cost when reading IO stats. This patch reimplements basic IO stats using cgroup rstat which uses more memory but makes recursive stat reading O(# descendants which have been active since last reading) instead of O(# descendants). * blk-cgroup core no longer uses sync/async stats. Introduce new stat enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}. * Add blkg_iostat[_set] which encapsulates byte and io stats, last values for propagation delta calculation and u64_stats_sync for correctness on 32bit archs. * Update the new percpu stat counters directly and implement blkcg_rstat_flush() to implement propagation. * blkg_print_stat() can now bring the stats up to date by calling cgroup_rstat_flush() and print them instead of directly summing up all descendants. * It now allocates 96 bytes per cpu. It used to be 40 bytes. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Dan Schatzberg <dschatzberg@fb.com> Cc: Daniel Xu <dlxu@fb.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'include')
-rw-r--r--include/linux/blk-cgroup.h48
1 files changed, 43 insertions, 5 deletions
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 914ce55fa8c2..867ab391e409 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -15,7 +15,9 @@
*/
#include <linux/cgroup.h>
+#include <linux/percpu.h>
#include <linux/percpu_counter.h>
+#include <linux/u64_stats_sync.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
@@ -31,6 +33,14 @@
#ifdef CONFIG_BLK_CGROUP
+enum blkg_iostat_type {
+ BLKG_IOSTAT_READ,
+ BLKG_IOSTAT_WRITE,
+ BLKG_IOSTAT_DISCARD,
+
+ BLKG_IOSTAT_NR,
+};
+
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
BLKG_RWSTAT_WRITE,
@@ -61,6 +71,17 @@ struct blkcg {
#endif
};
+struct blkg_iostat {
+ u64 bytes[BLKG_IOSTAT_NR];
+ u64 ios[BLKG_IOSTAT_NR];
+};
+
+struct blkg_iostat_set {
+ struct u64_stats_sync sync;
+ struct blkg_iostat cur;
+ struct blkg_iostat last;
+};
+
/*
* blkg_[rw]stat->aux_cnt is excluded for local stats but included for
* recursive. Used to carry stats of dead children.
@@ -127,8 +148,8 @@ struct blkcg_gq {
/* is this blkg online? protected by both blkcg and q locks */
bool online;
- struct blkg_rwstat stat_bytes;
- struct blkg_rwstat stat_ios;
+ struct blkg_iostat_set __percpu *iostat_cpu;
+ struct blkg_iostat_set iostat;
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
@@ -740,15 +761,32 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
throtl = blk_throtl_bio(q, blkg, bio);
if (!throtl) {
+ struct blkg_iostat_set *bis;
+ int rwd, cpu;
+
+ if (op_is_discard(bio->bi_opf))
+ rwd = BLKG_IOSTAT_DISCARD;
+ else if (op_is_write(bio->bi_opf))
+ rwd = BLKG_IOSTAT_WRITE;
+ else
+ rwd = BLKG_IOSTAT_READ;
+
+ cpu = get_cpu();
+ bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ u64_stats_update_begin(&bis->sync);
+
/*
* If the bio is flagged with BIO_QUEUE_ENTERED it means this
* is a split bio and we would have already accounted for the
* size of the bio.
*/
if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
- blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
- bio->bi_iter.bi_size);
- blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ bis->cur.ios[rwd]++;
+
+ u64_stats_update_end(&bis->sync);
+ cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+ put_cpu();
}
blkcg_bio_issue_init(bio);