From 1311326cf4755c7ffefd20f576144ecf46d9906b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:49 +0800 Subject: blk-mq: avoid to synchronize rcu inside blk_cleanup_queue() SCSI probing may synchronously create and destroy a lot of request_queues for non-existent devices. Any synchronize_rcu() in queue creation or destroy path may introduce long latency during booting, see detailed description in comment of blk_register_queue(). This patch removes one synchronize_rcu() inside blk_cleanup_queue() for this case, commit c2856ae2f315d75(blk-mq: quiesce queue before freeing queue) needs synchronize_rcu() for implementing blk_mq_quiesce_queue(), but when queue isn't initialized, it isn't necessary to do that since only pass-through requests are involved, no original issue in scsi_execute() at all. Without this patch and previous one, it may take more 20+ seconds for virtio-scsi to complete disk probe. With the two patches, the time becomes less than 100ms. Fixes: c2856ae2f315d75 ("blk-mq: quiesce queue before freeing queue") Reported-by: Andrew Jones Cc: Omar Sandoval Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Christoph Hellwig Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index f84a9b7b6f5a..947e7a4abd8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -762,9 +762,13 @@ void blk_cleanup_queue(struct request_queue *q) * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests - * from more than one contexts + * from more than one contexts. + * + * No need to quiesce queue if it isn't initialized yet since + * blk_freeze_queue() should be enough for cases of passthrough + * request. */ - if (q->mq_ops) + if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ -- cgit v1.2.3 From 1954e9a998d59d08520d7d4bebeafb8f66ba0d0f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 27 Jun 2018 13:09:05 -0700 Subject: block: Document how blk_update_request() handles RQF_SPECIAL_PAYLOAD requests The payload of struct request is stored in the request.bio chain if the RQF_SPECIAL_PAYLOAD flag is not set and in request.special_vec if RQF_SPECIAL_PAYLOAD has been set. However, blk_update_request() iterates over req->bio whether or not RQF_SPECIAL_PAYLOAD has been set. Additionally, the RQF_SPECIAL_PAYLOAD flag is ignored by blk_rq_bytes() which means that the value returned by that function is incorrect if the RQF_SPECIAL_PAYLOAD flag has been set. It is not clear to me whether this is an oversight or whether this happened on purpose. Anyway, document that it is known that both functions ignore RQF_SPECIAL_PAYLOAD. See also commit f9d03f96b988 ("block: improve handling of the magic discard payload"). 
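For illustration only (not part of this patch): the distinction being documented is that blk_rq_bytes() always reports the length of the bio chain, while a helper that honours RQF_SPECIAL_PAYLOAD has to look at request.special_vec instead, which is roughly what the existing blk_rq_payload_bytes() helper from the f9d03f96b988 series does. A minimal sketch:

static inline unsigned int rq_payload_bytes_sketch(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;	/* single driver-built segment */
	return blk_rq_bytes(rq);		/* sum of the bio chain */
}
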
Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 947e7a4abd8c..2ff8e131a892 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3056,6 +3056,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * * Return: * %false - this request doesn't have any more data * %true - this request has more data -- cgit v1.2.3 From a79050434b45959f397042080fd1d70ffa9bd9df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 09:32:35 -0600 Subject: blk-rq-qos: refactor out common elements of blk-wbt blkcg-qos is going to do essentially what wbt does, only on a cgroup basis. Break out the common code that will be shared between blkcg-qos and wbt into blk-rq-qos.* so they can both utilize the same infrastructure. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 12 +- block/blk-mq.c | 12 +- block/blk-rq-qos.c | 178 +++++++++++++++++++++++++++ block/blk-rq-qos.h | 106 ++++++++++++++++ block/blk-settings.c | 4 +- block/blk-sysfs.c | 22 ++-- block/blk-wbt.c | 326 ++++++++++++++++++++++--------------------------- block/blk-wbt.h | 63 ++++------ include/linux/blkdev.h | 4 +- 10 files changed, 478 insertions(+), 251 deletions(-) create mode 100644 block/blk-rq-qos.c create mode 100644 block/blk-rq-qos.h (limited to 'block/blk-core.c') diff --git a/block/Makefile b/block/Makefile index a8f94cdb75c3..57d0f47ab05f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ + badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/blk-core.c b/block/blk-core.c index 2ff8e131a892..b33a73bcf2d0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1645,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1752,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); - wbt_done(q->rq_wb, req); + rq_qos_done(q, req); /* * Request may not have originated from ll_rw_blk. if not, @@ -2044,7 +2044,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + wb_acct = rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. 
@@ -2054,7 +2054,7 @@ get_rq: req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2983,7 +2983,7 @@ void blk_start_request(struct request *req) req->throtl_size = blk_rq_sectors(req); #endif req->rq_flags |= RQF_STATS; - wbt_issue(req->q->rq_wb, req); + rq_qos_issue(req->q, req); } BUG_ON(blk_rq_is_complete(req)); @@ -3207,7 +3207,7 @@ void blk_finish_request(struct request *req, blk_status_t error) blk_account_io_done(req, now); if (req->end_io) { - wbt_done(req->q->rq_wb, req); + rq_qos_done(q, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) diff --git a/block/blk-mq.c b/block/blk-mq.c index 850fdd02c385..ea2a226457fa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -504,7 +504,7 @@ void blk_mq_free_request(struct request *rq) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); - wbt_done(q->rq_wb, rq); + rq_qos_done(q, rq); if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); @@ -527,7 +527,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_account_io_done(rq, now); if (rq->end_io) { - wbt_done(rq->q->rq_wb, rq); + rq_qos_done(rq->q, rq); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -641,7 +641,7 @@ void blk_mq_start_request(struct request *rq) rq->throtl_size = blk_rq_sectors(rq); #endif rq->rq_flags |= RQF_STATS; - wbt_issue(q->rq_wb, rq); + rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); @@ -667,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -1806,13 +1806,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = wbt_wait(q->rq_wb, bio, NULL); + wb_acct = rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, wb_acct); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c new file mode 100644 index 000000000000..d2f2af8aa10c --- /dev/null +++ b/block/blk-rq-qos.c @@ -0,0 +1,178 @@ +#include "blk-rq-qos.h" + +#include "blk-wbt.h" + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit) +{ + return atomic_inc_below(&rq_wait->inflight, limit); +} + +void rq_qos_cleanup(struct request_queue *q, enum wbt_flags wb_acct) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->cleanup) + rqos->ops->cleanup(rqos, wb_acct); + } +} + +void rq_qos_done(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->done) + rqos->ops->done(rqos, rq); + } +} + +void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->issue) + rqos->ops->issue(rqos, rq); + } +} + +void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->requeue) + rqos->ops->requeue(rqos, rq); + } +} + +enum wbt_flags rq_qos_throttle(struct request_queue *q, struct bio *bio, + spinlock_t *lock) +{ + struct rq_qos *rqos; + enum wbt_flags flags = 0; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->throttle) + flags |= rqos->ops->throttle(rqos, bio, lock); + } + return flags; +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +bool rq_depth_calc_max_depth(struct rq_depth *rqd) +{ + unsigned int depth; + bool ret = false; + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rqd->queue_depth == 1) { + if (rqd->scale_step > 0) + rqd->max_depth = 1; + else { + rqd->max_depth = 2; + ret = true; + } + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, rqd->default_depth, + rqd->queue_depth); + if (rqd->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); + else if (rqd->scale_step < 0) { + unsigned int maxd = 3 * rqd->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rqd->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + rqd->max_depth = depth; + } + + return ret; +} + +void rq_depth_scale_up(struct rq_depth *rqd) +{ + /* + * Hit max in previous round, stop here + */ + if (rqd->scaled_max) + return; + + rqd->scale_step--; + + rqd->scaled_max = rq_depth_calc_max_depth(rqd); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. 
+ */ + if (rqd->max_depth == 1) + return; + + if (rqd->scale_step < 0 && hard_throttle) + rqd->scale_step = 0; + else + rqd->scale_step++; + + rqd->scaled_max = false; + rq_depth_calc_max_depth(rqd); +} + +void rq_qos_exit(struct request_queue *q) +{ + while (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + q->rq_qos = rqos->next; + rqos->ops->exit(rqos); + } +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h new file mode 100644 index 000000000000..f9a39bd6ece3 --- /dev/null +++ b/block/blk-rq-qos.h @@ -0,0 +1,106 @@ +#ifndef RQ_QOS_H +#define RQ_QOS_H + +#include +#include +#include +#include +#include + +enum rq_qos_id { + RQ_QOS_WBT, + RQ_QOS_CGROUP, +}; + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_qos { + struct rq_qos_ops *ops; + struct request_queue *q; + enum rq_qos_id id; + struct rq_qos *next; +}; + +struct rq_qos_ops { + enum wbt_flags (*throttle)(struct rq_qos *, struct bio *, + spinlock_t *); + void (*issue)(struct rq_qos *, struct request *); + void (*requeue)(struct rq_qos *, struct request *); + void (*done)(struct rq_qos *, struct request *); + void (*cleanup)(struct rq_qos *, enum wbt_flags); + void (*exit)(struct rq_qos *); +}; + +struct rq_depth { + unsigned int max_depth; + + int scale_step; + bool scaled_max; + + unsigned int queue_depth; + unsigned int default_depth; +}; + +static inline struct rq_qos *rq_qos_id(struct request_queue *q, + enum rq_qos_id id) +{ + struct rq_qos *rqos; + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->id == id) + break; + } + return rqos; +} + +static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_WBT); +} + +static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_CGROUP); +} + +static inline void rq_wait_init(struct rq_wait *rq_wait) +{ + atomic_set(&rq_wait->inflight, 0); + init_waitqueue_head(&rq_wait->wait); +} + +static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) +{ + rqos->next = q->rq_qos; + q->rq_qos = rqos; +} + +static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +{ + struct rq_qos *cur, *prev = NULL; + for (cur = q->rq_qos; cur; cur = cur->next) { + if (cur == rqos) { + if (prev) + prev->next = rqos->next; + else + q->rq_qos = cur; + break; + } + prev = cur; + } +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit); +void rq_depth_scale_up(struct rq_depth *rqd); +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_calc_max_depth(struct rq_depth *rqd); + +void rq_qos_cleanup(struct request_queue *, enum wbt_flags); +void rq_qos_done(struct request_queue *, struct request *); +void rq_qos_issue(struct request_queue *, struct request *); +void rq_qos_requeue(struct request_queue *, struct request *); +enum wbt_flags rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_exit(struct request_queue *); +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index d1de71124656..053de87d1fda 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; - wbt_set_queue_depth(q->rq_wb, depth); + wbt_set_queue_depth(q, depth); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) queue_flag_clear(QUEUE_FLAG_FUA, q); 
spin_unlock_irq(q->queue_lock); - wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 94987b1f69e1..49c29a5d06bb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { - if (!q->rq_wb) + if (!wbt_rq_qos(q)) return -EINVAL; - return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, size_t count) { - struct rq_wb *rwb; + struct rq_qos *rqos; ssize_t ret; s64 val; @@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, if (val < -1) return -EINVAL; - rwb = q->rq_wb; - if (!rwb) { + rqos = wbt_rq_qos(q); + if (!rqos) { ret = wbt_init(q); if (ret) return ret; } - rwb = q->rq_wb; if (val == -1) - rwb->min_lat_nsec = wbt_default_latency_nsec(q); + val = wbt_default_latency_nsec(q); else if (val >= 0) - rwb->min_lat_nsec = val * 1000ULL; + val *= 1000ULL; - if (rwb->enable_state == WBT_STATE_ON_DEFAULT) - rwb->enable_state = WBT_STATE_ON_MANUAL; + wbt_set_min_lat(q, val); - wbt_update_limits(rwb); + wbt_update_limits(q); return count; } @@ -964,7 +962,7 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - wbt_exit(q); + rq_qos_exit(q); mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4f89b28fa652..6fe20fb823e4 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,6 +25,7 @@ #include #include "blk-wbt.h" +#include "blk-rq-qos.h" #define CREATE_TRACE_POINTS #include @@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb) return rwb && rwb->wb_normal != 0; } -/* - * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, - * false if 'v' + 1 would be bigger than 'below'. - */ -static bool atomic_inc_below(atomic_t *v, int below) -{ - int cur = atomic_read(v); - - for (;;) { - int old; - - if (cur >= below) - return false; - old = atomic_cmpxchg(v, cur, cur + 1); - if (old == cur) - break; - cur = old; - } - - return true; -} - static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { @@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb) } } -void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { + struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; int inflight, limit; @@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. 
*/ -void wbt_done(struct rq_wb *rwb, struct request *rq) +static void wbt_done(struct rq_qos *rqos, struct request *rq) { - if (!rwb) - return; + struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { @@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); - __wbt_done(rwb, wbt_flags(rq)); + __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } -/* - * Return true, if we can't increase the depth further by scaling - */ -static bool calc_wb_limits(struct rq_wb *rwb) -{ - unsigned int depth; - bool ret = false; - - if (!rwb->min_lat_nsec) { - rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; - return false; - } - - /* - * For QD=1 devices, this is a special case. It's important for those - * to have one request ready when one completes, so force a depth of - * 2 for those devices. On the backend, it'll be a depth of 1 anyway, - * since the device can't have more than that in flight. If we're - * scaling down, then keep a setting of 1/1/1. - */ - if (rwb->queue_depth == 1) { - if (rwb->scale_step > 0) - rwb->wb_max = rwb->wb_normal = 1; - else { - rwb->wb_max = rwb->wb_normal = 2; - ret = true; - } - rwb->wb_background = 1; - } else { - /* - * scale_step == 0 is our default state. If we have suffered - * latency spikes, step will be > 0, and we shrink the - * allowed write depths. If step is < 0, we're only doing - * writes, and we allow a temporarily higher depth to - * increase performance. - */ - depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); - if (rwb->scale_step > 0) - depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); - else if (rwb->scale_step < 0) { - unsigned int maxd = 3 * rwb->queue_depth / 4; - - depth = 1 + ((depth - 1) << -rwb->scale_step); - if (depth > maxd) { - depth = maxd; - ret = true; - } - } - - /* - * Set our max/normal/bg queue depths based on how far - * we have scaled down (->scale_step). 
- */ - rwb->wb_max = depth; - rwb->wb_normal = (rwb->wb_max + 1) / 2; - rwb->wb_background = (rwb->wb_max + 3) / 4; - } - - return ret; -} - static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* @@ -307,7 +225,8 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* @@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_EXCEEDED; } - if (rwb->scale_step) + if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; @@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; - trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, - rwb->wb_background, rwb->wb_normal, rwb->wb_max); + trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rqd->max_depth); } -static void scale_up(struct rq_wb *rwb) +static void calc_wb_limits(struct rq_wb *rwb) { - /* - * Hit max in previous round, stop here - */ - if (rwb->scaled_max) - return; + if (rwb->min_lat_nsec == 0) { + rwb->wb_normal = rwb->wb_background = 0; + } else if (rwb->rq_depth.max_depth <= 2) { + rwb->wb_normal = rwb->rq_depth.max_depth; + rwb->wb_background = 1; + } else { + rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; + rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; + } +} - rwb->scale_step--; +static void scale_up(struct rq_wb *rwb) +{ + rq_depth_scale_up(&rwb->rq_depth); + calc_wb_limits(rwb); rwb->unknown_cnt = 0; - - rwb->scaled_max = calc_wb_limits(rwb); - - rwb_wake_all(rwb); - - rwb_trace_step(rwb, "step up"); + rwb_trace_step(rwb, "scale up"); } -/* - * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. - */ static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - /* - * Stop scaling down when we've hit the limit. This also prevents - * ->scale_step from going to crazy values, if the device can't - * keep up. - */ - if (rwb->wb_max == 1) - return; - - if (rwb->scale_step < 0 && hard_throttle) - rwb->scale_step = 0; - else - rwb->scale_step++; - - rwb->scaled_max = false; - rwb->unknown_cnt = 0; + rq_depth_scale_down(&rwb->rq_depth, hard_throttle); calc_wb_limits(rwb); - rwb_trace_step(rwb, "step down"); + rwb->unknown_cnt = 0; + rwb_wake_all(rwb); + rwb_trace_step(rwb, "scale down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - if (rwb->scale_step > 0) { + struct rq_depth *rqd = &rwb->rq_depth; + + if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do @@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb) * though. 
*/ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, - int_sqrt((rwb->scale_step + 1) << 8)); + int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the @@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb) static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; + struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, inflight); /* @@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). */ - if (rwb->scale_step > 0) + if (rqd->scale_step > 0) scale_up(rwb); - else if (rwb->scale_step < 0) + else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: @@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb) /* * Re-arm timer, if we have IO in flight */ - if (rwb->scale_step || inflight) + if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } -void wbt_update_limits(struct rq_wb *rwb) +static void __wbt_update_limits(struct rq_wb *rwb) { - rwb->scale_step = 0; - rwb->scaled_max = false; + struct rq_depth *rqd = &rwb->rq_depth; + + rqd->scale_step = 0; + rqd->scaled_max = false; + + rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } +void wbt_update_limits(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + __wbt_update_limits(RQWB(rqos)); +} + +u64 wbt_get_min_lat(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return 0; + return RQWB(rqos)->min_lat_nsec; +} + +void wbt_set_min_lat(struct request_queue *q, u64 val) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + RQWB(rqos)->min_lat_nsec = val; + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + __wbt_update_limits(RQWB(rqos)); +} + + static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; @@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) * IO for a bit. */ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) - limit = rwb->wb_max; + limit = rwb->rq_depth.max_depth; else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, @@ -554,7 +495,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, rqw->wait.head.next != &wait->entry) return false; - return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); + return rq_wait_inc_below(rqw, get_limit(rwb, rw)); } /* @@ -614,8 +555,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. 
*/ -enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +static enum wbt_flags wbt_wait(struct rq_qos *rqos, struct bio *bio, + spinlock_t *lock) { + struct rq_wb *rwb = RQWB(rqos); enum wbt_flags ret = 0; if (!rwb_enabled(rwb)) @@ -643,8 +586,10 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) return ret | WBT_TRACKED; } -void wbt_issue(struct rq_wb *rwb, struct request *rq) +void wbt_issue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); + if (!rwb_enabled(rwb)) return; @@ -661,8 +606,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq) } } -void wbt_requeue(struct rq_wb *rwb, struct request *rq) +void wbt_requeue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { @@ -671,39 +617,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq) } } -void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { - if (rwb) { - rwb->queue_depth = depth; - wbt_update_limits(rwb); + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) { + RQWB(rqos)->rq_depth.queue_depth = depth; + __wbt_update_limits(RQWB(rqos)); } } -void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) -{ - if (rwb) - rwb->wc = write_cache_on; -} - -/* - * Disable wbt, if enabled by default. - */ -void wbt_disable_default(struct request_queue *q) +void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { - struct rq_wb *rwb = q->rq_wb; - - if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) - wbt_exit(q); + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) + RQWB(rqos)->wc = write_cache_on; } -EXPORT_SYMBOL_GPL(wbt_disable_default); /* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct request_queue *q) { + struct rq_qos *rqos = wbt_rq_qos(q); /* Throttling already enabled? */ - if (q->rq_wb) + if (rqos) return; /* Queue not registered? Maybe shutting down... */ @@ -741,6 +678,41 @@ static int wbt_data_dir(const struct request *rq) return -1; } +static void wbt_exit(struct rq_qos *rqos) +{ + struct rq_wb *rwb = RQWB(rqos); + struct request_queue *q = rqos->q; + + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + +/* + * Disable wbt, if enabled by default. 
+ */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb; + if (!rqos) + return; + rwb = RQWB(rqos); + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->wb_normal = 0; +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + + +static struct rq_qos_ops wbt_rqos_ops = { + .throttle = wbt_wait, + .issue = wbt_issue, + .requeue = wbt_requeue, + .done = wbt_done, + .cleanup = __wbt_done, + .exit = wbt_exit, +}; + int wbt_init(struct request_queue *q) { struct rq_wb *rwb; @@ -756,39 +728,29 @@ int wbt_init(struct request_queue *q) return -ENOMEM; } - for (i = 0; i < WBT_NUM_RWQ; i++) { - atomic_set(&rwb->rq_wait[i].inflight, 0); - init_waitqueue_head(&rwb->rq_wait[i].wait); - } + for (i = 0; i < WBT_NUM_RWQ; i++) + rq_wait_init(&rwb->rq_wait[i]); + rwb->rqos.id = RQ_QOS_WBT; + rwb->rqos.ops = &wbt_rqos_ops; + rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; - rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - wbt_update_limits(rwb); + rwb->wc = 1; + rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + __wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ - q->rq_wb = rwb; + rq_qos_add(q, &rwb->rqos); blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); - wbt_set_queue_depth(rwb, blk_queue_depth(q)); - wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_queue_depth(q, blk_queue_depth(q)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; } - -void wbt_exit(struct request_queue *q) -{ - struct rq_wb *rwb = q->rq_wb; - - if (rwb) { - blk_stat_remove_callback(q, rwb->cb); - blk_stat_free_callback(rwb->cb); - q->rq_wb = NULL; - kfree(rwb); - } -} diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 300df531d0a6..53b20a58c0a2 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -9,6 +9,7 @@ #include #include "blk-stat.h" +#include "blk-rq-qos.h" enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ @@ -35,20 +36,12 @@ enum { WBT_STATE_ON_MANUAL = 2, }; -struct rq_wait { - wait_queue_head_t wait; - atomic_t inflight; -}; - struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ - unsigned int wb_max; /* max throughput writeback */ - int scale_step; - bool scaled_max; short enable_state; /* WBT_STATE_* */ @@ -67,15 +60,20 @@ struct rq_wb { void *sync_cookie; unsigned int wc; - unsigned int queue_depth; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; - struct request_queue *queue; + struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; }; +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; @@ -86,6 +84,7 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) return ret; } + #ifdef CONFIG_BLK_WBT static inline void wbt_track(struct request *rq, enum wbt_flags flags) @@ -93,19 +92,16 @@ static inline void wbt_track(struct request *rq, enum wbt_flags flags) rq->wbt_flags |= flags; } -void __wbt_done(struct rq_wb *, enum wbt_flags); -void wbt_done(struct rq_wb *, struct request *); -enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); int wbt_init(struct request_queue *); -void wbt_exit(struct 
request_queue *); -void wbt_update_limits(struct rq_wb *); -void wbt_requeue(struct rq_wb *, struct request *); -void wbt_issue(struct rq_wb *, struct request *); +void wbt_update_limits(struct request_queue *); void wbt_disable_default(struct request_queue *); void wbt_enable_default(struct request_queue *); -void wbt_set_queue_depth(struct rq_wb *, unsigned int); -void wbt_set_write_cache(struct rq_wb *, bool); +u64 wbt_get_min_lat(struct request_queue *q); +void wbt_set_min_lat(struct request_queue *q, u64 val); + +void wbt_set_queue_depth(struct request_queue *, unsigned int); +void wbt_set_write_cache(struct request_queue *, bool); u64 wbt_default_latency_nsec(struct request_queue *); @@ -114,43 +110,30 @@ u64 wbt_default_latency_nsec(struct request_queue *); static inline void wbt_track(struct request *rq, enum wbt_flags flags) { } -static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) -{ -} -static inline void wbt_done(struct rq_wb *rwb, struct request *rq) -{ -} -static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, - spinlock_t *lock) -{ - return 0; -} static inline int wbt_init(struct request_queue *q) { return -EINVAL; } -static inline void wbt_exit(struct request_queue *q) -{ -} -static inline void wbt_update_limits(struct rq_wb *rwb) +static inline void wbt_update_limits(struct request_queue *q) { } -static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_disable_default(struct request_queue *q) { } -static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_enable_default(struct request_queue *q) { } -static inline void wbt_disable_default(struct request_queue *q) +static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { } -static inline void wbt_enable_default(struct request_queue *q) +static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +static inline u64 wbt_get_min_lat(struct request_queue *q) { + return 0; } -static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +static inline void wbt_set_min_lat(struct request_queue *q, u64 val) { } static inline u64 wbt_default_latency_nsec(struct request_queue *q) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9d05646d5059..137759862f07 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; -struct rq_wb; +struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; @@ -443,7 +443,7 @@ struct request_queue { int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ struct blk_queue_stats *stats; - struct rq_wb *rq_wb; + struct rq_qos *rq_qos; /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg -- cgit v1.2.3 From c1c80384c8f47021a01a0cc42894a06bed2b801b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:59 -0400 Subject: block: remove external dependency on wbt_flags We don't really need to save this stuff in the core block code, we can just pass the bio back into the helpers later on to derive the same flags and update the rq->wbt_flags appropriately. 
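The helpers referred to here are the rq_qos_* chain walkers added by the previous patch. Purely for illustration (every name below is invented; nothing in this series defines it), a minimal sketch of how a future policy such as blkcg-qos plugs into that chain: embed a struct rq_qos, point it at a struct rq_qos_ops, and link it with rq_qos_add() so that rq_qos_throttle(), rq_qos_done() and friends invoke it for every bio and request:

#include <linux/slab.h>

#include "blk-rq-qos.h"

struct example_qos {
	struct rq_qos rqos;
	/* policy-private state would live here */
};

static void example_throttle(struct rq_qos *rqos, struct bio *bio,
			     spinlock_t *lock)
{
	/* decide whether to delay or account this bio */
}

static void example_exit(struct rq_qos *rqos)
{
	kfree(container_of(rqos, struct example_qos, rqos));
}

static struct rq_qos_ops example_rqos_ops = {
	.throttle	= example_throttle,
	.exit		= example_exit,
};

static int example_qos_init(struct request_queue *q)
{
	struct example_qos *eq;

	eq = kzalloc(sizeof(*eq), GFP_KERNEL);
	if (!eq)
		return -ENOMEM;

	eq->rqos.id = RQ_QOS_CGROUP;	/* or a new enum rq_qos_id entry */
	eq->rqos.ops = &example_rqos_ops;
	eq->rqos.q = q;
	rq_qos_add(q, &eq->rqos);	/* callbacks now run from rq_qos_throttle() etc. */
	return 0;
}

Callbacks left NULL are simply skipped by the walkers, and rq_qos_exit() tears the chain down at queue teardown by calling each registered ->exit().
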
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++----- block/blk-mq.c | 9 ++++----- block/blk-rq-qos.c | 24 +++++++++++++++--------- block/blk-rq-qos.h | 11 ++++++----- block/blk-wbt.c | 52 +++++++++++++++++++++++++++++++++++++++------------- block/blk-wbt.h | 5 ----- 6 files changed, 68 insertions(+), 42 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index b33a73bcf2d0..687d7732f23a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,7 +42,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" -#include "blk-wbt.h" +#include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -1986,7 +1986,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; struct request *req, *free; unsigned int request_count = 0; - unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -2044,7 +2043,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = rq_qos_throttle(q, bio, q->queue_lock); + rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. @@ -2054,7 +2053,7 @@ get_rq: req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - rq_qos_cleanup(q, wb_acct); + rq_qos_cleanup(q, bio); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2063,7 +2062,7 @@ get_rq: goto out_unlock; } - wbt_track(req, wb_acct); + rq_qos_track(q, req, bio); /* * After dropping the lock and possibly sleeping here, our request diff --git a/block/blk-mq.c b/block/blk-mq.c index ea2a226457fa..73a43b81b17d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,8 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-stat.h" -#include "blk-wbt.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -1790,7 +1790,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; - unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1806,19 +1805,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = rq_qos_throttle(q, bio, NULL); + rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - rq_qos_cleanup(q, wb_acct); + rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; } - wbt_track(rq, wb_acct); + rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d2f2af8aa10c..b7b02e04f64f 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -1,7 +1,5 @@ #include "blk-rq-qos.h" -#include "blk-wbt.h" - /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. 
@@ -29,13 +27,13 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit) return atomic_inc_below(&rq_wait->inflight, limit); } -void rq_qos_cleanup(struct request_queue *q, enum wbt_flags wb_acct) +void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->ops->cleanup) - rqos->ops->cleanup(rqos, wb_acct); + rqos->ops->cleanup(rqos, bio); } } @@ -69,17 +67,25 @@ void rq_qos_requeue(struct request_queue *q, struct request *rq) } } -enum wbt_flags rq_qos_throttle(struct request_queue *q, struct bio *bio, - spinlock_t *lock) +void rq_qos_throttle(struct request_queue *q, struct bio *bio, + spinlock_t *lock) { struct rq_qos *rqos; - enum wbt_flags flags = 0; for(rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->ops->throttle) - flags |= rqos->ops->throttle(rqos, bio, lock); + rqos->ops->throttle(rqos, bio, lock); + } +} + +void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->track) + rqos->ops->track(rqos, rq, bio); } - return flags; } /* diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f9a39bd6ece3..a6d13b8ce0dc 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -25,12 +25,12 @@ struct rq_qos { }; struct rq_qos_ops { - enum wbt_flags (*throttle)(struct rq_qos *, struct bio *, - spinlock_t *); + void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); + void (*track)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); - void (*cleanup)(struct rq_qos *, enum wbt_flags); + void (*cleanup)(struct rq_qos *, struct bio *); void (*exit)(struct rq_qos *); }; @@ -97,10 +97,11 @@ void rq_depth_scale_up(struct rq_depth *rqd); void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); -void rq_qos_cleanup(struct request_queue *, enum wbt_flags); +void rq_qos_cleanup(struct request_queue *, struct bio *); void rq_qos_done(struct request_queue *, struct request *); void rq_qos_issue(struct request_queue *, struct request *); void rq_qos_requeue(struct request_queue *, struct request *); -enum wbt_flags rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_track(struct request_queue *q, struct request *, struct bio *); void rq_qos_exit(struct request_queue *); #endif diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6fe20fb823e4..461a9af11efe 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -549,41 +549,66 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) } } +static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) +{ + enum wbt_flags flags = 0; + + if (bio_op(bio) == REQ_OP_READ) { + flags = WBT_READ; + } else if (wbt_should_throttle(rwb, bio)) { + if (current_is_kswapd()) + flags |= WBT_KSWAPD; + if (bio_op(bio) == REQ_OP_DISCARD) + flags |= WBT_DISCARD; + flags |= WBT_TRACKED; + } + return flags; +} + +static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); + __wbt_done(rqos, flags); +} + /* * Returns true if the IO request should be accounted, false if not. 
* May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -static enum wbt_flags wbt_wait(struct rq_qos *rqos, struct bio *bio, - spinlock_t *lock) +static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct rq_wb *rwb = RQWB(rqos); - enum wbt_flags ret = 0; + enum wbt_flags flags; if (!rwb_enabled(rwb)) - return 0; + return; - if (bio_op(bio) == REQ_OP_READ) - ret = WBT_READ; + flags = bio_to_wbt_flags(rwb, bio); if (!wbt_should_throttle(rwb, bio)) { - if (ret & WBT_READ) + if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); - return ret; + return; } if (current_is_kswapd()) - ret |= WBT_KSWAPD; + flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) - ret |= WBT_DISCARD; + flags |= WBT_DISCARD; - __wbt_wait(rwb, ret, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio->bi_opf, lock); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); +} - return ret | WBT_TRACKED; +static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } void wbt_issue(struct rq_qos *rqos, struct request *rq) @@ -707,9 +732,10 @@ EXPORT_SYMBOL_GPL(wbt_disable_default); static struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, + .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, - .cleanup = __wbt_done, + .cleanup = wbt_cleanup, .exit = wbt_exit, }; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 53b20a58c0a2..f47218d5b3b2 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -87,11 +87,6 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) #ifdef CONFIG_BLK_WBT -static inline void wbt_track(struct request *rq, enum wbt_flags flags) -{ - rq->wbt_flags |= flags; -} - int wbt_init(struct request_queue *); void wbt_update_limits(struct request_queue *); void wbt_disable_default(struct request_queue *); -- cgit v1.2.3 From e9a83853302b339e63dea4072f6210e5a88ab4bb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 Jul 2018 10:49:35 +0200 Subject: block: Add default switch case to blk_pm_allow_request() to kill warning With gcc 4.9.0 and 7.3.0: block/blk-core.c: In function 'blk_pm_allow_request': block/blk-core.c:2747:2: warning: enumeration value 'RPM_ACTIVE' not handled in switch [-Wswitch] switch (rq->q->rpm_status) { ^ Convert the return statement below the switch() block into a default case to fix this. Fixes: e4f36b249b4d4e75 ("block: fix peeking requests during PM") Signed-off-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 687d7732f23a..c4b57d8806fe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2753,9 +2753,9 @@ static bool blk_pm_allow_request(struct request *rq) return rq->rq_flags & RQF_PM; case RPM_SUSPENDED: return false; + default: + return true; } - - return true; } #else static bool blk_pm_allow_request(struct request *rq) -- cgit v1.2.3 From ddcf35d397976421a4ec1d0d00fbcc027a8cb034 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:39 -0700 Subject: block: Add and use op_stat_group() for indexing disk_stat fields. 
Add and use a new op_stat_group() function for indexing partition stat fields rather than indexing them by rq_data_dir() or bio_data_dir(). This function works similarly to op_is_sync() in that it takes the request::cmd_flags or bio::bi_opf flags and determines which stats should et updated. In addition, the second parameter to generic_start_io_acct() and generic_end_io_acct() is now a REQ_OP rather than simply a read or write bit and it uses op_stat_group() on the parameter to determine the stat group. Note that the partition in_flight counts are not part of the per-cpu statistics and as such are not indexed via this function. It's now indexed by op_is_write(). tj: Refreshed on top of v4.17. Updated to pass around REQ_OP. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Minchan Kim Cc: Dan Williams Cc: Joshua Morris Cc: Philipp Reisner Cc: Matias Bjorling Cc: Kent Overstreet Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++++------- block/blk-core.c | 12 ++++++------ drivers/block/drbd/drbd_req.c | 4 ++-- drivers/block/rsxx/dev.c | 6 +++--- drivers/block/zram/zram_drv.c | 5 ++--- drivers/lightnvm/pblk-cache.c | 5 +++-- drivers/lightnvm/pblk-read.c | 5 +++-- drivers/md/bcache/request.c | 13 +++++-------- drivers/md/dm.c | 6 ++++-- drivers/md/md.c | 5 +++-- drivers/nvdimm/nd.h | 7 +++---- include/linux/bio.h | 4 ++-- include/linux/blk_types.h | 5 +++++ 13 files changed, 50 insertions(+), 43 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/bio.c b/block/bio.c index f3536bfc8298..8ecc95615941 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1728,29 +1728,31 @@ void bio_check_pages_dirty(struct bio *bio) } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { + const int sgrp = op_stat_group(op); int cpu = part_stat_lock(); part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(q, part, rw); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, sectors[sgrp], sectors); + part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; + const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(q, cpu, part); - part_dec_in_flight(q, part, rw); + part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); } diff --git a/block/blk-core.c b/block/blk-core.c index c4b57d8806fe..03a4ea93a5f3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2702,13 +2702,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2722,7 +2722,7 @@ void blk_account_io_done(struct request *req, u64 now) */ if (blk_do_io_stat(req) && !(req->rq_flags & 
RQF_FLUSH_SEQ)) { unsigned long duration; - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; @@ -2730,10 +2730,10 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(req->q, cpu, part); - part_dec_in_flight(req->q, part, rw); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d146fedc38bb..19cac36e9737 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request { struct request_queue *q = device->rq_queue; - generic_start_io_acct(q, bio_data_dir(req->master_bio), + generic_start_io_acct(q, bio_op(req->master_bio), req->i.size >> 9, &device->vdisk->part0); } @@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r { struct request_queue *q = device->rq_queue; - generic_end_io_acct(q, bio_data_dir(req->master_bio), + generic_end_io_acct(q, bio_op(req->master_bio), &device->vdisk->part0, req->start_jif); } diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index dddb3f2490b6..1a92f9e65937 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), + generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio), &card->gendisk->part0); } @@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - generic_end_io_acct(card->queue, bio_data_dir(bio), - &card->gendisk->part0, start_time); + generic_end_io_acct(card->queue, bio_op(bio), + &card->gendisk->part0, start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 78c29044684a..2907a8156aaf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1277,11 +1277,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, unsigned int op, struct bio *bio) { unsigned long start_time = jiffies; - int rw_acct = op_is_write(op) ? 
REQ_OP_WRITE : REQ_OP_READ; struct request_queue *q = zram->disk->queue; int ret; - generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); if (!op_is_write(op)) { @@ -1293,7 +1292,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset, bio); } - generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + generic_end_io_acct(q, op, &zram->disk->part0, start_time); zram_slot_lock(zram, index); zram_accessed(zram, index); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 77d811962818..f565a56b898a 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) int nr_entries = pblk_get_secs(bio); int i, ret; - generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); + generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio), + &pblk->disk->part0); /* Update the write buffer head (mem) with the entries that we can * write. The write in itself cannot fail, so there is no need to @@ -75,7 +76,7 @@ retry: pblk_rl_inserted(&pblk->rl, nr_entries); out: - generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); + generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time); pblk_write_should_kick(pblk); return ret; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 26d414ae25b6..5a46d7f9302f 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, struct bio *int_bio = rqd->bio; unsigned long start_time = r_ctx->start_time; - generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); + generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time); if (rqd->error) pblk_log_read_err(pblk, rqd); @@ -461,7 +461,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_ERR; } - generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); + generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio), + &pblk->disk->part0); bitmap_zero(read_bitmap, nr_secs); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ae67f5fa8047..97707b0c54ce 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(s->d->disk->queue, - bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -1062,8 +1061,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - generic_end_io_acct(ddip->d->disk->queue, - bio_data_dir(bio), + generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), &ddip->d->disk->part0, ddip->start_time); if (bio->bi_status) { @@ -1120,7 +1118,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, } atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1229,7 +1227,6 @@ static blk_qc_t 
flash_dev_make_request(struct request_queue *q, struct search *s; struct closure *cl; struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; @@ -1237,7 +1234,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; @@ -1254,7 +1251,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, flash_dev_nodata, bcache_wq); return BLK_QC_T_NONE; - } else if (rw) { + } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b0dd7027848b..20f7e4ef5342 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; - generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), + &dm_disk(md)->part0); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, + io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), diff --git a/drivers/md/md.c b/drivers/md/md.c index dabe36723d60..f6e58dbca0d4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -335,6 +335,7 @@ EXPORT_SYMBOL(md_handle_request); static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); + const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; int cpu; @@ -363,8 +364,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 32e0364b48b9..6ee7fd7e4bbd 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) return false; *start = jiffies; - generic_start_io_acct(disk->queue, bio_data_dir(bio), - bio_sectors(bio), &disk->part0); + generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio), + &disk->part0); return true; } static inline void nd_iostat_end(struct bio *bio, unsigned long start) { struct gendisk *disk = bio->bi_disk; - generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, - start); + generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start); } static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) diff --git a/include/linux/bio.h b/include/linux/bio.h index a00dfff51aa5..ab221c517f4e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -496,9 +496,9 @@ 
extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2b44de56bc1..2960a96d833c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -401,6 +401,11 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline int op_stat_group(unsigned int op) +{ + return op_is_write(op); +} + typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 -- cgit v1.2.3

From 54648cf1ec2d7f4b6a71767799c45676a138ca24 Mon Sep 17 00:00:00 2001 From: xiao jin Date: Mon, 30 Jul 2018 14:11:12 +0800 Subject: block: blk_init_allocated_queue() set q->fq as NULL in the fail case

We found a memory use-after-free issue in __blk_drain_queue() on kernel 4.14. After reading the latest kernel 4.18-rc6, we believe it has the same problem.

Memory is allocated for q->fq in blk_init_allocated_queue(). If the elevator init function returns an error, the code takes the failure path and frees q->fq. __blk_drain_queue() then uses that memory after it has been freed, which leads to unpredictable behavior.

This patch sets q->fq to NULL in the failure path of blk_init_allocated_queue().

Fixes: commit 7c94e1c157a2 ("block: introduce blk_flush_queue to drive flush machinery") Cc: Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: xiao jin Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c index 03a4ea93a5f3..23cd1b7770e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1184,6 +1184,7 @@ out_exit_flush_rq: q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); + q->fq = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); -- cgit v1.2.3

From b233f127042dba991229e3882c6217c80492f6ef Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 30 Jul 2018 20:02:19 +0800 Subject: block: really disable runtime-pm for blk-mq

Runtime PM isn't ready for blk-mq yet, and commit 765e40b675a9 ("block: disable runtime-pm for blk-mq") tried to disable it. Unfortunately, that change is not sufficient on its own, because user space can still switch runtime PM on via 'echo auto > /sys/block/sdN/device/power/control'. This patch really disables runtime PM for blk-mq by calling pm_runtime_disable(), and fixes various PM-related kernel crashes.
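As a rough illustration of what the change means on the driver side, here is a minimal sketch of a hypothetical probe helper; the name foo_setup_runtime_pm() and its structure are invented for this sketch and are not part of the patch. After this change, blk_pm_runtime_init() performs a hard pm_runtime_disable() for blk-mq queues, so the autosuspend setup below can only ever take effect on the legacy request path.

#include <linux/blkdev.h>
#include <linux/pm_runtime.h>

/*
 * Hypothetical driver probe path (names invented for this sketch).
 * With the patch above, blk_pm_runtime_init() calls pm_runtime_disable()
 * when q->mq_ops is set, so writing "auto" to the device's power/control
 * attribute can no longer switch runtime PM on behind the block layer's
 * back.
 */
static int foo_setup_runtime_pm(struct request_queue *q, struct device *dev)
{
	blk_pm_runtime_init(q, dev);

	if (!q->mq_ops) {
		/* Legacy request path: runtime PM is still supported. */
		pm_runtime_set_autosuspend_delay(dev, 5000);
		pm_runtime_use_autosuspend(dev);
	}

	return 0;
}

The actual change made by the patch is in the diff that follows.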
Cc: Tomas Janousek Cc: Przemek Socha Cc: Alan Stern Cc: Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: Patrick Steinhardt Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 23cd1b7770e7..f9ad73d8573c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3770,9 +3770,11 @@ EXPORT_SYMBOL(blk_finish_plug); */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { - /* not support for RQF_PM and ->rpm_status in blk-mq yet */ - if (q->mq_ops) + /* Don't enable runtime PM for blk-mq until it is ready */ + if (q->mq_ops) { + pm_runtime_disable(dev); return; + } q->dev = dev; q->rpm_status = RPM_ACTIVE; -- cgit v1.2.3 From 4cf6324b17e96b7b7ab4021c6929500934d46750 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:37 -0700 Subject: block: Introduce blk_exit_queue() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Cc: Ming Lei Cc: Omar Sandoval Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 54 ++++++++++++++++++++++++++++++------------------------ block/blk.h | 1 + 2 files changed, 31 insertions(+), 24 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index f9ad73d8573c..49af34bf2119 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_set_queue_dying); +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -784,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q) */ WARN_ON_ONCE(q->kobj.state_in_sysfs); - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - q->elevator = NULL; - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. 
- */ - bdi_put(q->backing_dev_info); + blk_exit_queue(q); if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk.h b/block/blk.h index 69b14cd2bb22..d4d67e948920 100644 --- a/block/blk.h +++ b/block/blk.h @@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_queue *q, struct request_list *rl); +void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); -- cgit v1.2.3
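To make the intent of the refactoring concrete, here is a brief sketch of the kind of reuse the new helper enables; the caller below is hypothetical and not part of this patch, which only moves existing code out of blk_cleanup_queue().

#include "blk.h"	/* declares blk_exit_queue(); assumes a file under block/ */

/*
 * Hypothetical additional user of blk_exit_queue() (not part of this
 * patch).  The helper tears down the I/O scheduler, the blk-cgroup
 * association and the bdi reference in that order, so any queue
 * teardown path can share the sequence instead of open-coding it the
 * way blk_cleanup_queue() used to.
 */
static void foo_release_queue(struct request_queue *q)
{
	blk_exit_queue(q);

	/* ...remaining driver- or queue-specific cleanup goes here... */
}

Keeping the three steps in one helper also preserves the ordering constraints spelled out in the comments: scheduler exit before blkcg_exit_queue(), and bdi_put() only after the cgroup association is gone.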