summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig26
-rw-r--r--block/Makefile3
-rw-r--r--block/badblocks.c29
-rw-r--r--block/bio.c19
-rw-r--r--block/blk-cgroup.c9
-rw-r--r--block/blk-core.c132
-rw-r--r--block/blk-flush.c37
-rw-r--r--block/blk-lib.c175
-rw-r--r--block/blk-map.c4
-rw-r--r--block/blk-merge.c81
-rw-r--r--block/blk-mq-cpumap.c1
-rw-r--r--block/blk-mq-sysfs.c47
-rw-r--r--block/blk-mq.c350
-rw-r--r--block/blk-mq.h5
-rw-r--r--block/blk-settings.c34
-rw-r--r--block/blk-stat.c256
-rw-r--r--block/blk-stat.h42
-rw-r--r--block/blk-sysfs.c162
-rw-r--r--block/blk-wbt.c750
-rw-r--r--block/blk-wbt.h171
-rw-r--r--block/blk.h1
-rw-r--r--block/bsg-lib.c25
-rw-r--r--block/cfq-iosched.c16
-rw-r--r--block/elevator.c8
-rw-r--r--block/partition-generic.c65
25 files changed, 2189 insertions, 259 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 3a024440a669..8bf114a3858a 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -121,6 +121,32 @@ config BLK_CMDLINE_PARSER
See Documentation/block/cmdline-partition.txt for more information.
+config BLK_WBT
+ bool "Enable support for block device writeback throttling"
+ default n
+ ---help---
+ Enabling this option enables the block layer to throttle buffered
+ background writeback from the VM, making it more smooth and having
+ less impact on foreground operations. The throttling is done
+ dynamically on an algorithm loosely based on CoDel, factoring in
+ the realtime performance of the disk.
+
+config BLK_WBT_SQ
+ bool "Single queue writeback throttling"
+ default n
+ depends on BLK_WBT
+ ---help---
+ Enable writeback throttling by default on legacy single queue devices
+
+config BLK_WBT_MQ
+ bool "Multiqueue writeback throttling"
+ default y
+ depends on BLK_WBT
+ ---help---
+ Enable writeback throttling by default on multiqueue devices.
+ Multiqueue currently doesn't have support for IO scheduling,
+ enabling this option is recommended.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 934dac73fb37..a827f988c4e6 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
@@ -24,3 +24,4 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
+obj-$(CONFIG_BLK_WBT) += blk-wbt.o
diff --git a/block/badblocks.c b/block/badblocks.c
index 7be53cb1cc3c..6ebcef282314 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -133,6 +133,26 @@ retry:
}
EXPORT_SYMBOL_GPL(badblocks_check);
+static void badblocks_update_acked(struct badblocks *bb)
+{
+ u64 *p = bb->page;
+ int i;
+ bool unacked = false;
+
+ if (!bb->unacked_exist)
+ return;
+
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ unacked = true;
+ break;
+ }
+ }
+
+ if (!unacked)
+ bb->unacked_exist = 0;
+}
+
/**
* badblocks_set() - Add a range of bad blocks to the table.
* @bb: the badblocks structure that holds all badblock information
@@ -294,6 +314,8 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
bb->changed = 1;
if (!acknowledged)
bb->unacked_exist = 1;
+ else
+ badblocks_update_acked(bb);
write_sequnlock_irqrestore(&bb->lock, flags);
return rv;
@@ -354,7 +376,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
* current range. Earlier ranges could also overlap,
* but only this one can overlap the end of the range.
*/
- if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+ (BB_OFFSET(p[lo]) < target)) {
/* Partial overlap, leave the tail of this range */
int ack = BB_ACK(p[lo]);
sector_t a = BB_OFFSET(p[lo]);
@@ -377,7 +400,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
lo--;
}
while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+ (BB_OFFSET(p[lo]) < target)) {
/* This range does overlap */
if (BB_OFFSET(p[lo]) < s) {
/* Keep the early parts of this range. */
@@ -399,6 +423,7 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
}
}
+ badblocks_update_acked(bb);
bb->changed = 1;
out:
write_sequnlock_irq(&bb->lock);
diff --git a/block/bio.c b/block/bio.c
index 2cf6ebabc68c..2b375020fc49 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,11 +270,15 @@ static void bio_free(struct bio *bio)
}
}
-void bio_init(struct bio *bio)
+void bio_init(struct bio *bio, struct bio_vec *table,
+ unsigned short max_vecs)
{
memset(bio, 0, sizeof(*bio));
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
+
+ bio->bi_io_vec = table;
+ bio->bi_max_vecs = max_vecs;
}
EXPORT_SYMBOL(bio_init);
@@ -480,7 +484,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
return NULL;
bio = p + front_pad;
- bio_init(bio);
+ bio_init(bio, NULL, 0);
if (nr_iovecs > inline_vecs) {
unsigned long idx = 0;
@@ -670,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
break;
case REQ_OP_WRITE_SAME:
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
@@ -1835,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
- /*
- * Discards need a mutable bio_vec to accommodate the payload
- * required by the DSM TRIM and UNMAP commands.
- */
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- split = bio_clone_bioset(bio, gfp, bs);
- else
- split = bio_clone_fast(bio, gfp, bs);
-
+ split = bio_clone_fast(bio, gfp, bs);
if (!split)
return NULL;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b08ccbb9393a..8ba0af780e88 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -185,7 +185,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
}
wb_congested = wb_congested_get_create(&q->backing_dev_info,
- blkcg->css.id, GFP_NOWAIT);
+ blkcg->css.id,
+ GFP_NOWAIT | __GFP_NOWARN);
if (!wb_congested) {
ret = -ENOMEM;
goto err_put_css;
@@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
+ new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_congested;
@@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
}
spin_lock_init(&blkcg->lock);
- INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
+ INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -1240,7 +1241,7 @@ pd_prealloc:
if (blkg->pd[pol->plid])
continue;
- pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+ pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
if (!pd)
swap(pd, pd_prealloc);
if (!pd) {
diff --git a/block/blk-core.c b/block/blk-core.c
index 2deca48a4a05..61ba08c58b64 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wbt.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -882,6 +883,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
fail:
blk_free_flush_queue(q->fq);
+ wbt_exit(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1152,6 +1154,7 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_init(q, rq);
blk_rq_set_rl(rq, rl);
+ blk_rq_set_prio(rq, ioc);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
@@ -1344,6 +1347,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
if (rq->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ wbt_done(q->rq_wb, &req->issue_stat);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
@@ -1470,38 +1476,6 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-/**
- * blk_add_request_payload - add a payload to a request
- * @rq: request to update
- * @page: page backing the payload
- * @offset: offset in page
- * @len: length of the payload.
- *
- * This allows to later add a payload to an already submitted request by
- * a block driver. The driver needs to take care of freeing the payload
- * itself.
- *
- * Note that this is a quite horrible hack and nothing but handling of
- * discard requests should ever use it.
- */
-void blk_add_request_payload(struct request *rq, struct page *page,
- int offset, unsigned int len)
-{
- struct bio *bio = rq->bio;
-
- bio->bi_io_vec->bv_page = page;
- bio->bi_io_vec->bv_offset = offset;
- bio->bi_io_vec->bv_len = len;
-
- bio->bi_iter.bi_size = len;
- bio->bi_vcnt = 1;
- bio->bi_phys_segments = 1;
-
- rq->__data_len = rq->resid_len = len;
- rq->nr_phys_segments = 1;
-}
-EXPORT_SYMBOL_GPL(blk_add_request_payload);
-
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
@@ -1653,7 +1627,8 @@ void init_request_from_bio(struct request *req, struct bio *bio)
req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
- req->ioprio = bio_prio(bio);
+ if (ioprio_valid(bio_prio(bio)))
+ req->ioprio = bio_prio(bio);
blk_rq_bio_prep(req->q, req, bio);
}
@@ -1663,6 +1638,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
int el_ret, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ unsigned int wb_acct;
/*
* low level driver can indicate that it wants pages above a
@@ -1715,17 +1691,22 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
}
get_rq:
+ wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+
/*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ __wbt_done(q->rq_wb, wb_acct);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
+ wbt_track(&req->issue_stat, wb_acct);
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
@@ -1742,8 +1723,11 @@ get_rq:
/*
* If this is the first request added after a plug, fire
* of a plug trace.
+ *
+ * @request_count may become stale because of schedule
+ * out, so check plug list again.
*/
- if (!request_count)
+ if (!request_count || list_empty(&plug->list))
trace_block_plug(q);
else {
struct request *last = list_entry_rq(plug->list.prev);
@@ -1773,7 +1757,12 @@ static inline void blk_partition_remap(struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
- if (bio_sectors(bio) && bdev != bdev->bd_contains) {
+ /*
+ * Zone reset does not include bi_size so bio_sectors() is always 0.
+ * Include a test for the reset op code and perform the remap if needed.
+ */
+ if (bdev != bdev->bd_contains &&
+ (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) {
struct hd_struct *p = bdev->bd_part;
bio->bi_iter.bi_sector += p->start_sect;
@@ -1926,11 +1915,16 @@ generic_make_request_checks(struct bio *bio)
case REQ_OP_WRITE_SAME:
if (!bdev_write_same(bio->bi_bdev))
goto not_supported;
+ break;
case REQ_OP_ZONE_REPORT:
case REQ_OP_ZONE_RESET:
if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!bdev_write_zeroes_sectors(bio->bi_bdev))
+ goto not_supported;
+ break;
default:
break;
}
@@ -2464,6 +2458,12 @@ void blk_start_request(struct request *req)
{
blk_dequeue_request(req);
+ if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
+ blk_stat_set_issue_time(&req->issue_stat);
+ req->rq_flags |= RQF_STATS;
+ wbt_issue(req->q->rq_wb, &req->issue_stat);
+ }
+
/*
* We are now handing the request to the hardware, initialize
* resid_len to full count and add the timeout handler.
@@ -2612,6 +2612,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
return false;
}
+ WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD);
+
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */
@@ -2683,8 +2685,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
*/
void blk_finish_request(struct request *req, int error)
{
+ struct request_queue *q = req->q;
+
+ if (req->rq_flags & RQF_STATS)
+ blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+
if (req->rq_flags & RQF_QUEUED)
- blk_queue_end_tag(req->q, req);
+ blk_queue_end_tag(q, req);
BUG_ON(blk_queued_rq(req));
@@ -2698,13 +2705,14 @@ void blk_finish_request(struct request *req, int error)
blk_account_io_done(req);
- if (req->end_io)
+ if (req->end_io) {
+ wbt_done(req->q->rq_wb, &req->issue_stat);
req->end_io(req, error);
- else {
+ } else {
if (blk_bidi_rq(req))
__blk_put_request(req->next_rq->q, req->next_rq);
- __blk_put_request(req->q, req);
+ __blk_put_request(q, req);
}
}
EXPORT_SYMBOL(blk_finish_request);
@@ -3289,52 +3297,6 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
-{
- struct blk_plug *plug;
- long state;
- unsigned int queue_num;
- struct blk_mq_hw_ctx *hctx;
-
- if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- return false;
-
- queue_num = blk_qc_t_to_queue_num(cookie);
- hctx = q->queue_hw_ctx[queue_num];
- hctx->poll_considered++;
-
- plug = current->plug;
- if (plug)
- blk_flush_plug_list(plug, false);
-
- state = current->state;
- while (!need_resched()) {
- int ret;
-
- hctx->poll_invoked++;
-
- ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
- if (ret > 0) {
- hctx->poll_success++;
- set_current_state(TASK_RUNNING);
- return true;
- }
-
- if (signal_pending_state(state, current))
- set_current_state(TASK_RUNNING);
-
- if (current->state == TASK_RUNNING)
- return true;
- if (ret < 0)
- break;
- cpu_relax();
- }
-
- return false;
-}
-EXPORT_SYMBOL_GPL(blk_poll);
-
#ifdef CONFIG_PM
/**
* blk_pm_runtime_init - Block layer runtime PM initialization routine
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c486b7aa62ee..20b7c7a02f1c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -341,6 +341,34 @@ static void flush_data_end_io(struct request *rq, int error)
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
/*
+ * Updating q->in_flight[] here for making this tag usable
+ * early. Because in blk_queue_start_tag(),
+ * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
+ * reserve tags for sync I/O.
+ *
+ * More importantly this way can avoid the following I/O
+ * deadlock:
+ *
+ * - suppose there are 40 fua requests comming to flush queue
+ * and queue depth is 31
+ * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc
+ * tag for async I/O any more
+ * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT
+ * and flush_data_end_io() is called
+ * - the other rqs still can't go ahead if not updating
+ * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs
+ * are held in flush data queue and make no progress of
+ * handling post flush rq
+ * - only after the post flush rq is handled, all these rqs
+ * can be completed
+ */
+
+ elv_completed_request(q, rq);
+
+ /* for avoiding double accounting */
+ rq->rq_flags &= ~RQF_STARTED;
+
+ /*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
@@ -396,6 +424,13 @@ void blk_insert_flush(struct request *rq)
rq->cmd_flags &= ~REQ_FUA;
/*
+ * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
+ * of those flags, we have to set REQ_SYNC to avoid skewing
+ * the request accounting.
+ */
+ rq->cmd_flags |= REQ_SYNC;
+
+ /*
* An empty flush handed down from a stacking driver may
* translate into nothing if the underlying device does not
* advertise a write-back cache. In this case, simply
@@ -419,7 +454,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops) {
- blk_mq_insert_request(rq, false, false, true);
+ blk_mq_insert_request(rq, false, true, false);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 18abda862915..ed89c8f4b2a0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
req_sects = end_sect - sector;
}
- bio = next_bio(bio, 1, gfp_mask);
+ bio = next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio_set_op_attrs(bio, op, 0);
@@ -137,24 +137,24 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL(blkdev_issue_discard);
/**
- * blkdev_issue_write_same - queue a write same operation
+ * __blkdev_issue_write_same - generate number of bios with same page
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data to write
+ * @biop: pointer to anchor bio
*
* Description:
- * Issue a write same request for the sectors in question.
+ * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
*/
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask,
- struct page *page)
+static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct page *page,
+ struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_write_same_sectors;
- struct bio *bio = NULL;
- int ret = 0;
+ struct bio *bio = *biop;
sector_t bs_mask;
if (!q)
@@ -164,6 +164,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
+ if (!bdev_write_same(bdev))
+ return -EOPNOTSUPP;
+
/* Ensure that max_write_same_sectors doesn't overflow bi_size */
max_write_same_sectors = UINT_MAX >> 9;
@@ -185,32 +188,112 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
+ cond_resched();
}
- if (bio) {
+ *biop = bio;
+ return 0;
+}
+
+/**
+ * blkdev_issue_write_same - queue a write same operation
+ * @bdev: target blockdev
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @page: page containing data
+ *
+ * Description:
+ * Issue a write same request for the sectors in question.
+ */
+int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask,
+ struct page *page)
+{
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
+ &bio);
+ if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
- * blkdev_issue_zeroout - generate number of zero filed write bios
+ * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
+ * @biop: pointer to anchor bio
*
* Description:
- * Generate and issue number of bios with zerofiled pages.
+ * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
*/
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+ struct bio **biop)
+{
+ struct bio *bio = *biop;
+ unsigned int max_write_zeroes_sectors;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (!q)
+ return -ENXIO;
+
+ /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
+ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+
+ if (max_write_zeroes_sectors == 0)
+ return -EOPNOTSUPP;
+
+ while (nr_sects) {
+ bio = next_bio(bio, 0, gfp_mask);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
-static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask)
+ if (nr_sects > max_write_zeroes_sectors) {
+ bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+ nr_sects -= max_write_zeroes_sectors;
+ sector += max_write_zeroes_sectors;
+ } else {
+ bio->bi_iter.bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+ cond_resched();
+ }
+
+ *biop = bio;
+ return 0;
+}
+
+/**
+ * __blkdev_issue_zeroout - generate number of zero filed write bios
+ * @bdev: blockdev to issue
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @biop: pointer to anchor bio
+ * @discard: discard flag
+ *
+ * Description:
+ * Generate and issue number of bios with zerofiled pages.
+ */
+int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+ bool discard)
{
int ret;
- struct bio *bio = NULL;
+ int bi_size = 0;
+ struct bio *bio = *biop;
unsigned int sz;
sector_t bs_mask;
@@ -218,6 +301,24 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
+ if (discard) {
+ ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
+ BLKDEV_DISCARD_ZERO, biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+ }
+
+ ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+ biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+
+ ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
+ ZERO_PAGE(0), biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+
+ ret = 0;
while (nr_sects != 0) {
bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
gfp_mask);
@@ -227,21 +328,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
- ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
- nr_sects -= ret >> 9;
- sector += ret >> 9;
- if (ret < (sz << 9))
+ bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+ nr_sects -= bi_size >> 9;
+ sector += bi_size >> 9;
+ if (bi_size < (sz << 9))
break;
}
+ cond_resched();
}
- if (bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- return ret;
- }
- return 0;
+ *biop = bio;
+out:
+ return ret;
}
+EXPORT_SYMBOL(__blkdev_issue_zeroout);
/**
* blkdev_issue_zeroout - zero-fill a block range
@@ -258,26 +358,27 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
* the discard request fail, if the discard flag is not set, or if
* discard_zeroes_data is not supported, this function will resort to
* zeroing the blocks manually, thus provisioning (allocating,
- * anchoring) them. If the block device supports the WRITE SAME command
- * blkdev_issue_zeroout() will use it to optimize the process of
+ * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
+ * command(s), blkdev_issue_zeroout() will use it to optimize the process of
* clearing the block range. Otherwise the zeroing will be performed
* using regular WRITE calls.
*/
-
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, bool discard)
{
- if (discard) {
- if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
- BLKDEV_DISCARD_ZERO))
- return 0;
- }
+ int ret;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
- if (bdev_write_same(bdev) &&
- blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
- ZERO_PAGE(0)) == 0)
- return 0;
+ blk_start_plug(&plug);
+ ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
+ &bio, discard);
+ if (ret == 0 && bio) {
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
- return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
+ return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/block/blk-map.c b/block/blk-map.c
index 0173a72a8aa9..0acb6640ead7 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -120,6 +120,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct iov_iter i;
int ret;
+ if (!iter_is_iovec(iter))
+ goto fail;
+
if (map_data)
copy = true;
else if (iov_iter_alignment(iter) & align)
@@ -142,6 +145,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unmap_rq:
__blk_rq_unmap_user(bio);
+fail:
rq->bio = NULL;
return -EINVAL;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fda6a12fc776..182398cb1524 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
break;
+ case REQ_OP_WRITE_ZEROES:
+ split = NULL;
+ nsegs = (*bio)->bi_phys_segments;
+ break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
break;
@@ -237,15 +241,14 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
if (!bio)
return 0;
- /*
- * This should probably be returning 0, but blk_add_request_payload()
- * (Christoph!!!!)
- */
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- return 1;
-
- if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ return 0;
+ case REQ_OP_WRITE_SAME:
return 1;
+ }
fbio = bio;
cluster = blk_queue_cluster(q);
@@ -402,38 +405,21 @@ new_segment:
*bvprv = *bvec;
}
+static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv,
+ struct scatterlist *sglist, struct scatterlist **sg)
+{
+ *sg = sglist;
+ sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+ return 1;
+}
+
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
- int nsegs, cluster;
-
- nsegs = 0;
- cluster = blk_queue_cluster(q);
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- /*
- * This is a hack - drivers should be neither modifying the
- * biovec, nor relying on bi_vcnt - but because of
- * blk_add_request_payload(), a discard bio may or may not have
- * a payload we need to set up here (thank you Christoph) and
- * bi_vcnt is really the only way of telling if we need to.
- */
- if (!bio->bi_vcnt)
- return 0;
- /* Fall through */
- case REQ_OP_WRITE_SAME:
- *sg = sglist;
- bvec = bio_iovec(bio);
- sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
- return 1;
- default:
- break;
- }
+ int cluster = blk_queue_cluster(q), nsegs = 0;
for_each_bio(bio)
bio_for_each_segment(bvec, bio, iter)
@@ -453,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
struct scatterlist *sg = NULL;
int nsegs = 0;
- if (rq->bio)
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+ nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg);
+ else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
+ nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg);
+ else if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
@@ -486,12 +476,19 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
* Something must have been wrong if the figured number of
* segment is bigger than number of req's physical segments
*/
- WARN_ON(nsegs > rq->nr_phys_segments);
+ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
return nsegs;
}
EXPORT_SYMBOL(blk_rq_map_sg);
+static void req_set_nomerge(struct request_queue *q, struct request *req)
+{
+ req->cmd_flags |= REQ_NOMERGE;
+ if (req == q->last_merge)
+ q->last_merge = NULL;
+}
+
static inline int ll_new_hw_segment(struct request_queue *q,
struct request *req,
struct bio *bio)
@@ -512,9 +509,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
return 1;
no_merge:
- req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
- q->last_merge = NULL;
+ req_set_nomerge(q, req);
return 0;
}
@@ -528,9 +523,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
- req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
- q->last_merge = NULL;
+ req_set_nomerge(q, req);
return 0;
}
if (!bio_flagged(req->biotail, BIO_SEG_VALID))
@@ -552,9 +545,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
- req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
- q->last_merge = NULL;
+ req_set_nomerge(q, req);
return 0;
}
if (!bio_flagged(bio, BIO_SEG_VALID))
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 19b1d9c5f07e..8e61e8640e17 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -87,6 +87,7 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
free_cpumask_var(cpus);
return 0;
}
+EXPORT_SYMBOL_GPL(blk_mq_map_queues);
/*
* We have no quick way of doing reverse lookups. This is only used at
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 01fb455d3377..eacd3af72099 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return ret;
}
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+ blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+ }
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t count)
+{
+ blk_mq_stat_clear(hctx);
+ return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_stat_init(&stat[BLK_STAT_READ]);
+ blk_stat_init(&stat[BLK_STAT_WRITE]);
+
+ blk_hctx_stat_get(hctx, stat);
+
+ ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
+ ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
+ return ret;
+}
+
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
@@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
.show = blk_mq_hw_sysfs_poll_show,
.store = blk_mq_hw_sysfs_poll_store,
};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+ .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_mq_hw_sysfs_stat_show,
+ .store = blk_mq_hw_sysfs_stat_store,
+};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
@@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
&blk_mq_hw_sysfs_poll.attr,
+ &blk_mq_hw_sysfs_stat.attr,
NULL,
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 077c2416a955..d79fdc11b1ee 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,8 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -325,9 +327,12 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ wbt_done(q->rq_wb, &rq->issue_stat);
rq->rq_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
blk_mq_put_tag(hctx, ctx, tag);
blk_queue_exit(q);
}
@@ -353,6 +358,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
blk_account_io_done(rq);
if (rq->end_io) {
+ wbt_done(rq->q->rq_wb, &rq->issue_stat);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -403,10 +409,27 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
+static void blk_mq_stat_add(struct request *rq)
+{
+ if (rq->rq_flags & RQF_STATS) {
+ /*
+ * We could rq->mq_ctx here, but there's less of a risk
+ * of races if we have the completion event add the stats
+ * to the local software queue.
+ */
+ struct blk_mq_ctx *ctx;
+
+ ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
+ blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+ }
+}
+
static void __blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_stat_add(rq);
+
if (!q->softirq_done_fn)
blk_mq_end_request(rq, rq->errors);
else
@@ -450,6 +473,12 @@ void blk_mq_start_request(struct request *rq)
if (unlikely(blk_bidi_rq(rq)))
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ blk_stat_set_issue_time(&rq->issue_stat);
+ rq->rq_flags |= RQF_STATS;
+ wbt_issue(q->rq_wb, &rq->issue_stat);
+ }
+
blk_add_timer(rq);
/*
@@ -485,6 +514,7 @@ static void __blk_mq_requeue_request(struct request *rq)
struct request_queue *q = rq->q;
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -791,41 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
struct request *rq;
- LIST_HEAD(rq_list);
LIST_HEAD(driver_list);
struct list_head *dptr;
- int queued;
-
- if (unlikely(blk_mq_hctx_stopped(hctx)))
- return;
-
- hctx->run++;
-
- /*
- * Touch any software queue that has pending entries.
- */
- flush_busy_ctxs(hctx, &rq_list);
-
- /*
- * If we have previous entries on our dispatch list, grab them
- * and stuff them at the front for more fair dispatch.
- */
- if (!list_empty_careful(&hctx->dispatch)) {
- spin_lock(&hctx->lock);
- if (!list_empty(&hctx->dispatch))
- list_splice_init(&hctx->dispatch, &rq_list);
- spin_unlock(&hctx->lock);
- }
+ int queued, ret = BLK_MQ_RQ_QUEUE_OK;
/*
* Start off with dptr being NULL, so we start the first request
@@ -837,16 +839,15 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* Now process all the entries, sending them to the driver.
*/
queued = 0;
- while (!list_empty(&rq_list)) {
+ while (!list_empty(list)) {
struct blk_mq_queue_data bd;
- int ret;
- rq = list_first_entry(&rq_list, struct request, queuelist);
+ rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
bd.rq = rq;
bd.list = dptr;
- bd.last = list_empty(&rq_list);
+ bd.last = list_empty(list);
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
@@ -854,7 +855,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
queued++;
break;
case BLK_MQ_RQ_QUEUE_BUSY:
- list_add(&rq->queuelist, &rq_list);
+ list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
default:
@@ -872,7 +873,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* We've done the first request. If we have more than 1
* left in the list, set dptr to defer issue.
*/
- if (!dptr && rq_list.next != rq_list.prev)
+ if (!dptr && list->next != list->prev)
dptr = &driver_list;
}
@@ -882,10 +883,11 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
- if (!list_empty(&rq_list)) {
+ if (!list_empty(list)) {
spin_lock(&hctx->lock);
- list_splice(&rq_list, &hctx->dispatch);
+ list_splice(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
+
/*
* the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
* it's possible the queue is stopped and restarted again
@@ -897,6 +899,43 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
**/
blk_mq_run_hw_queue(hctx, true);
}
+
+ return ret != BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+{
+ LIST_HEAD(rq_list);
+ LIST_HEAD(driver_list);
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * Touch any software queue that has pending entries.
+ */
+ flush_busy_ctxs(hctx, &rq_list);
+
+ /*
+ * If we have previous entries on our dispatch list, grab them
+ * and stuff them at the front for more fair dispatch.
+ */
+ if (!list_empty_careful(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
}
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
@@ -929,7 +968,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
- int cpu = hctx->next_cpu, next_cpu;
+ int next_cpu;
next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids)
@@ -937,8 +976,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
- return cpu;
}
return hctx->next_cpu;
@@ -1036,18 +1073,23 @@ void blk_mq_start_hw_queues(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+ if (!blk_mq_hctx_stopped(hctx))
+ return;
+
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ blk_mq_run_hw_queue(hctx, async);
+}
+EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
+
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!blk_mq_hctx_stopped(hctx))
- continue;
-
- clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
- blk_mq_run_hw_queue(hctx, async);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -1209,7 +1251,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
init_request_from_bio(rq, bio);
- blk_account_io_start(rq, 1);
+ blk_account_io_start(rq, true);
}
static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
@@ -1264,11 +1306,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
return rq;
}
-static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
{
int ret;
struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
struct blk_mq_queue_data bd = {
.rq = rq,
.list = NULL,
@@ -1318,6 +1360,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1332,9 +1375,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return BLK_QC_T_NONE;
+ wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1380,11 +1429,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock();
- blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+ blk_mq_try_issue_directly(old_rq, &cookie);
rcu_read_unlock();
} else {
srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
- blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+ blk_mq_try_issue_directly(old_rq, &cookie);
srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
}
goto done;
@@ -1418,6 +1467,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
struct blk_mq_alloc_data data;
struct request *rq;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1434,9 +1484,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1456,6 +1512,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
struct request *last = NULL;
blk_mq_bio_to_request(rq, bio);
+
+ /*
+ * @request_count may become stale because of schedule
+ * out, so check the list again.
+ */
+ if (list_empty(&plug->mq_list))
+ request_count = 0;
if (!request_count)
trace_block_plug(q);
else
@@ -1786,6 +1849,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
+ blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
+ blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -2081,6 +2146,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
*/
q->nr_requests = set->queue_depth;
+ /*
+ * Default to classic polling
+ */
+ q->poll_nsec = -1;
+
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
@@ -2116,6 +2186,8 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ wbt_exit(q);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2162,16 +2234,9 @@ static void blk_mq_queue_reinit_work(void)
*/
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
- list_for_each_entry(q, &all_q_list, all_q_node) {
+ list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
- /*
- * timeout handler can't touch hw queue during the
- * reinitialization
- */
- del_timer_sync(&q->timeout);
- }
-
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_queue_reinit(q, &cpuhp_online_new);
@@ -2416,6 +2481,165 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct blk_rq_stat stat[2];
+ unsigned long ret = 0;
+
+ /*
+ * If stats collection isn't on, don't sleep but turn it on for
+ * future users
+ */
+ if (!blk_stat_enable(q))
+ return 0;
+
+ /*
+ * We don't have to do this once per IO, should optimize this
+ * to just use the current window of stats until it changes
+ */
+ memset(&stat, 0, sizeof(stat));
+ blk_hctx_stat_get(hctx, stat);
+
+ /*
+ * As an optimistic guess, use half of the mean service time
+ * for this type of request. We can (and should) make this smarter.
+ * For instance, if the completion latencies are tight, we can
+ * get closer than just half the mean. This is especially
+ * important on devices where the completion latencies are longer
+ * than ~10 usec.
+ */
+ if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+ ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+ else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+ ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+ return ret;
+}
+
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct hrtimer_sleeper hs;
+ enum hrtimer_mode mode;
+ unsigned int nsecs;
+ ktime_t kt;
+
+ if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ return false;
+
+ /*
+ * poll_nsec can be:
+ *
+ * -1: don't ever hybrid sleep
+ * 0: use half of prev avg
+ * >0: use this specific value
+ */
+ if (q->poll_nsec == -1)
+ return false;
+ else if (q->poll_nsec > 0)
+ nsecs = q->poll_nsec;
+ else
+ nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+ if (!nsecs)
+ return false;
+
+ set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+ /*
+ * This will be replaced with the stats tracking code, using
+ * 'avg_completion_time / 2' as the pre-sleep target.
+ */
+ kt = ktime_set(0, nsecs);
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&hs.timer, kt);
+
+ hrtimer_init_sleeper(&hs, current);
+ do {
+ if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ break;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ hrtimer_start_expires(&hs.timer, mode);
+ if (hs.task)
+ io_schedule();
+ hrtimer_cancel(&hs.timer);
+ mode = HRTIMER_MODE_ABS;
+ } while (hs.task && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&hs.timer);
+ return true;
+}
+
+static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct request_queue *q = hctx->queue;
+ long state;
+
+ /*
+ * If we sleep, have the caller restart the poll loop to reset
+ * the state. Like for the other success return cases, the
+ * caller is responsible for checking if the IO completed. If
+ * the IO isn't complete, we'll get called again and will go
+ * straight to the busy poll loop.
+ */
+ if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
+ return true;
+
+ hctx->poll_considered++;
+
+ state = current->state;
+ while (!need_resched()) {
+ int ret;
+
+ hctx->poll_invoked++;
+
+ ret = q->mq_ops->poll(hctx, rq->tag);
+ if (ret > 0) {
+ hctx->poll_success++;
+ set_current_state(TASK_RUNNING);
+ return true;
+ }
+
+ if (signal_pending_state(state, current))
+ set_current_state(TASK_RUNNING);
+
+ if (current->state == TASK_RUNNING)
+ return true;
+ if (ret < 0)
+ break;
+ cpu_relax();
+ }
+
+ return false;
+}
+
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_plug *plug;
+ struct request *rq;
+
+ if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ return false;
+
+ plug = current->plug;
+ if (plug)
+ blk_flush_plug_list(plug, false);
+
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+ rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+
+ return __blk_mq_poll(hctx, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_poll);
+
void blk_mq_disable_hotplug(void)
{
mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ac772dac7ce8..63e9116cddbd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
+#include "blk-stat.h"
+
struct blk_mq_tag_set;
struct blk_mq_ctx {
@@ -18,6 +20,7 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
+ struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
@@ -28,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
/*
* CPU hotplug helpers
@@ -38,7 +42,6 @@ void blk_mq_disable_hotplug(void);
/*
* CPU -> queue mappings
*/
-int blk_mq_map_queues(struct blk_mq_tag_set *set);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 55369a65dea2..529e55f52a03 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include "blk.h"
+#include "blk-wbt.h"
unsigned long blk_max_low_pfn;
EXPORT_SYMBOL(blk_max_low_pfn);
@@ -95,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
+ lim->max_write_zeroes_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -131,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
+ lim->max_write_zeroes_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -250,6 +253,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
limits->max_sectors = max_sectors;
+ q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
@@ -299,6 +303,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ * write zeroes
+ * @q: the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+ unsigned int max_write_zeroes_sectors)
+{
+ q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
@@ -526,6 +543,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
+ t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
+ b->max_write_zeroes_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -837,6 +856,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
/**
+ * blk_set_queue_depth - tell the block layer about the device queue depth
+ * @q: the request queue for the device
+ * @depth: queue depth
+ *
+ */
+void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
+{
+ q->queue_depth = depth;
+ wbt_set_queue_depth(q->rq_wb, depth);
+}
+EXPORT_SYMBOL(blk_set_queue_depth);
+
+/**
* blk_queue_write_cache - configure queue's write cache
* @q: the request queue for the device
* @wc: write back cache on or off
@@ -856,6 +888,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
else
queue_flag_clear(QUEUE_FLAG_FUA, q);
spin_unlock_irq(q->queue_lock);
+
+ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
diff --git a/block/blk-stat.c b/block/blk-stat.c
new file mode 100644
index 000000000000..9b43efb8933f
--- /dev/null
+++ b/block/blk-stat.c
@@ -0,0 +1,256 @@
+/*
+ * Block stat tracking code
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/blk-mq.h>
+
+#include "blk-stat.h"
+#include "blk-mq.h"
+
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
+{
+ const s32 nr_batch = READ_ONCE(stat->nr_batch);
+ const s32 nr_samples = READ_ONCE(stat->nr_samples);
+
+ if (!nr_batch)
+ return;
+ if (!nr_samples)
+ stat->mean = div64_s64(stat->batch, nr_batch);
+ else {
+ stat->mean = div64_s64((stat->mean * nr_samples) +
+ stat->batch,
+ nr_batch + nr_samples);
+ }
+
+ stat->nr_samples += nr_batch;
+ stat->nr_batch = stat->batch = 0;
+}
+
+static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+ if (!src->nr_samples)
+ return;
+
+ blk_stat_flush_batch(src);
+
+ dst->min = min(dst->min, src->min);
+ dst->max = max(dst->max, src->max);
+
+ if (!dst->nr_samples)
+ dst->mean = src->mean;
+ else {
+ dst->mean = div64_s64((src->mean * src->nr_samples) +
+ (dst->mean * dst->nr_samples),
+ dst->nr_samples + src->nr_samples);
+ }
+ dst->nr_samples += src->nr_samples;
+}
+
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ uint64_t latest = 0;
+ int i, j, nr;
+
+ blk_stat_init(&dst[BLK_STAT_READ]);
+ blk_stat_init(&dst[BLK_STAT_WRITE]);
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
+ blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+
+ if (!ctx->stat[BLK_STAT_READ].nr_samples &&
+ !ctx->stat[BLK_STAT_WRITE].nr_samples)
+ continue;
+ if (ctx->stat[BLK_STAT_READ].time > newest)
+ newest = ctx->stat[BLK_STAT_READ].time;
+ if (ctx->stat[BLK_STAT_WRITE].time > newest)
+ newest = ctx->stat[BLK_STAT_WRITE].time;
+ }
+ }
+
+ /*
+ * No samples
+ */
+ if (!newest)
+ break;
+
+ if (newest > latest)
+ latest = newest;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (ctx->stat[BLK_STAT_READ].time == newest) {
+ blk_stat_sum(&dst[BLK_STAT_READ],
+ &ctx->stat[BLK_STAT_READ]);
+ nr++;
+ }
+ if (ctx->stat[BLK_STAT_WRITE].time == newest) {
+ blk_stat_sum(&dst[BLK_STAT_WRITE],
+ &ctx->stat[BLK_STAT_WRITE]);
+ nr++;
+ }
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare.
+ */
+ } while (!nr);
+
+ dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+}
+
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ if (q->mq_ops)
+ blk_mq_stat_get(q, dst);
+ else {
+ blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
+ blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
+ memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
+ sizeof(struct blk_rq_stat));
+ memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
+ sizeof(struct blk_rq_stat));
+ }
+}
+
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i, nr;
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
+ blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+
+ if (!ctx->stat[BLK_STAT_READ].nr_samples &&
+ !ctx->stat[BLK_STAT_WRITE].nr_samples)
+ continue;
+
+ if (ctx->stat[BLK_STAT_READ].time > newest)
+ newest = ctx->stat[BLK_STAT_READ].time;
+ if (ctx->stat[BLK_STAT_WRITE].time > newest)
+ newest = ctx->stat[BLK_STAT_WRITE].time;
+ }
+
+ if (!newest)
+ break;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ if (ctx->stat[BLK_STAT_READ].time == newest) {
+ blk_stat_sum(&dst[BLK_STAT_READ],
+ &ctx->stat[BLK_STAT_READ]);
+ nr++;
+ }
+ if (ctx->stat[BLK_STAT_WRITE].time == newest) {
+ blk_stat_sum(&dst[BLK_STAT_WRITE],
+ &ctx->stat[BLK_STAT_WRITE]);
+ nr++;
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare, as the window is only updated
+ * occasionally
+ */
+ } while (!nr);
+}
+
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+ stat->time = time_now & BLK_STAT_NSEC_MASK;
+}
+
+void blk_stat_init(struct blk_rq_stat *stat)
+{
+ __blk_stat_init(stat, ktime_to_ns(ktime_get()));
+}
+
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
+{
+ return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+}
+
+bool blk_stat_is_current(struct blk_rq_stat *stat)
+{
+ return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+}
+
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+{
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ if (!__blk_stat_is_current(stat, now))
+ __blk_stat_init(stat, now);
+
+ value = now - blk_stat_time(&rq->issue_stat);
+ if (value > stat->max)
+ stat->max = value;
+ if (value < stat->min)
+ stat->min = value;
+
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
+
+ stat->batch += value;
+ stat->nr_batch++;
+}
+
+void blk_stat_clear(struct request_queue *q)
+{
+ if (q->mq_ops) {
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ int i, j;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+ blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+ }
+ }
+ } else {
+ blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
+ blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
+ }
+}
+
+void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+ stat->time = (stat->time & BLK_STAT_MASK) |
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+}
+
+/*
+ * Enable stat tracking, return whether it was enabled
+ */
+bool blk_stat_enable(struct request_queue *q)
+{
+ if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ return false;
+ }
+
+ return true;
+}
diff --git a/block/blk-stat.h b/block/blk-stat.h
new file mode 100644
index 000000000000..a2050a0a5314
--- /dev/null
+++ b/block/blk-stat.h
@@ -0,0 +1,42 @@
+#ifndef BLK_STAT_H
+#define BLK_STAT_H
+
+/*
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
+ */
+#define BLK_STAT_NSEC 134217728ULL
+#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+
+/*
+ * Upper 3 bits can be used elsewhere
+ */
+#define BLK_STAT_RES_BITS 3
+#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
+#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+
+enum {
+ BLK_STAT_READ = 0,
+ BLK_STAT_WRITE,
+};
+
+void blk_stat_add(struct blk_rq_stat *, struct request *);
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
+void blk_stat_clear(struct request_queue *);
+void blk_stat_init(struct blk_rq_stat *);
+bool blk_stat_is_current(struct blk_rq_stat *);
+void blk_stat_set_issue_time(struct blk_issue_stat *);
+bool blk_stat_enable(struct request_queue *);
+
+static inline u64 __blk_stat_time(u64 time)
+{
+ return time & BLK_STAT_TIME_MASK;
+}
+
+static inline u64 blk_stat_time(struct blk_issue_stat *stat)
+{
+ return __blk_stat_time(stat->time);
+}
+
+#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 488c2e28feb8..1dbce057592d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -13,6 +13,7 @@
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wbt.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
return count;
}
+static ssize_t queue_var_store64(s64 *var, const char *page)
+{
+ int err;
+ s64 v;
+
+ err = kstrtos64(page, 10, &v);
+ if (err < 0)
+ return err;
+
+ *var = v;
+ return 0;
+}
+
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
return queue_var_show(q->nr_requests, (page));
@@ -197,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_same_sectors << 9);
}
+static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
+}
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
@@ -217,6 +236,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
spin_lock_irq(q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
+ q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(q->queue_lock);
return ret;
@@ -336,6 +356,38 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
return ret;
}
+static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
+{
+ int val;
+
+ if (q->poll_nsec == -1)
+ val = -1;
+ else
+ val = q->poll_nsec / 1000;
+
+ return sprintf(page, "%d\n", val);
+}
+
+static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ int err, val;
+
+ if (!q->mq_ops || !q->mq_ops->poll)
+ return -EINVAL;
+
+ err = kstrtoint(page, 10, &val);
+ if (err < 0)
+ return err;
+
+ if (val == -1)
+ q->poll_nsec = -1;
+ else
+ q->poll_nsec = val * 1000;
+
+ return count;
+}
+
static ssize_t queue_poll_show(struct request_queue *q, char *page)
{
return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -364,6 +416,50 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
return ret;
}
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ struct rq_wb *rwb;
+ ssize_t ret;
+ s64 val;
+
+ ret = queue_var_store64(&val, page);
+ if (ret < 0)
+ return ret;
+ if (val < -1)
+ return -EINVAL;
+
+ rwb = q->rq_wb;
+ if (!rwb) {
+ ret = wbt_init(q);
+ if (ret)
+ return ret;
+
+ rwb = q->rq_wb;
+ if (!rwb)
+ return -EINVAL;
+ }
+
+ if (val == -1)
+ rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ else if (val >= 0)
+ rwb->min_lat_nsec = val * 1000ULL;
+
+ if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+ rwb->enable_state = WBT_STATE_ON_MANUAL;
+
+ wbt_update_limits(rwb);
+ return count;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -401,6 +497,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_queue_stat_get(q, stat);
+
+ ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
+ ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
+ return ret;
+}
+
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -501,6 +617,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
.show = queue_write_same_max_show,
};
+static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
+ .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+ .show = queue_write_zeroes_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
@@ -542,6 +663,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
.store = queue_poll_store,
};
+static struct queue_sysfs_entry queue_poll_delay_entry = {
+ .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_poll_delay_show,
+ .store = queue_poll_delay_store,
+};
+
static struct queue_sysfs_entry queue_wc_entry = {
.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
.show = queue_wc_show,
@@ -553,6 +680,17 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
+static struct queue_sysfs_entry queue_stats_entry = {
+ .attr = {.name = "stats", .mode = S_IRUGO },
+ .show = queue_stats_show,
+};
+
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_lat_show,
+ .store = queue_wb_lat_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -573,6 +711,7 @@ static struct attribute *default_attrs[] = {
&queue_discard_max_hw_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
+ &queue_write_zeroes_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nomerges_entry.attr,
@@ -582,6 +721,9 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
+ &queue_stats_entry.attr,
+ &queue_wb_lat_entry.attr,
+ &queue_poll_delay_entry.attr,
NULL,
};
@@ -656,6 +798,7 @@ static void blk_release_queue(struct kobject *kobj)
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
+ wbt_exit(q);
bdi_exit(&q->backing_dev_info);
blkcg_exit_queue(q);
@@ -696,6 +839,23 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+static void blk_wb_init(struct request_queue *q)
+{
+#ifndef CONFIG_BLK_WBT_MQ
+ if (q->mq_ops)
+ return;
+#endif
+#ifndef CONFIG_BLK_WBT_SQ
+ if (q->request_fn)
+ return;
+#endif
+
+ /*
+ * If this fails, we don't get throttling
+ */
+ wbt_init(q);
+}
+
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -735,6 +895,8 @@ int blk_register_queue(struct gendisk *disk)
if (q->mq_ops)
blk_mq_register_dev(dev, q);
+ blk_wb_init(q);
+
if (!q->request_fn)
return 0;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
new file mode 100644
index 000000000000..6e82769f4042
--- /dev/null
+++ b/block/blk-wbt.c
@@ -0,0 +1,750 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ * scaling step and scale down queue depth by a factor of 2x. The monitoring
+ * window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ * look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ * will temporarily boost write performance, snapping back to a stable
+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ * positive scaling steps where we shrink the monitoring window, a negative
+ * scaling step retains the default step==0 window size.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/swap.h>
+
+#include "blk-wbt.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+ /*
+ * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+ * from here depending on device stats
+ */
+ RWB_DEF_DEPTH = 16,
+
+ /*
+ * 100msec window
+ */
+ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
+
+ /*
+ * Disregard stats, if we don't meet this minimum
+ */
+ RWB_MIN_WRITE_SAMPLES = 3,
+
+ /*
+ * If we have this number of consecutive windows with not enough
+ * information to scale up or down, scale up.
+ */
+ RWB_UNKNOWN_BUMP = 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+ return rwb && rwb->wb_normal != 0;
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+ if (rwb_enabled(rwb)) {
+ const unsigned long cur = jiffies;
+
+ if (cur != *var)
+ *var = cur;
+ }
+}
+
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+ struct bdi_writeback *wb = &rwb->queue->backing_dev_info.wb;
+
+ return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+ return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+ int i;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ struct rq_wait *rqw = &rwb->rq_wait[i];
+
+ if (waitqueue_active(&rqw->wait))
+ wake_up_all(&rqw->wait);
+ }
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+{
+ struct rq_wait *rqw;
+ int inflight, limit;
+
+ if (!(wb_acct & WBT_TRACKED))
+ return;
+
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ inflight = atomic_dec_return(&rqw->inflight);
+
+ /*
+ * wbt got disabled with IO in flight. Wake up any potential
+ * waiters, we don't have to do more than that.
+ */
+ if (unlikely(!rwb_enabled(rwb))) {
+ rwb_wake_all(rwb);
+ return;
+ }
+
+ /*
+ * If the device does write back caching, drop further down
+ * before we wake people up.
+ */
+ if (rwb->wc && !wb_recent_wait(rwb))
+ limit = 0;
+ else
+ limit = rwb->wb_normal;
+
+ /*
+ * Don't wake anyone up if we are above the normal limit.
+ */
+ if (inflight && inflight >= limit)
+ return;
+
+ if (waitqueue_active(&rqw->wait)) {
+ int diff = limit - inflight;
+
+ if (!inflight || diff >= rwb->wb_background / 2)
+ wake_up_all(&rqw->wait);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb)
+ return;
+
+ if (!wbt_is_tracked(stat)) {
+ if (rwb->sync_cookie == stat) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+
+ if (wbt_is_read(stat))
+ wb_timestamp(rwb, &rwb->last_comp);
+ wbt_clear_state(stat);
+ } else {
+ WARN_ON_ONCE(stat == rwb->sync_cookie);
+ __wbt_done(rwb, wbt_stat_to_mask(stat));
+ wbt_clear_state(stat);
+ }
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
+{
+ unsigned int depth;
+ bool ret = false;
+
+ if (!rwb->min_lat_nsec) {
+ rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+ return false;
+ }
+
+ /*
+ * For QD=1 devices, this is a special case. It's important for those
+ * to have one request ready when one completes, so force a depth of
+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+ * since the device can't have more than that in flight. If we're
+ * scaling down, then keep a setting of 1/1/1.
+ */
+ if (rwb->queue_depth == 1) {
+ if (rwb->scale_step > 0)
+ rwb->wb_max = rwb->wb_normal = 1;
+ else {
+ rwb->wb_max = rwb->wb_normal = 2;
+ ret = true;
+ }
+ rwb->wb_background = 1;
+ } else {
+ /*
+ * scale_step == 0 is our default state. If we have suffered
+ * latency spikes, step will be > 0, and we shrink the
+ * allowed write depths. If step is < 0, we're only doing
+ * writes, and we allow a temporarily higher depth to
+ * increase performance.
+ */
+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+ if (rwb->scale_step > 0)
+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+ else if (rwb->scale_step < 0) {
+ unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+ depth = 1 + ((depth - 1) << -rwb->scale_step);
+ if (depth > maxd) {
+ depth = maxd;
+ ret = true;
+ }
+ }
+
+ /*
+ * Set our max/normal/bg queue depths based on how far
+ * we have scaled down (->scale_step).
+ */
+ rwb->wb_max = depth;
+ rwb->wb_normal = (rwb->wb_max + 1) / 2;
+ rwb->wb_background = (rwb->wb_max + 3) / 4;
+ }
+
+ return ret;
+}
+
+static inline bool stat_sample_valid(struct blk_rq_stat *stat)
+{
+ /*
+ * We need at least one read sample, and a minimum of
+ * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
+ * that it's writes impacting us, and not just some sole read on
+ * a device that is in a lower power state.
+ */
+ return stat[BLK_STAT_READ].nr_samples >= 1 &&
+ stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+ u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+ if (!issue || !rwb->sync_cookie)
+ return 0;
+
+ now = ktime_to_ns(ktime_get());
+ return now - issue;
+}
+
+enum {
+ LAT_OK = 1,
+ LAT_UNKNOWN,
+ LAT_UNKNOWN_WRITES,
+ LAT_EXCEEDED,
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+ struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
+ u64 thislat;
+
+ /*
+ * If our stored sync issue exceeds the window size, or it
+ * exceeds our min target AND we haven't logged any entries,
+ * flag the latency as exceeded. wbt works off completion latencies,
+ * but for a flooded device, a single sync IO can take a long time
+ * to complete after being issued. If this time exceeds our
+ * monitoring window AND we didn't see any other completions in that
+ * window, then count that sync IO as a violation of the latency.
+ */
+ thislat = rwb_sync_issue_lat(rwb);
+ if (thislat > rwb->cur_win_nsec ||
+ (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
+ trace_wbt_lat(bdi, thislat);
+ return LAT_EXCEEDED;
+ }
+
+ /*
+ * No read/write mix, if stat isn't valid
+ */
+ if (!stat_sample_valid(stat)) {
+ /*
+ * If we had writes in this stat window and the window is
+ * current, we're only doing writes. If a task recently
+ * waited or still has writes in flights, consider us doing
+ * just writes as well.
+ */
+ if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
+ wb_recent_wait(rwb) || wbt_inflight(rwb))
+ return LAT_UNKNOWN_WRITES;
+ return LAT_UNKNOWN;
+ }
+
+ /*
+ * If the 'min' latency exceeds our target, step down.
+ */
+ if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
+ trace_wbt_stat(bdi, stat);
+ return LAT_EXCEEDED;
+ }
+
+ if (rwb->scale_step)
+ trace_wbt_stat(bdi, stat);
+
+ return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+ struct blk_rq_stat stat[2];
+
+ blk_queue_stat_get(rwb->queue, stat);
+ return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+ struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
+
+ trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+ rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+ /*
+ * Hit max in previous round, stop here
+ */
+ if (rwb->scaled_max)
+ return;
+
+ rwb->scale_step--;
+ rwb->unknown_cnt = 0;
+ blk_stat_clear(rwb->queue);
+
+ rwb->scaled_max = calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
+
+ rwb_trace_step(rwb, "step up");
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
+{
+ /*
+ * Stop scaling down when we've hit the limit. This also prevents
+ * ->scale_step from going to crazy values, if the device can't
+ * keep up.
+ */
+ if (rwb->wb_max == 1)
+ return;
+
+ if (rwb->scale_step < 0 && hard_throttle)
+ rwb->scale_step = 0;
+ else
+ rwb->scale_step++;
+
+ rwb->scaled_max = false;
+ rwb->unknown_cnt = 0;
+ blk_stat_clear(rwb->queue);
+ calc_wb_limits(rwb);
+ rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+ unsigned long expires;
+
+ if (rwb->scale_step > 0) {
+ /*
+ * We should speed this up, using some variant of a fast
+ * integer inverse square root calculation. Since we only do
+ * this for every window expiration, it's not a huge deal,
+ * though.
+ */
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+ int_sqrt((rwb->scale_step + 1) << 8));
+ } else {
+ /*
+ * For step < 0, we don't want to increase/decrease the
+ * window size.
+ */
+ rwb->cur_win_nsec = rwb->win_nsec;
+ }
+
+ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+ mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+ unsigned int inflight = wbt_inflight(rwb);
+ int status;
+
+ status = latency_exceeded(rwb);
+
+ trace_wbt_timer(&rwb->queue->backing_dev_info, status, rwb->scale_step,
+ inflight);
+
+ /*
+ * If we exceeded the latency target, step down. If we did not,
+ * step one level up. If we don't know enough to say either exceeded
+ * or ok, then don't do anything.
+ */
+ switch (status) {
+ case LAT_EXCEEDED:
+ scale_down(rwb, true);
+ break;
+ case LAT_OK:
+ scale_up(rwb);
+ break;
+ case LAT_UNKNOWN_WRITES:
+ /*
+ * We started a the center step, but don't have a valid
+ * read/write sample, but we do have writes going on.
+ * Allow step to go negative, to increase write perf.
+ */
+ scale_up(rwb);
+ break;
+ case LAT_UNKNOWN:
+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+ break;
+ /*
+ * We get here when previously scaled reduced depth, and we
+ * currently don't have a valid read/write sample. For that
+ * case, slowly return to center state (step == 0).
+ */
+ if (rwb->scale_step > 0)
+ scale_up(rwb);
+ else if (rwb->scale_step < 0)
+ scale_down(rwb, false);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Re-arm timer, if we have IO in flight
+ */
+ if (rwb->scale_step || inflight)
+ rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+ rwb->scale_step = 0;
+ rwb->scaled_max = false;
+ calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+ const unsigned long now = jiffies;
+
+ return time_before(now, rwb->last_issue + HZ / 10) ||
+ time_before(now, rwb->last_comp + HZ / 10);
+}
+
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+ unsigned int limit;
+
+ /*
+ * At this point we know it's a buffered write. If this is
+ * kswapd trying to free memory, or REQ_SYNC is set, set, then
+ * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+ * that. If the write is marked as a background write, then use
+ * the idle limit, or go to normal if we haven't had competing
+ * IO for a bit.
+ */
+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ limit = rwb->wb_max;
+ else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to half the depth for background writeback.
+ */
+ limit = rwb->wb_background;
+ } else
+ limit = rwb->wb_normal;
+
+ return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+ wait_queue_t *wait, unsigned long rw)
+{
+ /*
+ * inc it here even if disabled, since we'll dec it at completion.
+ * this only happens if the task was sleeping in __wbt_wait(),
+ * and someone turned it off at the same time.
+ */
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
+ return true;
+ }
+
+ /*
+ * If the waitqueue is already active and we are not the next
+ * in line to be woken up, wait for our turn.
+ */
+ if (waitqueue_active(&rqw->wait) &&
+ rqw->wait.task_list.next != &wait->task_list)
+ return false;
+
+ return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+ struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+ DEFINE_WAIT(wait);
+
+ if (may_queue(rwb, rqw, &wait, rw))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (may_queue(rwb, rqw, &wait, rw))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rqw->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+{
+ const int op = bio_op(bio);
+
+ /*
+ * If not a WRITE, do nothing
+ */
+ if (op != REQ_OP_WRITE)
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+{
+ unsigned int ret = 0;
+
+ if (!rwb_enabled(rwb))
+ return 0;
+
+ if (bio_op(bio) == REQ_OP_READ)
+ ret = WBT_READ;
+
+ if (!wbt_should_throttle(rwb, bio)) {
+ if (ret & WBT_READ)
+ wb_timestamp(rwb, &rwb->last_issue);
+ return ret;
+ }
+
+ __wbt_wait(rwb, bio->bi_opf, lock);
+
+ if (!timer_pending(&rwb->window_timer))
+ rwb_arm_timer(rwb);
+
+ if (current_is_kswapd())
+ ret |= WBT_KSWAPD;
+
+ return ret | WBT_TRACKED;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+
+ /*
+ * Track sync issue, in case it takes a long time to complete. Allows
+ * us to react quicker, if a sync IO takes a long time to complete.
+ * Note that this is just a hint. 'stat' can go away when the
+ * request completes, so it's important we never dereference it. We
+ * only use the address to compare with, which is why we store the
+ * sync_issue time locally.
+ */
+ if (wbt_is_read(stat) && !rwb->sync_issue) {
+ rwb->sync_cookie = stat;
+ rwb->sync_issue = blk_stat_time(stat);
+ }
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+ if (stat == rwb->sync_cookie) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+ if (rwb) {
+ rwb->queue_depth = depth;
+ wbt_update_limits(rwb);
+ }
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+ if (rwb)
+ rwb->wc = write_cache_on;
+}
+
+ /*
+ * Disable wbt, if enabled by default. Only called from CFQ, if we have
+ * cgroups enabled
+ */
+void wbt_disable_default(struct request_queue *q)
+{
+ struct rq_wb *rwb = q->rq_wb;
+
+ if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
+ del_timer_sync(&rwb->window_timer);
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
+ wbt_update_limits(rwb);
+ }
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+u64 wbt_default_latency_nsec(struct request_queue *q)
+{
+ /*
+ * We default to 2msec for non-rotational storage, and 75msec
+ * for rotational storage.
+ */
+ if (blk_queue_nonrot(q))
+ return 2000000ULL;
+ else
+ return 75000000ULL;
+}
+
+int wbt_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+ int i;
+
+ /*
+ * For now, we depend on the stats window being larger than
+ * our monitoring window. Ensure that this isn't inadvertently
+ * violated.
+ */
+ BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
+ BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return -ENOMEM;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ atomic_set(&rwb->rq_wait[i].inflight, 0);
+ init_waitqueue_head(&rwb->rq_wait[i].wait);
+ }
+
+ setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+ rwb->wc = 1;
+ rwb->queue_depth = RWB_DEF_DEPTH;
+ rwb->last_comp = rwb->last_issue = jiffies;
+ rwb->queue = q;
+ rwb->win_nsec = RWB_WINDOW_NSEC;
+ rwb->enable_state = WBT_STATE_ON_DEFAULT;
+ wbt_update_limits(rwb);
+
+ /*
+ * Assign rwb, and turn on stats tracking for this queue
+ */
+ q->rq_wb = rwb;
+ blk_stat_enable(q);
+
+ rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+
+ wbt_set_queue_depth(rwb, blk_queue_depth(q));
+ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+
+ return 0;
+}
+
+void wbt_exit(struct request_queue *q)
+{
+ struct rq_wb *rwb = q->rq_wb;
+
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ q->rq_wb = NULL;
+ kfree(rwb);
+ }
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
new file mode 100644
index 000000000000..65f1de519f67
--- /dev/null
+++ b/block/blk-wbt.h
@@ -0,0 +1,171 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+#include "blk-stat.h"
+
+enum wbt_flags {
+ WBT_TRACKED = 1, /* write, tracked for throttling */
+ WBT_READ = 2, /* read */
+ WBT_KSWAPD = 4, /* write, from kswapd */
+
+ WBT_NR_BITS = 3, /* number of bits */
+};
+
+enum {
+ WBT_NUM_RWQ = 2,
+};
+
+/*
+ * Enable states. Either off, or on by default (done at init time),
+ * or on through manual setup in sysfs.
+ */
+enum {
+ WBT_STATE_ON_DEFAULT = 1,
+ WBT_STATE_ON_MANUAL = 2,
+};
+
+static inline void wbt_clear_state(struct blk_issue_stat *stat)
+{
+ stat->time &= BLK_STAT_TIME_MASK;
+}
+
+static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
+{
+ return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+}
+
+static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
+{
+ stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+}
+
+static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
+{
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct blk_issue_stat *stat)
+{
+ return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+}
+
+struct rq_wait {
+ wait_queue_head_t wait;
+ atomic_t inflight;
+};
+
+struct rq_wb {
+ /*
+ * Settings that govern how we throttle
+ */
+ unsigned int wb_background; /* background writeback */
+ unsigned int wb_normal; /* normal writeback */
+ unsigned int wb_max; /* max throughput writeback */
+ int scale_step;
+ bool scaled_max;
+
+ short enable_state; /* WBT_STATE_* */
+
+ /*
+ * Number of consecutive periods where we don't have enough
+ * information to make a firm scale up/down decision.
+ */
+ unsigned int unknown_cnt;
+
+ u64 win_nsec; /* default window size */
+ u64 cur_win_nsec; /* current window size */
+
+ struct timer_list window_timer;
+
+ s64 sync_issue;
+ void *sync_cookie;
+
+ unsigned int wc;
+ unsigned int queue_depth;
+
+ unsigned long last_issue; /* last non-throttled issue */
+ unsigned long last_comp; /* last non-throttled comp */
+ unsigned long min_lat_nsec;
+ struct request_queue *queue;
+ struct rq_wait rq_wait[WBT_NUM_RWQ];
+};
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+ return ret;
+}
+
+#ifdef CONFIG_BLK_WBT
+
+void __wbt_done(struct rq_wb *, enum wbt_flags);
+void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int wbt_init(struct request_queue *);
+void wbt_exit(struct request_queue *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_disable_default(struct request_queue *);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+u64 wbt_default_latency_nsec(struct request_queue *);
+
+#else
+
+static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
+{
+}
+static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
+ spinlock_t *lock)
+{
+ return 0;
+}
+static inline int wbt_init(struct request_queue *q)
+{
+ return -EINVAL;
+}
+static inline void wbt_exit(struct request_queue *q)
+{
+}
+static inline void wbt_update_limits(struct rq_wb *rwb)
+{
+}
+static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_disable_default(struct request_queue *q)
+{
+}
+static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+}
+static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+{
+}
+static inline u64 wbt_default_latency_nsec(struct request_queue *q)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BLK_WBT */
+
+#endif
diff --git a/block/blk.h b/block/blk.h
index aa132dea598c..041185e5f129 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req);
enum rq_atomic_flags {
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
+ REQ_ATOM_POLL_SLEPT,
};
/*
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 650f427d915b..9d652a992316 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -32,8 +32,13 @@
* bsg_destroy_job - routine to teardown/delete a bsg job
* @job: bsg_job that is to be torn down
*/
-static void bsg_destroy_job(struct bsg_job *job)
+static void bsg_destroy_job(struct kref *kref)
{
+ struct bsg_job *job = container_of(kref, struct bsg_job, kref);
+ struct request *rq = job->req;
+
+ blk_end_request_all(rq, rq->errors);
+
put_device(job->dev); /* release reference for the request */
kfree(job->request_payload.sg_list);
@@ -41,6 +46,18 @@ static void bsg_destroy_job(struct bsg_job *job)
kfree(job);
}
+void bsg_job_put(struct bsg_job *job)
+{
+ kref_put(&job->kref, bsg_destroy_job);
+}
+EXPORT_SYMBOL_GPL(bsg_job_put);
+
+int bsg_job_get(struct bsg_job *job)
+{
+ return kref_get_unless_zero(&job->kref);
+}
+EXPORT_SYMBOL_GPL(bsg_job_get);
+
/**
* bsg_job_done - completion routine for bsg requests
* @job: bsg_job that is complete
@@ -83,8 +100,7 @@ static void bsg_softirq_done(struct request *rq)
{
struct bsg_job *job = rq->special;
- blk_end_request_all(rq, rq->errors);
- bsg_destroy_job(job);
+ bsg_job_put(job);
}
static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
@@ -142,6 +158,7 @@ static int bsg_create_job(struct device *dev, struct request *req)
job->dev = dev;
/* take a reference for the request */
get_device(job->dev);
+ kref_init(&job->kref);
return 0;
failjob_rls_rqst_payload:
@@ -161,6 +178,8 @@ failjob_rls_job:
* Drivers/subsys should pass this to the queue init function.
*/
void bsg_request_fn(struct request_queue *q)
+ __releases(q->queue_lock)
+ __acquires(q->queue_lock)
{
struct device *dev = q->queuedata;
struct request *req;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dcbed8c9c82c..c73a6fcaeb9d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -16,6 +16,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
+#include "blk-wbt.h"
/*
* tunables
@@ -1586,7 +1587,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
{
struct cfq_group_data *cgd;
- cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+ cgd = kzalloc(sizeof(*cgd), gfp);
if (!cgd)
return NULL;
return &cgd->cpd;
@@ -3762,9 +3763,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
uint64_t serial_nr;
+ bool nonroot_cg;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
+ nonroot_cg = bio_blkcg(bio) != &blkcg_root;
rcu_read_unlock();
/*
@@ -3775,6 +3778,14 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
return;
/*
+ * If we have a non-root cgroup, we can depend on that to
+ * do proper throttling of writes. Turn off wbt for that
+ * case, if it was enabled by default.
+ */
+ if (nonroot_cg)
+ wbt_disable_default(cfqd->queue);
+
+ /*
* Drop reference to queues. New queues will be assigned in new
* group upon arrival of fresh requests.
*/
@@ -3845,7 +3856,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
goto out;
}
- cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+ cfqq = kmem_cache_alloc_node(cfq_pool,
+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
cfqd->queue->node);
if (!cfqq) {
cfqq = &cfqd->oom_cfqq;
diff --git a/block/elevator.c b/block/elevator.c
index a18a5db274e4..40f0c04e5ad3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -248,13 +248,13 @@ static inline void __elv_rqhash_del(struct request *rq)
rq->rq_flags &= ~RQF_HASHED;
}
-static void elv_rqhash_del(struct request_queue *q, struct request *rq)
+void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
if (ELV_ON_HASH(rq))
__elv_rqhash_del(rq);
}
-static void elv_rqhash_add(struct request_queue *q, struct request *rq)
+void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
@@ -263,13 +263,13 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
rq->rq_flags |= RQF_HASHED;
}
-static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
__elv_rqhash_del(rq);
elv_rqhash_add(q, rq);
}
-static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
struct elevator_queue *e = q->elevator;
struct hlist_node *next;
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 71d9ed9df8da..d7beb6bbbf66 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -430,6 +430,56 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
return 0;
}
+static bool part_zone_aligned(struct gendisk *disk,
+ struct block_device *bdev,
+ sector_t from, sector_t size)
+{
+ unsigned int zone_size = bdev_zone_size(bdev);
+
+ /*
+ * If this function is called, then the disk is a zoned block device
+ * (host-aware or host-managed). This can be detected even if the
+ * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not
+ * set). In this case, however, only host-aware devices will be seen
+ * as a block device is not created for host-managed devices. Without
+ * zoned block device support, host-aware drives can still be used as
+ * regular block devices (no zone operation) and their zone size will
+ * be reported as 0. Allow this case.
+ */
+ if (!zone_size)
+ return true;
+
+ /*
+ * Check partition start and size alignement. If the drive has a
+ * smaller last runt zone, ignore it and allow the partition to
+ * use it. Check the zone size too: it should be a power of 2 number
+ * of sectors.
+ */
+ if (WARN_ON_ONCE(!is_power_of_2(zone_size))) {
+ u32 rem;
+
+ div_u64_rem(from, zone_size, &rem);
+ if (rem)
+ return false;
+ if ((from + size) < get_capacity(disk)) {
+ div_u64_rem(size, zone_size, &rem);
+ if (rem)
+ return false;
+ }
+
+ } else {
+
+ if (from & (zone_size - 1))
+ return false;
+ if ((from + size) < get_capacity(disk) &&
+ (size & (zone_size - 1)))
+ return false;
+
+ }
+
+ return true;
+}
+
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct parsed_partitions *state = NULL;
@@ -529,6 +579,21 @@ rescan:
}
}
+ /*
+ * On a zoned block device, partitions should be aligned on the
+ * device zone size (i.e. zone boundary crossing not allowed).
+ * Otherwise, resetting the write pointer of the last zone of
+ * one partition may impact the following partition.
+ */
+ if (bdev_is_zoned(bdev) &&
+ !part_zone_aligned(disk, bdev, from, size)) {
+ printk(KERN_WARNING
+ "%s: p%d start %llu+%llu is not zone aligned\n",
+ disk->disk_name, p, (unsigned long long) from,
+ (unsigned long long) size);
+ continue;
+ }
+
part = add_partition(disk, p, from, size,
state->parts[p].flags,
&state->parts[p].info);