From 34b48db66e08ca1c1bc07cf305d672ac940268dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Sep 2014 16:08:05 -0700 Subject: block: remove artificial max_hw_sectors cap Set max_sectors to the value the driver provides as the hardware limit by default. Linux has had proper I/O throttling for a long time and doesn't rely on an artificially small maximum I/O size anymore. By not limiting the I/O size by default we remove an annoying tuning step required for most Linux installations. Note that both the user and, if absolutely required, the driver can still impose a limit for FS requests below max_hw_sectors_kb. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index aa02247d227e..6ed2cbe5e8c9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; } EXPORT_SYMBOL(blk_limits_max_hw_sectors); -- cgit v1.2.3 From 74c450521dd8d245b982da62592a18aa6f88b045 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Oct 2014 11:14:52 -0600 Subject: blk-mq: add a 'list' parameter to ->queue_rq() Since we have the notion of a 'last' request in a chain, we can use this to have the hardware optimize the issuing of requests. Add a list_head parameter to queue_rq that the driver can use to temporarily store hw commands for issue when 'last' is true. If we are doing a chain of requests, pass in a NULL list for the first request to force issue of that immediately, then batch the remainder for deferred issue until the last request has been sent. Instead of adding yet another argument to the hot ->queue_rq path, encapsulate the passed arguments in a blk_mq_queue_data structure. This is passed as a constant, and has been tested as faster than passing 4 (or even 3) args through ->queue_rq. Update drivers for the new ->queue_rq() prototype. There are no functional changes in this patch for drivers - if they don't use the passed in list, then they will just queue requests individually like before. Signed-off-by: Jens Axboe --- block/blk-mq.c | 29 +++++++++++++++++++++++++++-- drivers/block/mtip32xx/mtip32xx.c | 5 +++-- drivers/block/null_blk.c | 10 +++++----- drivers/block/virtio_blk.c | 7 ++++--- drivers/scsi/scsi_lib.c | 5 +++-- include/linux/blk-mq.h | 8 +++++++- 6 files changed, 49 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 68929bad9a6a..7e5303820452 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -680,6 +680,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct request *rq; LIST_HEAD(rq_list); + LIST_HEAD(driver_list); + struct list_head *dptr; int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -705,17 +707,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) spin_unlock(&hctx->lock); } + /* + * Start off with dptr being NULL, so we start the first request + * immediately, even if we have more pending. + */ + dptr = NULL; + /* * Now process all the entries, sending them to the driver.
*/ queued = 0; while (!list_empty(&rq_list)) { + struct blk_mq_queue_data bd; int ret; rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); + bd.rq = rq; + bd.list = dptr; + bd.last = list_empty(&rq_list); + + ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -734,6 +747,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; + + /* + * We've done the first request. If we have more than 1 + * left in the list, set dptr to defer issue. + */ + if (!dptr && rq_list.next != rq_list.prev) + dptr = &driver_list; } if (!queued) @@ -1153,6 +1173,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (is_sync) { + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; int ret; blk_mq_bio_to_request(rq, bio); @@ -1162,7 +1187,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq, true); + ret = q->mq_ops->queue_rq(data.hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1bd5f523f8fd..3bd7ca9853a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *rq = bd->rq; int ret; if (unlikely(mtip_check_unal_depth(hctx, rq))) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 2671a3f02f0c..8433bc8ead3d 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - cmd->rq = rq; + cmd->rq = bd->rq; cmd->nq = hctx->driver_data; - blk_mq_start_request(rq); + blk_mq_start_request(bd->rq); null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c6a27d54ad62..cecd3f983e49 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -158,10 +158,11 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; @@ -222,7 +223,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); diff --git a/drivers/scsi/scsi_lib.c 
b/drivers/scsi/scsi_lib.c index 9eff8a375132..161dcc93ac75 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1858,9 +1858,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c9be1589415a..be01d7a687d4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -79,7 +79,13 @@ struct blk_mq_tag_set { struct list_head tag_list; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); +struct blk_mq_queue_data { + struct request *rq; + struct list_head *list; + bool last; +}; + +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3 From e167dfb53cb85fde7b15f644e9dbef7ba31896b6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Oct 2014 11:18:26 -0600 Subject: blk-mq: add BLK_MQ_F_DEFER_ISSUE support flag Drivers can now tell blk-mq if they take advantage of the deferred issue through 'last' or not. If they do, don't do queue-direct for sync IO. This is a preparation patch for the nvme conversion. Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++++++- include/linux/blk-mq.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7e5303820452..b355b5957cd7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1172,7 +1172,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } - if (is_sync) { + /* + * If the driver supports defer issued based on 'last', then + * queue it up like normal since we can potentially save some + * CPU this way. + */ + if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index be01d7a687d4..c3b64ec5321e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -146,6 +146,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, + BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, -- cgit v1.2.3 From 398205b8391b208f0034a392242867b28ad8af3d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Nov 2014 23:03:59 +0100 Subject: blk_mq: call preempt_disable/enable in blk_mq_run_hw_queue, and only if needed preempt_disable/enable surrounds every call to blk_mq_run_hw_queue, except the one in blk-flush.c. In fact that one is always asynchronous, and it does not need smp_processor_id(). We can do the same for all other calls, avoiding preempt_disable when async is true. This avoids peppering blk-mq.c with preemption-disabled regions. 
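For readers following the ->queue_rq() change above (struct blk_mq_queue_data with a 'list' pointer and a 'last' flag), here is a minimal userspace sketch of the deferred-issue idea; it is an analogue rather than kernel code, and every name in it is invented. Commands are staged locally and the imaginary doorbell is rung once, when 'last' says the chain is complete - the same pattern virtio_blk applies with bd->last and virtqueue_kick_prepare().

/*
 * Minimal userspace sketch of the deferred-issue pattern; all names
 * are invented and nothing here is kernel code.
 * Build: cc -std=c11 -o defer defer.c
 */
#include <stdbool.h>
#include <stdio.h>

struct cmd {
	int id;
};

struct queue_data {		/* analogue of struct blk_mq_queue_data */
	struct cmd *cmd;
	bool last;
};

#define RING_SIZE 16
static struct cmd *ring[RING_SIZE];
static int ring_fill;

static void ring_doorbell(void)
{
	printf("kick hardware: %d command(s)\n", ring_fill);
	ring_fill = 0;
}

/*
 * Analogue of a driver's ->queue_rq(): stage the command and only
 * notify the "hardware" when this is the last request of the chain
 * (or the staging ring is full).
 */
static int queue_cmd(const struct queue_data *qd)
{
	ring[ring_fill++] = qd->cmd;

	if (qd->last || ring_fill == RING_SIZE)
		ring_doorbell();

	return 0;
}

int main(void)
{
	struct cmd cmds[4] = { {1}, {2}, {3}, {4} };

	for (int i = 0; i < 4; i++) {
		struct queue_data qd = { .cmd = &cmds[i], .last = (i == 3) };

		queue_cmd(&qd);	/* one doorbell for the whole chain */
	}
	return 0;
}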
Cc: Jens Axboe Cc: Thomas Gleixner Reported-by: Clark Williams Tested-by: Clark Williams Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b355b5957cd7..8b309e81ed0f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -801,9 +801,18 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) - __blk_mq_run_hw_queue(hctx); - else if (hctx->queue->nr_hw_queues == 1) + if (!async) { + preempt_disable(); + if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + preempt_enable(); + return; + } + + preempt_enable(); + } + + if (hctx->queue->nr_hw_queues == 1) kblockd_schedule_delayed_work(&hctx->run_work, 0); else { unsigned int cpu; @@ -824,9 +833,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); @@ -853,9 +860,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, false); - preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -880,9 +885,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -- cgit v1.2.3 From 2a90d4aae5509e9cf1ba848c5d0b3458201160a0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Nov 2014 23:04:00 +0100 Subject: blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable blk-mq is using preempt_disable/enable in order to ensure that the queue runners are placed on the right CPU. This does not work with the RT patches, because __blk_mq_run_hw_queue takes a non-raw spinlock with the preemption-disabled region. If there is contention on the lock, this violates the rules for preemption-disabled regions. While this should be easily fixable within the RT patches just by doing migrate_disable/enable, we can do better and document _why_ this particular region runs with disabled preemption. After the previous patch, it is trivial to switch it to get/put_cpu; the RT patches then can change it to get_cpu_light, which lets virtio-blk run under RT kernels. 
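As a rough userspace analogue of the placement logic discussed above (run the queue inline only if the current CPU is in the hardware context's mask, otherwise punt to a worker), the following hedged sketch models the mask as a 64-bit word and uses sched_getcpu() in place of get_cpu(); all names are invented. Note that sched_getcpu() alone leaves exactly the migration window that get_cpu()/put_cpu() close in the kernel, which is the point of the patch.

/*
 * Userspace analogue of the placement check in blk_mq_run_hw_queue();
 * all names are invented. Build: cc -std=gnu11 -o place place.c
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct hw_ctx {
	uint64_t cpumask;	/* bit n set: CPU n may run this queue */
};

static void run_queue_inline(void)
{
	printf("running the queue directly on CPU %d\n", sched_getcpu());
}

static void punt_to_worker(void)
{
	printf("deferring the run to a worker on an allowed CPU\n");
}

static void run_hw_queue(struct hw_ctx *hctx, bool async)
{
	if (!async) {
		/*
		 * sched_getcpu() stands in for get_cpu(); unlike get_cpu()
		 * it does not pin the task, so the answer can be stale by
		 * the time we act on it - the race the kernel code avoids.
		 */
		int cpu = sched_getcpu();

		if (cpu >= 0 && cpu < 64 && (hctx->cpumask & (1ULL << cpu))) {
			run_queue_inline();
			return;		/* put_cpu() would go here */
		}
	}
	punt_to_worker();
}

int main(void)
{
	struct hw_ctx hctx = { .cpumask = ~0ULL };	/* every CPU allowed */

	run_hw_queue(&hctx, false);
	run_hw_queue(&hctx, true);
	return 0;
}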
Cc: Jens Axboe Cc: Thomas Gleixner Reported-by: Clark Williams Tested-by: Clark Williams Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8b309e81ed0f..06ab0683a1f1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -802,14 +802,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; if (!async) { - preempt_disable(); - if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - preempt_enable(); + put_cpu(); return; } - preempt_enable(); + put_cpu(); } if (hctx->queue->nr_hw_queues == 1) -- cgit v1.2.3 From 1a3b595a281a44be4074fe33b317a0a4854b4197 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:40:48 -0700 Subject: blk-mq: export blk_mq_free_request() Drivers that know they are blk-mq should just use this function instead of calling through blk_put_request(). Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 06ab0683a1f1..fdf12152946e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -280,6 +280,7 @@ void blk_mq_free_request(struct request *rq) hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); } +EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { -- cgit v1.2.3 From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:41:57 -0700 Subject: blk-mq: add blk_mq_free_hctx_request() It's silly to use blk_mq_free_request() which in turn maps the request to the hardware queue, for places where we already know what the hardware queue is. This saves us an extra mapping of a hardware queue on request completion, if the caller knows this information already. 
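A hedged, kernel-style fragment (not a standalone program) of the intended use: a caller that already knows the hardware context, for example when returning an internally allocated command, can free the request against that hctx directly instead of letting blk_mq_free_request() redo the ctx-to-hctx mapping. The driver name is invented.

/*
 * Kernel-style fragment, not a standalone program; 'mydrv' is an
 * invented driver name.
 */
#include <linux/blk-mq.h>

static void mydrv_put_internal_cmd(struct blk_mq_hw_ctx *hctx,
				   struct request *rq)
{
	/*
	 * rq was allocated by the driver (e.g. with blk_mq_alloc_request())
	 * and the hctx is already known here, so free against it directly
	 * rather than calling blk_mq_free_request() and paying for another
	 * map_queue() lookup.
	 */
	blk_mq_free_hctx_request(hctx, rq);
}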
Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 ++++++++++++----- include/linux/blk-mq.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fdf12152946e..4347aa2be6ae 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, blk_mq_queue_exit(q); } -void blk_mq_free_request(struct request *rq) +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; ctx->rq_completed[rq_is_sync(rq)]++; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); + +} +EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + blk_mq_free_hctx_request(hctx, rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c3b64ec5321e..fb0a4fb3dc2b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -169,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); -- cgit v1.2.3 From 5fabcb4c33fe11c7e3afdf805fde26c1a54d0953 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Nov 2014 13:06:22 -0700 Subject: genhd: check for int overflow in disk_expand_part_tbl() We can get here from blkdev_ioctl() -> blkpg_ioctl() -> add_partition() with a user passed in partno value. If we pass in 0x7fffffff, the new target in disk_expand_part_tbl() overflows the 'int' and we access beyond the end of ptbl->part[] and even write to it when we do the rcu_assign_pointer() to assign the new partition. Reported-by: David Ramos Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index bd3060684ab2..0a536dc05f3b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. + */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) -- cgit v1.2.3 From b657d7e632e0bc40e5e231332be39d69b2f1a0bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Nov 2014 09:27:23 +0100 Subject: blk-mq: handle the single queue case in blk_mq_hctx_next_cpu Don't duplicate the code to handle the not cpu bounce case in the caller, do it inside blk_mq_hctx_next_cpu instead. 
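The round-robin-with-batching behaviour of blk_mq_hctx_next_cpu(), shown in the diff below, can be modeled in a few lines of userspace C. This sketch uses a 64-bit word for the CPU mask, invents all names, and leaves out the single-hardware-queue case that simply returns WORK_CPU_UNBOUND.

/*
 * Userspace model of blk_mq_hctx_next_cpu(); names are invented and a
 * 64-bit word stands in for the CPU mask.
 * Build: cc -std=c11 -o nextcpu nextcpu.c
 */
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS		8
#define WORK_BATCH	4	/* stands in for BLK_MQ_CPU_WORK_BATCH */

struct hw_ctx {
	uint64_t cpumask;
	int next_cpu;
	int next_cpu_batch;
};

static int next_set_bit(uint64_t mask, int from)
{
	for (int cpu = from; cpu < NR_CPUS; cpu++)
		if (mask & (1ULL << cpu))
			return cpu;
	return NR_CPUS;		/* like cpumask_next() returning nr_cpu_ids */
}

static int hctx_next_cpu(struct hw_ctx *hctx)
{
	if (--hctx->next_cpu_batch <= 0) {
		int cpu = hctx->next_cpu;
		int next = next_set_bit(hctx->cpumask, hctx->next_cpu + 1);

		if (next >= NR_CPUS)	/* wrap to the first CPU in the mask */
			next = next_set_bit(hctx->cpumask, 0);

		hctx->next_cpu = next;
		hctx->next_cpu_batch = WORK_BATCH;
		return cpu;	/* the old CPU gets the final run of its batch */
	}
	return hctx->next_cpu;
}

int main(void)
{
	struct hw_ctx hctx = {
		.cpumask = (1 << 1) | (1 << 3) | (1 << 5),
		.next_cpu = 1,
		.next_cpu_batch = WORK_BATCH,
	};

	for (int i = 0; i < 12; i++)
		printf("dispatch %2d -> CPU %d\n", i, hctx_next_cpu(&hctx));
	return 0;
}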
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4347aa2be6ae..27a347fe8f5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -788,10 +788,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { - int cpu = hctx->next_cpu; + if (hctx->queue->nr_hw_queues == 1) + return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int next_cpu; + int cpu = hctx->next_cpu, next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -799,9 +800,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + + return cpu; } - return cpu; + return hctx->next_cpu; } void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) @@ -820,14 +823,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->run_work, 0); - else { - unsigned int cpu; - - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); - } + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work, 0); } void blk_mq_run_queues(struct request_queue *q, bool async) @@ -919,16 +916,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - unsigned long tmo = msecs_to_jiffies(msecs); - - if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->delay_work, tmo); - else { - unsigned int cpu; - - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); - } + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_queue); -- cgit v1.2.3 From 394ffa503bc40e32d7f54a9b817264e81ce131b4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 24 Nov 2014 11:05:22 +0800 Subject: blk: introduce generic io stat accounting helper function Many block drivers account io stats based on the bio (e.g. NVMe...), so the request-based blk_account_io_start/end() does not make sense for them. Introduce similar helper functions, named generic_start/end_io_acct(), that work on raw sectors; they can simplify some drivers' open-coded io accounting.
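A hedged, kernel-style sketch (not a standalone program) of how a bio-based driver might use the two new helpers; the driver structure, names and the synchronous completion are invented for illustration, and a real driver would record the start time per bio and complete asynchronously.

/*
 * Kernel-style sketch, not a standalone program; 'mydev' and its
 * fields are invented.
 */
#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

struct mydev {
	struct gendisk *disk;
};

static void mydev_end_bio(struct mydev *dev, struct bio *bio,
			  unsigned long start_time)
{
	/* accounts ticks and drops the in-flight counter for part0 */
	generic_end_io_acct(bio_data_dir(bio), &dev->disk->part0, start_time);
	bio_endio(bio, 0);
}

static void mydev_make_request(struct mydev *dev, struct bio *bio)
{
	unsigned long start_time = jiffies;

	/* accounts ios/sectors and bumps the in-flight counter */
	generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
			      &dev->disk->part0);

	/* ... hand the bio to the device here ... */

	mydev_end_bio(dev, bio, start_time);
}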
Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe --- block/bio.c | 28 ++++++++++++++++++++++++++++ include/linux/bio.h | 5 +++++ 2 files changed, 33 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3e6e1986a5b2..3d4a072375ef 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1739,6 +1739,34 @@ void bio_check_pages_dirty(struct bio *bio) } } +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + int cpu = part_stat_lock(); + + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + part_inc_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_start_io_acct); + +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_end_io_acct); + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 7347f486ceca..efead0b532c4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part); +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time); + #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif -- cgit v1.2.3 From a33c1ba2913802b6fb23e974bb2f6a4e73c8b7ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Nov 2014 15:02:42 -0700 Subject: blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map We currently use num_possible_cpus(), but that breaks on sparc64 where the CPU ID space is discontig. Use nr_cpu_ids as the highest CPU ID instead, so we don't end up reading from invalid memory. Cc: stable@kernel.org # 3.13+ Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1065d7c65fa1..72e5ed691e37 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) unsigned int *map; /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, set->numa_node); if (!map) return NULL; -- cgit v1.2.3 From 70114c393ccaa43ca38e6b36b9469ed2c35acc49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Nov 2014 15:52:30 -0700 Subject: blk-mq: cleanup tag free handling We only call __blk_mq_put_tag() and __blk_mq_put_reserved_tag() from blk_mq_put_tag(), so just inline the two calls instead of having them as separate functions. 
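The consolidated tag-free path (see the diff below) splits the tag space into reserved and normal tags and offsets normal tags by nr_reserved_tags. A small userspace model of that arithmetic, with invented names and 64-bit words standing in for the tag bitmaps:

/*
 * Userspace model of the consolidated blk_mq_put_tag() logic; names
 * are invented. Build: cc -std=c11 -o puttag puttag.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct tag_set {
	unsigned int nr_reserved;
	unsigned int nr_normal;
	uint64_t reserved_map;	/* bit i set: reserved tag i in use */
	uint64_t normal_map;	/* bit i set: normal tag i in use */
};

static void put_tag(struct tag_set *ts, unsigned int tag, unsigned int *last_tag)
{
	if (tag >= ts->nr_reserved) {
		unsigned int real_tag = tag - ts->nr_reserved;

		assert(real_tag < ts->nr_normal);	/* BUG_ON() analogue */
		ts->normal_map &= ~(1ULL << real_tag);	/* bt_clear_tag()    */
		*last_tag = real_tag;			/* hint for the next get */
	} else {
		ts->reserved_map &= ~(1ULL << tag);
	}
}

int main(void)
{
	struct tag_set ts = {
		.nr_reserved = 2, .nr_normal = 8,
		.reserved_map = 0x3, .normal_map = 0x0f,
	};
	unsigned int last_tag = 0;

	put_tag(&ts, 5, &last_tag);	/* frees normal tag 3   */
	put_tag(&ts, 1, &last_tag);	/* frees reserved tag 1 */
	printf("normal=0x%llx reserved=0x%llx last=%u\n",
	       (unsigned long long)ts.normal_map,
	       (unsigned long long)ts.reserved_map, last_tag);
	return 0;
}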
Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8317175a3009..230ef3056b72 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -360,21 +360,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) } } -static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) -{ - BUG_ON(tag >= tags->nr_tags); - - bt_clear_tag(&tags->bitmap_tags, tag); -} - -static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, - unsigned int tag) -{ - BUG_ON(tag >= tags->nr_reserved_tags); - - bt_clear_tag(&tags->breserved_tags, tag); -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -383,10 +368,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; - __blk_mq_put_tag(tags, real_tag); + BUG_ON(real_tag >= tags->nr_tags); + bt_clear_tag(&tags->bitmap_tags, real_tag); *last_tag = real_tag; - } else - __blk_mq_put_reserved_tag(tags, tag); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + bt_clear_tag(&tags->breserved_tags, tag); + } } static void bt_for_each(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3 From 6637fadf25657e619a50fde5ff3ae09a98d20eb5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 30 Nov 2014 16:00:58 -0800 Subject: blk-mq: move the kdump check to blk_mq_alloc_tag_set We call blk_mq_alloc_tag_set() first then blk_mq_init_queue(). The requests are allocated in the former function. So the kdump check should be moved to there to really save memory. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 27a347fe8f5b..4854e709aa34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1804,16 +1804,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); - /* - * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. - */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->queue_depth = min(64U, set->queue_depth); - } - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); @@ -2070,6 +2060,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); -- cgit v1.2.3 From b32232073e8061b41258bff2a10a06a91677480a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Dec 2014 08:46:34 -0700 Subject: blk-mq: fix hang in bt_get() Avoid that if there are fewer hardware queues than CPU threads that bt_get() can hang. The symptoms of the hang were as follows: * All tags allocated for a particular hardware queue. * (nr_tags) pending commands for that hardware queue. * No pending commands for the software queues associated with that hardware queue. 
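The fix for the hang described above, together with the re-check patch that follows, amounts to: when tag allocation fails, kick the hardware queue so in-flight IO can complete and return tags, then retry before deciding to sleep. A single-threaded userspace model with an invented completion model and names:

/*
 * Single-threaded userspace model of the retry-after-kick logic.
 * Build: cc -std=c11 -o btget btget.c
 */
#include <stdint.h>
#include <stdio.h>

#define NR_TAGS 4

static uint32_t tag_map = (1u << NR_TAGS) - 1;	/* all tags in use */
static int completions_pending = 2;		/* IO the "hardware" can finish */

static int try_get_tag(void)
{
	for (int t = 0; t < NR_TAGS; t++) {
		if (!(tag_map & (1u << t))) {
			tag_map |= 1u << t;
			return t;
		}
	}
	return -1;
}

/* Stand-in for blk_mq_run_hw_queue(): completing a request frees its tag. */
static void run_hw_queue(void)
{
	if (completions_pending > 0) {
		completions_pending--;
		tag_map &= tag_map - 1;		/* free the lowest busy tag */
	}
}

static int get_tag(void)
{
	int tag = try_get_tag();

	if (tag != -1)
		return tag;

	run_hw_queue();			/* kick pending IO before sleeping */

	tag = try_get_tag();		/* re-check before going to sleep */
	if (tag != -1)
		return tag;

	printf("nothing freed, would io_schedule() here and loop\n");
	return -1;
}

int main(void)
{
	printf("got tag %d\n", get_tag());
	printf("got tag %d\n", get_tag());
	printf("got tag %d\n", get_tag());
	return 0;
}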
Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 230ef3056b72..eb55492e6875 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -254,6 +254,13 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) break; + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. + */ + blk_mq_run_hw_queue(hctx, false); + blk_mq_put_ctx(data->ctx); io_schedule(); -- cgit v1.2.3 From 080ff3511450fd73948697fef34a3cc382675b59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Dec 2014 08:49:06 -0700 Subject: blk-mq: re-check for available tags after running the hardware queue If we run out of tags and have to sleep, we run the hardware queue to kick pending IO into gear. During that run, we may have completed requests, so re-check if we have free tags before going to sleep. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index eb55492e6875..bab4bff15f42 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -261,6 +261,14 @@ static int bt_get(struct blk_mq_alloc_data *data, */ blk_mq_run_hw_queue(hctx, false); + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + blk_mq_put_ctx(data->ctx); io_schedule(); -- cgit v1.2.3 From 19c66e59ce57e7b181625cbb408d48eb10837763 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 3 Dec 2014 19:38:04 +0800 Subject: blk-mq: prevent unmapped hw queue from being scheduled When one hardware queue has no mapped software queues, it shouldn't have been scheduled. Otherwise a WARNING or OOPS can be triggered. The blk_mq_hw_queue_mapped() helper is introduced to fix the problem.
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- block/blk-mq.h | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4854e709aa34..b21a3b6f7b65 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -589,7 +589,7 @@ static void blk_mq_rq_timer(unsigned long priv) * If not software queues are currently mapped to this * hardware queue, there's nothing to check */ - if (!hctx->nr_ctx || !hctx->tags) + if (!blk_mq_hw_queue_mapped(hctx)) continue; blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); @@ -809,7 +809,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || + !blk_mq_hw_queue_mapped(hctx))) return; if (!async) { @@ -916,6 +917,9 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { + if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + return; + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } diff --git a/block/blk-mq.h b/block/blk-mq.h index d567d5283ffa..206230e64f79 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} + #endif -- cgit v1.2.3 From 45a9c9d909b24c6ad0e28a7946e7486e73010319 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:57:48 +0100 Subject: blk-mq: Fix a use-after-free blk-mq users are allowed to free the memory request_queue.tag_set points at after blk_cleanup_queue() has finished but before blk_release_queue() has started. This can happen e.g. in the SCSI core. The SCSI core namely embeds the tag_set structure in a SCSI host structure. The SCSI host structure is freed by scsi_host_dev_release(). This function is called after blk_cleanup_queue() finished but can be called before blk_release_queue(). This means that it is not safe to access request_queue.tag_set from inside blk_release_queue(). Hence remove the blk_sync_queue() call from blk_release_queue(). This call is not necessary - outstanding requests must have finished before blk_release_queue() is called. Additionally, move the blk_mq_free_queue() call from blk_release_queue() to blk_cleanup_queue() to avoid that struct request_queue.tag_set gets accessed after it has been freed. 
This patch avoids that the following kernel oops can be triggered when deleting a SCSI host for which scsi-mq was enabled: Call Trace: [] lock_acquire+0xc4/0x270 [] mutex_lock_nested+0x61/0x380 [] blk_mq_free_queue+0x30/0x180 [] blk_release_queue+0x84/0xd0 [] kobject_cleanup+0x7b/0x1a0 [] kobject_put+0x30/0x70 [] blk_put_queue+0x15/0x20 [] disk_release+0x99/0xd0 [] device_release+0x36/0xb0 [] kobject_cleanup+0x7b/0x1a0 [] kobject_put+0x30/0x70 [] put_disk+0x1a/0x20 [] __blkdev_put+0x135/0x1b0 [] blkdev_put+0x50/0x160 [] kill_block_super+0x44/0x70 [] deactivate_locked_super+0x44/0x60 [] deactivate_super+0x4e/0x70 [] cleanup_mnt+0x43/0x90 [] __cleanup_mnt+0x12/0x20 [] task_work_run+0xac/0xe0 [] do_notify_resume+0x61/0xa0 [] int_signal+0x12/0x17 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ block/blk-sysfs.c | 12 ++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53e6431..93f9152fc271 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1fac43408911..935ea2aa0730 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * Currently, its primary task it to free all the &struct request * structures that were allocated to the queue and the queue itself. * - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... + * Note: + * The low level driver must have finished any outstanding requests first + * via blk_cleanup_queue(). **/ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - blk_sync_queue(q); - blkcg_exit_queue(q); if (q->elevator) { @@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (q->mq_ops) - blk_mq_free_queue(q); - else + if (!q->mq_ops) blk_free_flush_queue(q->fq); blk_trace_shutdown(q); -- cgit v1.2.3 From 9e98e9d7cf6e9d2ec1cce45e8d5ccaf3f9b386f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:58:11 +0100 Subject: blk-mq: Avoid that __bt_get_word() wraps multiple times If __bt_get_word() is called with last_tag != 0, if the first find_next_zero_bit() fails, if after wrap-around the test_and_set_bit() call fails and find_next_zero_bit() succeeds, if the next test_and_set_bit() call fails and subsequently find_next_zero_bit() does not find a zero bit, then another wrap-around will occur. Avoid this by introducing an additional local variable. 
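The wrap-around logic can be modeled in userspace. This hedged sketch mirrors the fixed __bt_get_word() with an explicit 'wrap' flag so the search restarts at most once; names are invented, and the single-threaded test_and_set cannot fail here, which is exactly the race that allowed multiple wrap-arounds in the kernel.

/*
 * Userspace model of the fixed __bt_get_word().
 * Build: cc -std=c11 -o getword getword.c
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int find_next_zero_bit(uint64_t word, int depth, int from)
{
	for (int bit = from; bit < depth; bit++)
		if (!(word & (1ULL << bit)))
			return bit;
	return depth;			/* no free bit in [from, depth) */
}

static int bt_get_word(uint64_t *word, int depth, int last_tag)
{
	int org_last_tag = last_tag;
	int end = depth;
	bool wrap = last_tag != 0;	/* the fix: may wrap at most once */

	for (;;) {
		int tag = find_next_zero_bit(*word, end, last_tag);

		if (tag >= end) {
			if (!wrap)
				return -1;	/* whole word searched */
			wrap = false;
			end = org_last_tag;	/* finish [0, org_last_tag) */
			last_tag = 0;
			continue;
		}

		/*
		 * test_and_set_bit(); single-threaded it always succeeds,
		 * but in the kernel a racing CPU can steal the bit, which
		 * is what made repeated wrap-arounds possible before.
		 */
		*word |= 1ULL << tag;
		return tag;
	}
}

int main(void)
{
	uint64_t word = 0xf0;		/* bits 4-7 busy */

	printf("tag %d\n", bt_get_word(&word, 8, 5));	/* wraps once, gets 0 */
	printf("tag %d\n", bt_get_word(&word, 8, 5));	/* wraps once, gets 1 */
	return 0;
}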
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bab4bff15f42..0f5e22a7971f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; + bool wrap = last_tag != 0; org_last_tag = last_tag; end = bm->depth; @@ -148,8 +149,9 @@ restart: * We started with an offset, start from 0 to * exhaust the map. */ - if (org_last_tag && last_tag) { - end = last_tag; + if (wrap) { + wrap = false; + end = org_last_tag; last_tag = 0; goto restart; } -- cgit v1.2.3 From c38d185d4af12e8be63ca4b6745d99449c450f12 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:58:35 +0100 Subject: blk-mq: Fix a race between bt_clear_tag() and bt_get() What we need is the following two guarantees: * Any thread that observes the effect of the test_and_set_bit() by __bt_get_word() also observes the preceding addition of 'current' to the appropriate wait list. This is guaranteed by the semantics of the spin_unlock() operation performed by prepare_and_wait(). Hence the conversion of test_and_set_bit_lock() into test_and_set_bit(). * The wait lists are examined by bt_clear() after the tag bit has been cleared. clear_bit_unlock() guarantees that any thread that observes that the bit has been cleared also observes the store operations preceding clear_bit_unlock(). However, clear_bit_unlock() does not prevent that the wait lists are examined before that the tag bit is cleared. Hence the addition of a memory barrier between clear_bit() and the wait list examination. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Cc: # v3.13+ Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0f5e22a7971f..e47c4c75fd33 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -158,7 +158,7 @@ restart: return -1; } last_tag = tag + 1; - } while (test_and_set_bit_lock(tag, &bm->word)); + } while (test_and_set_bit(tag, &bm->word)); return tag; } @@ -357,11 +357,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) struct bt_wait_state *bs; int wait_cnt; - /* - * The unlock memory barrier need to order access to req in free - * path and clearing tag bit - */ - clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); bs = bt_wake_ptr(bt); if (!bs) -- cgit v1.2.3 From 52f7eb945f2ba62b324bb9ae16d945326a961dcf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:59:48 +0100 Subject: blk-mq: Micro-optimize bt_get() Remove a superfluous finish_wait() call. Convert the two bt_wait_ptr() calls into a single call. 
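For the bt_clear_tag()/bt_get() ordering patch above, a userspace sketch with C11 atomics of the two guarantees: the releaser clears the bit and issues a full barrier before looking at the waiter count, and the getter registers as a waiter before re-testing the bit. All names are invented and the wait/wake machinery is reduced to a counter and printf().

/*
 * Userspace sketch of the release/acquire ordering with C11 atomics.
 * Build: cc -std=c11 -o order order.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong tag_word = 1;	/* bit 0 set: tag 0 in use */
static atomic_int waiters = 0;		/* tasks parked waiting for a tag */

/* Releaser, modeled on bt_clear_tag(). */
static void release_tag(unsigned int tag)
{
	atomic_fetch_and(&tag_word, ~(1UL << tag));	/* clear_bit()        */
	atomic_thread_fence(memory_order_seq_cst);	/* the added smp_mb() */
	if (atomic_load(&waiters))
		printf("wake_up() a waiter\n");
}

/*
 * Getter, modeled on bt_get(): register as a waiter before re-testing
 * the bit; in the kernel prepare_to_wait()'s internal locking provides
 * this ordering, which is why plain test_and_set_bit() is enough.
 */
static bool get_tag_or_wait(unsigned int tag)
{
	atomic_fetch_add(&waiters, 1);			/* prepare_to_wait() */
	atomic_thread_fence(memory_order_seq_cst);

	bool was_set = atomic_fetch_or(&tag_word, 1UL << tag) & (1UL << tag);

	atomic_fetch_sub(&waiters, 1);			/* finish_wait() */
	return !was_set;				/* true: got the tag */
}

int main(void)
{
	release_tag(0);
	printf("tag 0 acquired: %s\n", get_tag_or_wait(0) ? "yes" : "no");
	return 0;
}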
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Robert Elliott Cc: Ming Lei Cc: Alexander Gordeev Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e47c4c75fd33..1b7229f9354a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -248,8 +248,8 @@ static int bt_get(struct blk_mq_alloc_data *data, if (!(data->gfp & __GFP_WAIT)) return -1; - bs = bt_wait_ptr(bt, hctx); do { + bs = bt_wait_ptr(bt, hctx); prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag); @@ -285,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data, hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); } while (1); finish_wait(&bs->wait, &wait); -- cgit v1.2.3 From 959f5f5b2fa7ac3bdd37c91076e560c06513f1e6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Dec 2014 16:59:21 +0100 Subject: blk-mq: Use all available hardware queues Suppose that a system has two CPU sockets, three cores per socket, that it does not support hyperthreading and that four hardware queues are provided by a block driver. With the current algorithm this will lead to the following assignment of CPU cores to hardware queues: HWQ 0: 0 1 HWQ 1: 2 3 HWQ 2: 4 5 HWQ 3: (none) This patch changes the queue assignment into: HWQ 0: 0 1 HWQ 1: 2 HWQ 2: 3 4 HWQ 3: 5 In other words, this patch has the following three effects: - All four hardware queues are used instead of only three. - CPU cores are spread more evenly over hardware queues. For the above example the range of the number of CPU cores associated with a single HWQ is reduced from [0..2] to [1..2]. - If the number of HWQ's is a multiple of the number of CPU sockets it is now guaranteed that all CPU cores associated with a single HWQ reside on the same CPU socket. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: Alexander Gordeev Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 72e5ed691e37..5f13f4d0bcce 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -17,7 +17,7 @@ static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, const int cpu) { - return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); + return cpu * nr_queues / nr_cpus; } static int get_first_sibling(unsigned int cpu) -- cgit v1.2.3 From 06a41a99d13d8e919e9a00a4849e6b85ae492592 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Dec 2014 16:38:30 +0100 Subject: blk-mq: Fix uninitialized kobject at CPU hotplugging When a CPU is hotplugged, the current blk-mq spews a warning like: kobject '(null)' (ffffe8ffffc8b5d8): tried to add an uninitialized object, something is seriously wrong. 
CPU: 1 PID: 1386 Comm: systemd-udevd Not tainted 3.18.0-rc7-2.g088d59b-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_171129-lamiak 04/01/2014 0000000000000000 0000000000000002 ffffffff81605f07 ffffe8ffffc8b5d8 ffffffff8132c7a0 ffff88023341d370 0000000000000020 ffff8800bb05bd58 ffff8800bb05bd08 000000000000a0a0 000000003f441940 0000000000000007 Call Trace: [] dump_trace+0x86/0x330 [] show_stack_log_lvl+0x94/0x170 [] show_stack+0x21/0x50 [] dump_stack+0x41/0x51 [] kobject_add+0xa0/0xb0 [] blk_mq_register_hctx+0x91/0xb0 [] blk_mq_sysfs_register+0x3e/0x60 [] blk_mq_queue_reinit_notify+0xf8/0x190 [] notifier_call_chain+0x4c/0x70 [] cpu_notify+0x23/0x50 [] _cpu_up+0x157/0x170 [] cpu_up+0x89/0xb0 [] cpu_subsys_online+0x35/0x80 [] device_online+0x5d/0xa0 [] online_store+0x75/0x80 [] kernfs_fop_write+0xda/0x150 [] vfs_write+0xb2/0x1f0 [] SyS_write+0x42/0xb0 [] system_call_fastpath+0x16/0x1b [<00007f0132fb24e0>] 0x7f0132fb24e0 This is indeed because of an uninitialized kobject for blk_mq_ctx. The blk_mq_ctx kobjects are initialized in blk_mq_sysfs_init(), but it goes loop over hctx_for_each_ctx(), i.e. it initializes only for online CPUs. Thus, when a CPU is hotplugged, the ctx for the newly onlined CPU is registered without initialization. This patch fixes the issue by initializing the all ctx kobjects belonging to each queue. Bugzilla: https://bugzilla.novell.com/show_bug.cgi?id=908794 Cc: Signed-off-by: Takashi Iwai Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 371d8800b48a..1630a20d5dcf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - int i, j; + int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - hctx_for_each_ctx(hctx, ctx, j) - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - } + queue_for_each_ctx(q, ctx, i) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } /* see blk_register_queue() */ -- cgit v1.2.3 From fcbf6a087a7e4d3f03d28333678a1010810a53c3 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Wed, 10 Dec 2014 14:16:53 -0800 Subject: bio: modify __bio_add_page() to accept pages that don't start a new segment The original behaviour is to refuse to add a new page if the maximum number of segments has been reached, regardless of the fact the page we are going to add can be merged into the last segment or not. Unfortunately, when the system runs under heavy memory fragmentation conditions, a driver may try to add multiple pages to the last segment. The original code won't accept them and EBUSY will be reported to userspace. This patch modifies the function so it refuses to add a page only in case the latter starts a new segment and the maximum number of segments has already been reached. 
The bug can be easily reproduced with the st driver: 1) set CONFIG_SCSI_MPT2SAS_MAX_SGE or CONFIG_SCSI_MPT3SAS_MAX_SGE to 16 2) modprobe st buffer_kbs=1024 3) #dd if=/dev/zero of=/dev/st0 bs=1M count=10 dd: error writing `/dev/st0': Device or resource busy Signed-off-by: Maurizio Lombardi Signed-off-by: Ming Lei Cc: Jet Chen Cc: Tomas Henzl Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/bio.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3d4a072375ef..471d7382c7d1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } } + bio->bi_iter.bi_size += len; goto done; } @@ -764,28 +765,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + bio->bi_iter.bi_size += len; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). */ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the @@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, + .bi_size = bio->bi_iter.bi_size - len, .bi_rw = bio->bi_rw, }; @@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: - bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + bio->bi_iter.bi_size -= len; + blk_recount_segments(q, bio); + return 0; } /** -- cgit v1.2.3
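A simplified userspace model of the __bio_add_page() change above: a page that is physically contiguous with the last biovec is merged and never counts against the segment limit, while a page that starts a new segment still does. The structures and limits are stand-ins and the numbers are arbitrary.

/*
 * Simplified userspace model of the segment accounting in __bio_add_page().
 * Build: cc -std=c11 -o addpage addpage.c
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_SEGMENTS	2
#define MAX_VECS	8
#define PAGE_SZ		4096u

struct vec {
	unsigned long phys;	/* physical address of the fragment */
	unsigned int len;
};

struct mini_bio {
	struct vec vecs[MAX_VECS];
	int vcnt;
	int phys_segments;
	unsigned int size;
};

static bool add_page(struct mini_bio *bio, unsigned long phys, unsigned int len)
{
	if (bio->vcnt) {
		struct vec *prev = &bio->vecs[bio->vcnt - 1];

		/*
		 * Contiguous with the last fragment: merge, no new segment,
		 * so the segment limit does not apply (the point of the fix).
		 */
		if (prev->phys + prev->len == phys) {
			prev->len += len;
			bio->size += len;
			return true;
		}
	}

	/* A genuinely new segment: only now do the limits apply. */
	if (bio->phys_segments >= MAX_SEGMENTS || bio->vcnt >= MAX_VECS)
		return false;

	bio->vecs[bio->vcnt].phys = phys;
	bio->vecs[bio->vcnt].len = len;
	bio->vcnt++;
	bio->phys_segments++;
	bio->size += len;
	return true;
}

int main(void)
{
	struct mini_bio bio = { .vcnt = 0 };
	unsigned long base = 0x100000;

	printf("%d\n", add_page(&bio, base, PAGE_SZ));			/* 1 */
	printf("%d\n", add_page(&bio, base + 8 * PAGE_SZ, PAGE_SZ));	/* 1 */
	printf("%d\n", add_page(&bio, base + 9 * PAGE_SZ, PAGE_SZ));	/* 1: merges */
	printf("%d\n", add_page(&bio, base + 20 * PAGE_SZ, PAGE_SZ));	/* 0: new segment */
	printf("segments=%d size=%u\n", bio.phys_segments, bio.size);
	return 0;
}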