From 34b48db66e08ca1c1bc07cf305d672ac940268dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 6 Sep 2014 16:08:05 -0700
Subject: block: remove artifical max_hw_sectors cap

Set max_sectors to the value the drivers provides as hardware limit by
default.  Linux had proper I/O throttling for a long time and doesn't
rely on a artifically small maximum I/O size anymore.  By not limiting
the I/O size by default we remove an annoying tuning step required for
most Linux installation.

Note that both the user, and if absolutely required the driver can still
impose a limit for FS requests below max_hw_sectors_kb.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0207a78a8d82..74d14dba6fb7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1186,7 +1186,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 enum blk_default_limits {
 	BLK_MAX_SEGMENTS	= 128,
 	BLK_SAFE_MAX_SECTORS	= 255,
-	BLK_DEF_MAX_SECTORS	= 1024,
 	BLK_MAX_SEGMENT_SIZE	= 65536,
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
-- 
cgit v1.2.3


From 74c450521dd8d245b982da62592a18aa6f88b045 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 29 Oct 2014 11:14:52 -0600
Subject: blk-mq: add a 'list' parameter to ->queue_rq()

Since we have the notion of a 'last' request in a chain, we can use
this to have the hardware optimize the issuing of requests. Add
a list_head parameter to queue_rq that the driver can use to
temporarily store hw commands for issue when 'last' is true. If we
are doing a chain of requests, pass in a NULL list for the first
request to force issue of that immediately, then batch the remainder
for deferred issue until the last request has been sent.

Instead of adding yet another argument to the hot ->queue_rq path,
encapsulate the passed arguments in a blk_mq_queue_data structure.
This is passed as a constant, and has been tested as faster than
passing 4 (or even 3) args through ->queue_rq. Update drivers for
the new ->queue_rq() prototype. There are no functional changes
in this patch for drivers - if they don't use the passed in list,
then they will just queue requests individually like before.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c                    | 29 +++++++++++++++++++++++++++--
 drivers/block/mtip32xx/mtip32xx.c |  5 +++--
 drivers/block/null_blk.c          | 10 +++++-----
 drivers/block/virtio_blk.c        |  7 ++++---
 drivers/scsi/scsi_lib.c           |  5 +++--
 include/linux/blk-mq.h            |  8 +++++++-
 6 files changed, 49 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 68929bad9a6a..7e5303820452 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -680,6 +680,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
 	LIST_HEAD(rq_list);
+	LIST_HEAD(driver_list);
+	struct list_head *dptr;
 	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -705,17 +707,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_unlock(&hctx->lock);
 	}
 
+	/*
+	 * Start off with dptr being NULL, so we start the first request
+	 * immediately, even if we have more pending.
+	 */
+	dptr = NULL;
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
 	queued = 0;
 	while (!list_empty(&rq_list)) {
+		struct blk_mq_queue_data bd;
 		int ret;
 
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
-		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+		bd.rq = rq;
+		bd.list = dptr;
+		bd.last = list_empty(&rq_list);
+
+		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
 		case BLK_MQ_RQ_QUEUE_OK:
 			queued++;
@@ -734,6 +747,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 			break;
+
+		/*
+		 * We've done the first request. If we have more than 1
+		 * left in the list, set dptr to defer issue.
+		 */
+		if (!dptr && rq_list.next != rq_list.prev)
+			dptr = &driver_list;
 	}
 
 	if (!queued)
@@ -1153,6 +1173,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (is_sync) {
+		struct blk_mq_queue_data bd = {
+			.rq = rq,
+			.list = NULL,
+			.last = 1
+		};
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
@@ -1162,7 +1187,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 * error (busy), just add it to our list as we previously
 		 * would have done
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+		ret = q->mq_ops->queue_rq(data.hctx, &bd);
 		if (ret == BLK_MQ_RQ_QUEUE_OK)
 			goto done;
 		else {
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1bd5f523f8fd..3bd7ca9853a8 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
 	return false;
 }
 
-static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
-		bool last)
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
 {
+	struct request *rq = bd->rq;
 	int ret;
 
 	if (unlikely(mtip_check_unal_depth(hctx, rq)))
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 2671a3f02f0c..8433bc8ead3d 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q)
 	}
 }
 
-static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
-		bool last)
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
 {
-	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
-	cmd->rq = rq;
+	cmd->rq = bd->rq;
 	cmd->nq = hctx->driver_data;
 
-	blk_mq_start_request(rq);
+	blk_mq_start_request(bd->rq);
 
 	null_handle_cmd(cmd);
 	return BLK_MQ_RQ_QUEUE_OK;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c6a27d54ad62..cecd3f983e49 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -158,10 +158,11 @@ static void virtblk_done(struct virtqueue *vq)
 	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
-static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-		bool last)
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+			   const struct blk_mq_queue_data *bd)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct request *req = bd->rq;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
@@ -222,7 +223,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
-	if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
+	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
 		notify = true;
 	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9eff8a375132..161dcc93ac75 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1858,9 +1858,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
 	blk_mq_complete_request(cmd->request);
 }
 
-static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-		bool last)
+static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
 {
+	struct request *req = bd->rq;
 	struct request_queue *q = req->q;
 	struct scsi_device *sdev = q->queuedata;
 	struct Scsi_Host *shost = sdev->host;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c9be1589415a..be01d7a687d4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -79,7 +79,13 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 };
 
-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
+struct blk_mq_queue_data {
+	struct request *rq;
+	struct list_head *list;
+	bool last;
+};
+
+typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
-- 
cgit v1.2.3


From e167dfb53cb85fde7b15f644e9dbef7ba31896b6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 29 Oct 2014 11:18:26 -0600
Subject: blk-mq: add BLK_MQ_F_DEFER_ISSUE support flag

Drivers can now tell blk-mq if they take advantage of the deferred
issue through 'last' or not. If they do, don't do queue-direct
for sync IO. This is a preparation patch for the nvme conversion.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 7 ++++++-
 include/linux/blk-mq.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7e5303820452..b355b5957cd7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1172,7 +1172,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
-	if (is_sync) {
+	/*
+	 * If the driver supports defer issued based on 'last', then
+	 * queue it up like normal since we can potentially save some
+	 * CPU this way.
+	 */
+	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
 		struct blk_mq_queue_data bd = {
 			.rq = rq,
 			.list = NULL,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index be01d7a687d4..c3b64ec5321e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -146,6 +146,7 @@ enum {
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_SYSFS_UP	= 1 << 3,
+	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
-- 
cgit v1.2.3


From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 17 Nov 2014 10:41:57 -0700
Subject: blk-mq: add blk_mq_free_hctx_request()

It's silly to use blk_mq_free_request() which in turn maps the
request to the hardware queue, for places where we already know
what the hardware queue is. This saves us an extra mapping of a
hardware queue on request completion, if the caller knows this
information already.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 17 ++++++++++++-----
 include/linux/blk-mq.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fdf12152946e..4347aa2be6ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q = rq->q;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	__blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q = rq->q;
+
+	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+	blk_mq_free_hctx_request(hctx, rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c3b64ec5321e..fb0a4fb3dc2b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
-- 
cgit v1.2.3


From 394ffa503bc40e32d7f54a9b817264e81ce131b4 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Mon, 24 Nov 2014 11:05:22 +0800
Subject: blk: introduce generic io stat accounting help function

Many block drivers accounting io stat based on bio (e.g. NVMe...),
the blk_account_io_start/end() which is based on request
does not make sense to them, so here we introduce the similar help
function named generic_start/end_io_acct base on raw sectors, and it can
simplify some driver's open io accounting code.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 28 ++++++++++++++++++++++++++++
 include/linux/bio.h |  5 +++++
 2 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 3e6e1986a5b2..3d4a072375ef 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1739,6 +1739,34 @@ void bio_check_pages_dirty(struct bio *bio)
 	}
 }
 
+void generic_start_io_acct(int rw, unsigned long sectors,
+			   struct hd_struct *part)
+{
+	int cpu = part_stat_lock();
+
+	part_round_stats(cpu, part);
+	part_stat_inc(cpu, part, ios[rw]);
+	part_stat_add(cpu, part, sectors[rw], sectors);
+	part_inc_in_flight(part, rw);
+
+	part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_start_io_acct);
+
+void generic_end_io_acct(int rw, struct hd_struct *part,
+			 unsigned long start_time)
+{
+	unsigned long duration = jiffies - start_time;
+	int cpu = part_stat_lock();
+
+	part_stat_add(cpu, part, ticks[rw], duration);
+	part_round_stats(cpu, part);
+	part_dec_in_flight(part, rw);
+
+	part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_end_io_acct);
+
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 void bio_flush_dcache_pages(struct bio *bi)
 {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7347f486ceca..efead0b532c4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
+void generic_start_io_acct(int rw, unsigned long sectors,
+			   struct hd_struct *part);
+void generic_end_io_acct(int rw, struct hd_struct *part,
+			 unsigned long start_time);
+
 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 # error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
-- 
cgit v1.2.3