From f72b8792d180948b4b3898374998f5ac8c02e539 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 24 Aug 2016 15:51:50 -0600
Subject: workqueue: add cancel_work()

Like cancel_delayed_work(), but for regular work.

Signed-off-by: Jens Axboe <axboe@fb.com>
Mehed-by: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 26cc1df280d6..fc6e22186405 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -442,6 +442,7 @@ extern int schedule_on_each_cpu(work_func_t func);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern bool flush_work(struct work_struct *work);
+extern bool cancel_work(struct work_struct *work);
 extern bool cancel_work_sync(struct work_struct *work);
 
 extern bool flush_delayed_work(struct delayed_work *dwork);
-- 
cgit v1.2.3


From ee63cfa7fc197b63669623721b8009cce5b0659b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 24 Aug 2016 15:52:48 -0600
Subject: block: add kblockd_schedule_work_on()

Add a helper to schedule a regular struct work on a particular CPU.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 6 ++++++
 include/linux/blkdev.h | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 36c7ac328d8c..2d08597533a4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
+int kblockd_schedule_work_on(int cpu, struct work_struct *work)
+{
+	return queue_work_on(cpu, kblockd_workqueue, work);
+}
+EXPORT_SYMBOL(kblockd_schedule_work_on);
+
 int kblockd_schedule_delayed_work(struct delayed_work *dwork,
 				  unsigned long delay)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e79055c8b577..69aae720f4ef 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1440,8 +1440,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 	return bio_will_gap(req->q, bio, req->bio);
 }
 
-struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
 int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
-- 
cgit v1.2.3


From 27489a3c827b7eebba26eda0320bb0f100bef167 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 24 Aug 2016 15:54:25 -0600
Subject: blk-mq: turn hctx->run_work into a regular work struct

We don't need the larger delayed work struct, since we always run it
immediately.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 2 +-
 block/blk-mq.c         | 9 ++++-----
 include/linux/blk-mq.h | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 2d08597533a4..34ff8088eebe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q)
 		int i;
 
 		queue_for_each_hw_ctx(q, hctx, i) {
-			cancel_delayed_work_sync(&hctx->run_work);
+			cancel_work_sync(&hctx->run_work);
 			cancel_delayed_work_sync(&hctx->delay_work);
 		}
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 13f5a6c1de76..b68fdcbe58f6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -936,8 +936,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		put_cpu();
 	}
 
-	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-			&hctx->run_work, 0);
+	kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
 }
 
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
@@ -958,7 +957,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	cancel_delayed_work(&hctx->run_work);
+	cancel_work(&hctx->run_work);
 	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
@@ -1011,7 +1010,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 {
 	struct blk_mq_hw_ctx *hctx;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+	hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
 
 	__blk_mq_run_hw_queue(hctx);
 }
@@ -1722,7 +1721,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (node == NUMA_NO_NODE)
 		node = hctx->numa_node = set->numa_node;
 
-	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+	INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
 	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e43bbffb5b7a..d579252e6463 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -25,7 +25,7 @@ struct blk_mq_hw_ctx {
 	} ____cacheline_aligned_in_smp;
 
 	unsigned long		state;		/* BLK_MQ_S_* flags */
-	struct delayed_work	run_work;
+	struct work_struct	run_work;
 	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
-- 
cgit v1.2.3


From 8d354f133e86dd03ea7885a91df398c55ff699ff Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 25 Aug 2016 08:00:28 -0600
Subject: blk-mq: improve layout of blk_mq_hw_ctx

Various cache line optimizations:

- Move delay_work towards the end. It's huge, and we don't use it
  a lot (only SCSI).

- Move the atomic state into the same cacheline as the the dispatch
  list and lock.

- Rearrange a few members to pack it better.

- Shrink the max-order for dispatch accounting from 10 to 7. This
  means that ->dispatched[] and ->run now take up their own
  cacheline.

This shrinks struct blk_mq_hw_ctx down to 8 cachelines.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d579252e6463..e1544f0f8c21 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,11 +22,10 @@ struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
 		struct list_head	dispatch;
+		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
 
-	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct work_struct	run_work;
-	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -40,8 +39,8 @@ struct blk_mq_hw_ctx {
 
 	struct blk_mq_ctxmap	ctx_map;
 
-	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
+	unsigned int		nr_ctx;
 
 	atomic_t		wait_index;
 
@@ -49,7 +48,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		queued;
 	unsigned long		run;
-#define BLK_MQ_MAX_DISPATCH_ORDER	10
+#define BLK_MQ_MAX_DISPATCH_ORDER	7
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		numa_node;
@@ -57,6 +56,8 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
+	struct delayed_work	delay_work;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 
-- 
cgit v1.2.3


From 6e219353afa1f67f453141f7462b01708ebf5574 Mon Sep 17 00:00:00 2001
From: Stephen Bates <sbates@raithlin.com>
Date: Tue, 13 Sep 2016 12:23:15 -0600
Subject: block: add poll_considered statistic

In order to help determine the effectiveness of polling in a running
system it is usful to determine the ratio of how often the poll
function is called vs how often the completion is checked. For this
reason we add a poll_considered variable and add it to the sysfs entry
for io_poll.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 8 ++++++--
 block/blk-mq-sysfs.c   | 4 +++-
 include/linux/blk-mq.h | 1 +
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 34ff8088eebe..14d7c0740dc0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3307,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
 {
 	struct blk_plug *plug;
 	long state;
+	unsigned int queue_num;
+	struct blk_mq_hw_ctx *hctx;
 
 	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		return false;
 
+	queue_num = blk_qc_t_to_queue_num(cookie);
+	hctx = q->queue_hw_ctx[queue_num];
+	hctx->poll_considered++;
+
 	plug = current->plug;
 	if (plug)
 		blk_flush_plug_list(plug, false);
 
 	state = current->state;
 	while (!need_resched()) {
-		unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
-		struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
 		int ret;
 
 		hctx->poll_invoked++;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index fe822aa5b8e4..ea8c3f58afbd 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -176,7 +176,9 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
 
 static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-	return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success);
+	return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n",
+		       hctx->poll_considered, hctx->poll_invoked,
+		       hctx->poll_success);
 }
 
 static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e1544f0f8c21..7710f795d7c2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -61,6 +61,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 
+	unsigned long		poll_considered;
 	unsigned long		poll_invoked;
 	unsigned long		poll_success;
 };
-- 
cgit v1.2.3


From abe47114b192a9e0167905a3418d815b4fcf87de Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 14 Sep 2016 14:33:15 +0200
Subject: block: remove blk_mq_alloc_single_hw_queue() prototype

The blk_mq_alloc_single_hw_queue() is a prototype artifact that
should have been removed with
commit cdef54dd85ad66e77262ea57796a3e81683dd5d6
"blk-mq: remove alloc_hctx and free_hctx methods" where the last
users of it were deleted.

Fixes: cdef54dd85ad ("blk-mq: remove alloc_hctx and free_hctx methods")
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7710f795d7c2..ff14f68067aa 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -223,7 +223,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 }
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
 int blk_mq_request_started(struct request *rq);
 void blk_mq_start_request(struct request *rq);
-- 
cgit v1.2.3


From 637ca77bd1f7950538956c61dcd0c2e559905dbf Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 14 Sep 2016 10:44:12 +0200
Subject: block: Document that bio_op() uses the data type of bio.bi_opf

Make it clear that the sizeof(unsigned int) expression in BIO_OP_SHIFT
refers to the bi_opf member of struct bio.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 436f43f87da9..1e1ef210ae91 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -89,7 +89,7 @@ struct bio {
 	struct bio_vec		bi_inline_vecs[0];
 };
 
-#define BIO_OP_SHIFT	(8 * sizeof(unsigned int) - REQ_OP_BITS)
+#define BIO_OP_SHIFT	(8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS)
 #define bio_op(bio)	((bio)->bi_opf >> BIO_OP_SHIFT)
 
 #define bio_set_op_attrs(bio, op, op_flags) do {		\
-- 
cgit v1.2.3


From 4382e33ad374862eacf62003bb02c750391ada05 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 14 Sep 2016 10:45:36 +0200
Subject: block, dm-crypt, btrfs: Introduce bio_flags()

Introduce the bio_flags() macro. Ensure that the second argument of
bio_set_op_attrs() only contains flags and no operation. This patch
does not change any functionality.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Chris Mason <clm@fb.com> (maintainer:BTRFS FILE SYSTEM)
Cc: Josef Bacik <jbacik@fb.com> (maintainer:BTRFS FILE SYSTEM)
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-crypt.c     | 2 +-
 fs/btrfs/inode.c          | 5 +++--
 include/linux/blk_types.h | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index eedba67b0e3e..9ba0f0724d28 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_private = io;
 	clone->bi_end_io  = crypt_endio;
 	clone->bi_bdev    = cc->dev->bdev;
-	bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf);
+	bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio));
 }
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6811c42e41e..ca01106795ea 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	if (!bio)
 		return -ENOMEM;
 
-	bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+	bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
 	bio->bi_private = dip;
 	bio->bi_end_io = btrfs_end_dio_bio;
 	btrfs_io_bio(bio)->logical = file_offset;
@@ -8450,7 +8450,8 @@ next_block:
 						  start_sector, GFP_NOFS);
 			if (!bio)
 				goto out_err;
-			bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+			bio_set_op_attrs(bio, bio_op(orig_bio),
+					 bio_flags(orig_bio));
 			bio->bi_private = dip;
 			bio->bi_end_io = btrfs_end_dio_bio;
 			btrfs_io_bio(bio)->logical = file_offset;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1e1ef210ae91..311fa2f478b8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -90,11 +90,12 @@ struct bio {
 };
 
 #define BIO_OP_SHIFT	(8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS)
+#define bio_flags(bio)	((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1))
 #define bio_op(bio)	((bio)->bi_opf >> BIO_OP_SHIFT)
 
 #define bio_set_op_attrs(bio, op, op_flags) do {		\
 	WARN_ON(op >= (1 << REQ_OP_BITS));			\
-	(bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1);		\
+	(bio)->bi_opf = bio_flags(bio);				\
 	(bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT);	\
 	(bio)->bi_opf |= op_flags;				\
 } while (0)
-- 
cgit v1.2.3


From 3e1de31b9bf608c5b35e2d0d134eb87f2a9ba4ae Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 14 Sep 2016 10:46:22 +0200
Subject: block: Improve bio_set_op_attrs() robustness

Since REQ_OP_BITS == 3 and __REQ_NR_BITS == 30 it is not that hard
to pass an op_flags argument to bio_set_op_attrs() that is larger
than the number of bits reserved for the op_flags argument. Complain
if this happens. Additionally, ensure that negative arguments trigger
a complaint (1 << ... is signed while 1U << ... is unsigned; adding
0U to an integer expression causes it to be promoted to an unsigned
type).

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 311fa2f478b8..53ee1a2acd4f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -93,11 +93,18 @@ struct bio {
 #define bio_flags(bio)	((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1))
 #define bio_op(bio)	((bio)->bi_opf >> BIO_OP_SHIFT)
 
-#define bio_set_op_attrs(bio, op, op_flags) do {		\
-	WARN_ON(op >= (1 << REQ_OP_BITS));			\
-	(bio)->bi_opf = bio_flags(bio);				\
-	(bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT);	\
-	(bio)->bi_opf |= op_flags;				\
+#define bio_set_op_attrs(bio, op, op_flags) do {			\
+	if (__builtin_constant_p(op))					\
+		BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS));		\
+	else								\
+		WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS));		\
+	if (__builtin_constant_p(op_flags))				\
+		BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT));	\
+	else								\
+		WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT));	\
+	(bio)->bi_opf = bio_flags(bio);					\
+	(bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT);			\
+	(bio)->bi_opf |= (op_flags);					\
 } while (0)
 
 #define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
-- 
cgit v1.2.3


From 3f7c624aa58f769e0313ca3310704c5d88ac99ce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 11 Sep 2016 16:03:02 +0200
Subject: block: remove bio_destructor_t

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 53ee1a2acd4f..cd395ecec99d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -16,7 +16,6 @@ struct block_device;
 struct io_context;
 struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
-typedef void (bio_destructor_t) (struct bio *);
 
 #ifdef CONFIG_BLOCK
 /*
-- 
cgit v1.2.3


From fc95db3edeaf924e9ad16592d9c1b06c730a49c9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 11 Sep 2016 16:03:03 +0200
Subject: bio.h: remove a very outdated comment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 23ddf4b46a9b..e00721a2dce1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -1,6 +1,4 @@
 /*
- * 2.5 block I/O model
- *
  * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
  *
  * This program is free software; you can redistribute it and/or modify
-- 
cgit v1.2.3


From c5c5ca777469f0ff854f1da0aff9b3a9051b3ef7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 11 Sep 2016 16:03:04 +0200
Subject: block: remove IOPRIO_BITS

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/ioprio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index beb9ce1c2c23..8c1239020d79 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -7,7 +7,6 @@
 /*
  * Gives us 8 prio classes with 13-bits of data for each class
  */
-#define IOPRIO_BITS		(16)
 #define IOPRIO_CLASS_SHIFT	(13)
 #define IOPRIO_PRIO_MASK	((1UL << IOPRIO_CLASS_SHIFT) - 1)
 
-- 
cgit v1.2.3


From 2849450ad39d2e699fda2d5c6f41e05d87fd7004 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 14 Sep 2016 13:28:30 -0400
Subject: blk-mq: introduce blk_mq_delay_kick_requeue_list()

blk_mq_delay_kick_requeue_list() provides the ability to kick the
q->requeue_list after a specified time.  To do this the request_queue's
'requeue_work' member was changed to a delayed_work.

blk_mq_delay_kick_requeue_list() allows DM to defer processing requeued
requests while it doesn't make sense to immediately requeue them
(e.g. when all paths in a DM multipath have failed).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 16 ++++++++++++----
 include/linux/blk-mq.h |  1 +
 include/linux/blkdev.h |  2 +-
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index eea0d230faa1..7ddc7969fba4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -502,7 +502,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request);
 static void blk_mq_requeue_work(struct work_struct *work)
 {
 	struct request_queue *q =
-		container_of(work, struct request_queue, requeue_work);
+		container_of(work, struct request_queue, requeue_work.work);
 	LIST_HEAD(rq_list);
 	struct request *rq, *next;
 	unsigned long flags;
@@ -557,16 +557,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_cancel_requeue_work(struct request_queue *q)
 {
-	cancel_work_sync(&q->requeue_work);
+	cancel_delayed_work_sync(&q->requeue_work);
 }
 EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	kblockd_schedule_work(&q->requeue_work);
+	kblockd_schedule_delayed_work(&q->requeue_work, 0);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
+void blk_mq_delay_kick_requeue_list(struct request_queue *q,
+				    unsigned long msecs)
+{
+	kblockd_schedule_delayed_work(&q->requeue_work,
+				      msecs_to_jiffies(msecs));
+}
+EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
+
 void blk_mq_abort_requeue_list(struct request_queue *q)
 {
 	unsigned long flags;
@@ -2084,7 +2092,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	q->sg_reserved_size = INT_MAX;
 
-	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&q->requeue_list);
 	spin_lock_init(&q->requeue_lock);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ff14f68067aa..60ef14cbcd2d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -233,6 +233,7 @@ void blk_mq_requeue_request(struct request *rq);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
 void blk_mq_cancel_requeue_work(struct request_queue *q);
 void blk_mq_kick_requeue_list(struct request_queue *q);
+void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_abort_requeue_list(struct request_queue *q);
 void blk_mq_complete_request(struct request *rq, int error);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 69aae720f4ef..c47c358ba052 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -449,7 +449,7 @@ struct request_queue {
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-	struct work_struct	requeue_work;
+	struct delayed_work	requeue_work;
 
 	struct mutex		sysfs_lock;
 
-- 
cgit v1.2.3


From 88459642cba452630326b9cab1c651e09577d4e4 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Sat, 17 Sep 2016 08:38:44 -0600
Subject: blk-mq: abstract tag allocation out into sbitmap library

This is a generally useful data structure, so make it available to
anyone else who might want to use it. It's also a nice cleanup
separating the allocation logic from the rest of the tag handling logic.

The code is behind a new Kconfig option, CONFIG_SBITMAP, which is only
selected by CONFIG_BLOCK for now.

This should be a complete noop functionality-wise.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 MAINTAINERS             |   1 +
 block/Kconfig           |   1 +
 block/blk-mq-tag.c      | 463 ++++++++++++------------------------------------
 block/blk-mq-tag.h      |  37 ++--
 block/blk-mq.c          | 112 ++++--------
 block/blk-mq.h          |   9 -
 include/linux/blk-mq.h  |   9 +-
 include/linux/sbitmap.h | 327 ++++++++++++++++++++++++++++++++++
 lib/Kconfig             |   3 +
 lib/Makefile            |   2 +
 lib/sbitmap.c           | 301 +++++++++++++++++++++++++++++++
 11 files changed, 789 insertions(+), 476 deletions(-)
 create mode 100644 include/linux/sbitmap.h
 create mode 100644 lib/sbitmap.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 71aa5daeae8f..157b1ca3e19d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2451,6 +2451,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 S:	Maintained
 F:	block/
 F:	kernel/trace/blktrace.c
+F:	lib/sbitmap.c
 
 BLOCK2MTD DRIVER
 M:	Joern Engel <joern@lazybastard.org>
diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..5136ad4bb6d5 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
 menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
+       select SBITMAP
        help
 	 Provide block layer support for the kernel.
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 729bac3a673b..2cbdecd594e9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,12 +1,7 @@
 /*
- * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
- * over multiple cachelines to avoid ping-pong between multiple submitters
- * or submitter and completer. Uses rolling wakeups to avoid falling of
- * the scaling cliff when we run out of tags and have to start putting
- * submitters to sleep.
- *
- * Uses active queue tracking to support fairer distribution of tags
- * between multiple submitters when a shared tag map is used.
+ * Tag allocation using scalable bitmaps. Uses active queue tracking to support
+ * fairer distribution of tags between multiple submitters when a shared tag map
+ * is used.
  *
  * Copyright (C) 2013-2014 Jens Axboe
  */
@@ -19,40 +14,12 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
-{
-	int i;
-
-	for (i = 0; i < bt->map_nr; i++) {
-		struct blk_align_bitmap *bm = &bt->map[i];
-		int ret;
-
-		ret = find_first_zero_bit(&bm->word, bm->depth);
-		if (ret < bm->depth)
-			return true;
-	}
-
-	return false;
-}
-
 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
 	if (!tags)
 		return true;
 
-	return bt_has_free_tags(&tags->bitmap_tags);
-}
-
-static inline int bt_index_inc(int index)
-{
-	return (index + 1) & (BT_WAIT_QUEUES - 1);
-}
-
-static inline void bt_index_atomic_inc(atomic_t *index)
-{
-	int old = atomic_read(index);
-	int new = bt_index_inc(old);
-	atomic_cmpxchg(index, old, new);
+	return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
 }
 
 /*
@@ -72,29 +39,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
  */
 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 {
-	struct blk_mq_bitmap_tags *bt;
-	int i, wake_index;
-
-	/*
-	 * Make sure all changes prior to this are visible from other CPUs.
-	 */
-	smp_mb();
-	bt = &tags->bitmap_tags;
-	wake_index = atomic_read(&bt->wake_index);
-	for (i = 0; i < BT_WAIT_QUEUES; i++) {
-		struct bt_wait_state *bs = &bt->bs[wake_index];
-
-		if (waitqueue_active(&bs->wait))
-			wake_up(&bs->wait);
-
-		wake_index = bt_index_inc(wake_index);
-	}
-
-	if (include_reserve) {
-		bt = &tags->breserved_tags;
-		if (waitqueue_active(&bt->bs[0].wait))
-			wake_up(&bt->bs[0].wait);
-	}
+	sbitmap_queue_wake_all(&tags->bitmap_tags);
+	if (include_reserve)
+		sbitmap_queue_wake_all(&tags->breserved_tags);
 }
 
 /*
@@ -118,7 +65,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
  * and attempt to provide a fair share of the tag depth for each of them.
  */
 static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
-				  struct blk_mq_bitmap_tags *bt)
+				  struct sbitmap_queue *bt)
 {
 	unsigned int depth, users;
 
@@ -130,7 +77,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	/*
 	 * Don't try dividing an ant
 	 */
-	if (bt->depth == 1)
+	if (bt->sb.depth == 1)
 		return true;
 
 	users = atomic_read(&hctx->tags->active_queues);
@@ -140,127 +87,42 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	/*
 	 * Allow at least some tags
 	 */
-	depth = max((bt->depth + users - 1) / users, 4U);
+	depth = max((bt->sb.depth + users - 1) / users, 4U);
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
-			 bool nowrap)
-{
-	int tag, org_last_tag = last_tag;
-
-	while (1) {
-		tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
-		if (unlikely(tag >= bm->depth)) {
-			/*
-			 * We started with an offset, and we didn't reset the
-			 * offset to 0 in a failure case, so start from 0 to
-			 * exhaust the map.
-			 */
-			if (org_last_tag && last_tag && !nowrap) {
-				last_tag = org_last_tag = 0;
-				continue;
-			}
-			return -1;
-		}
-
-		if (!test_and_set_bit(tag, &bm->word))
-			break;
-
-		last_tag = tag + 1;
-		if (last_tag >= bm->depth - 1)
-			last_tag = 0;
-	}
-
-	return tag;
-}
-
 #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
 
-/*
- * Straight forward bitmap tag implementation, where each bit is a tag
- * (cleared == free, and set == busy). The small twist is using per-cpu
- * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
- * contexts. This enables us to drastically limit the space searched,
- * without dirtying an extra shared cacheline like we would if we stored
- * the cache value inside the shared blk_mq_bitmap_tags structure. On top
- * of that, each word of tags is in a separate cacheline. This means that
- * multiple users will tend to stick to different cachelines, at least
- * until the map is exhausted.
- */
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		    unsigned int *tag_cache, struct blk_mq_tags *tags)
 {
-	unsigned int last_tag, org_last_tag;
-	int index, i, tag;
+	unsigned int last_tag;
+	int tag;
 
 	if (!hctx_may_queue(hctx, bt))
 		return -1;
 
-	last_tag = org_last_tag = *tag_cache;
-	index = TAG_TO_INDEX(bt, last_tag);
+	last_tag = *tag_cache;
+	tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags));
 
-	for (i = 0; i < bt->map_nr; i++) {
-		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
-				    BT_ALLOC_RR(tags));
-		if (tag != -1) {
-			tag += (index << bt->bits_per_word);
-			goto done;
-		}
-
-		/*
-		 * Jump to next index, and reset the last tag to be the
-		 * first tag of that index
-		 */
-		index++;
-		last_tag = (index << bt->bits_per_word);
-
-		if (index >= bt->map_nr) {
-			index = 0;
-			last_tag = 0;
-		}
-	}
-
-	*tag_cache = 0;
-	return -1;
-
-	/*
-	 * Only update the cache from the allocation path, if we ended
-	 * up using the specific cached tag.
-	 */
-done:
-	if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
+	if (tag == -1) {
+		*tag_cache = 0;
+	} else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) {
 		last_tag = tag + 1;
-		if (last_tag >= bt->depth - 1)
+		if (last_tag >= bt->sb.depth - 1)
 			last_tag = 0;
-
 		*tag_cache = last_tag;
 	}
 
 	return tag;
 }
 
-static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
-					 struct blk_mq_hw_ctx *hctx)
-{
-	struct bt_wait_state *bs;
-	int wait_index;
-
-	if (!hctx)
-		return &bt->bs[0];
-
-	wait_index = atomic_read(&hctx->wait_index);
-	bs = &bt->bs[wait_index];
-	bt_index_atomic_inc(&hctx->wait_index);
-	return bs;
-}
-
 static int bt_get(struct blk_mq_alloc_data *data,
-		struct blk_mq_bitmap_tags *bt,
-		struct blk_mq_hw_ctx *hctx,
-		unsigned int *last_tag, struct blk_mq_tags *tags)
+		  struct sbitmap_queue *bt,
+		  struct blk_mq_hw_ctx *hctx,
+		  unsigned int *last_tag, struct blk_mq_tags *tags)
 {
-	struct bt_wait_state *bs;
+	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
 	int tag;
 
@@ -271,9 +133,9 @@ static int bt_get(struct blk_mq_alloc_data *data,
 	if (data->flags & BLK_MQ_REQ_NOWAIT)
 		return -1;
 
-	bs = bt_wait_ptr(bt, hctx);
+	ws = bt_wait_ptr(bt, hctx);
 	do {
-		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __bt_get(hctx, bt, last_tag, tags);
 		if (tag != -1)
@@ -310,11 +172,11 @@ static int bt_get(struct blk_mq_alloc_data *data,
 			hctx = data->hctx;
 			bt = &hctx->tags->bitmap_tags;
 		}
-		finish_wait(&bs->wait, &wait);
-		bs = bt_wait_ptr(bt, hctx);
+		finish_wait(&ws->wait, &wait);
+		ws = bt_wait_ptr(bt, hctx);
 	} while (1);
 
-	finish_wait(&bs->wait, &wait);
+	finish_wait(&ws->wait, &wait);
 	return tag;
 }
 
@@ -354,53 +216,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return __blk_mq_get_tag(data);
 }
 
-static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
-{
-	int i, wake_index;
-
-	wake_index = atomic_read(&bt->wake_index);
-	for (i = 0; i < BT_WAIT_QUEUES; i++) {
-		struct bt_wait_state *bs = &bt->bs[wake_index];
-
-		if (waitqueue_active(&bs->wait)) {
-			int o = atomic_read(&bt->wake_index);
-			if (wake_index != o)
-				atomic_cmpxchg(&bt->wake_index, o, wake_index);
-
-			return bs;
-		}
-
-		wake_index = bt_index_inc(wake_index);
-	}
-
-	return NULL;
-}
-
-static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
-{
-	const int index = TAG_TO_INDEX(bt, tag);
-	struct bt_wait_state *bs;
-	int wait_cnt;
-
-	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
-
-	/* Ensure that the wait list checks occur after clear_bit(). */
-	smp_mb();
-
-	bs = bt_wake_ptr(bt);
-	if (!bs)
-		return;
-
-	wait_cnt = atomic_dec_return(&bs->wait_cnt);
-	if (unlikely(wait_cnt < 0))
-		wait_cnt = atomic_inc_return(&bs->wait_cnt);
-	if (wait_cnt == 0) {
-		atomic_add(bt->wake_cnt, &bs->wait_cnt);
-		bt_index_atomic_inc(&bt->wake_index);
-		wake_up(&bs->wait);
-	}
-}
-
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		    unsigned int *last_tag)
 {
@@ -410,67 +225,94 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);
-		bt_clear_tag(&tags->bitmap_tags, real_tag);
+		sbitmap_queue_clear(&tags->bitmap_tags, real_tag);
 		if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
 			*last_tag = real_tag;
 	} else {
 		BUG_ON(tag >= tags->nr_reserved_tags);
-		bt_clear_tag(&tags->breserved_tags, tag);
+		sbitmap_queue_clear(&tags->breserved_tags, tag);
 	}
 }
 
-static void bt_for_each(struct blk_mq_hw_ctx *hctx,
-		struct blk_mq_bitmap_tags *bt, unsigned int off,
-		busy_iter_fn *fn, void *data, bool reserved)
+struct bt_iter_data {
+	struct blk_mq_hw_ctx *hctx;
+	busy_iter_fn *fn;
+	void *data;
+	bool reserved;
+};
+
+static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
+	struct bt_iter_data *iter_data = data;
+	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
+	struct blk_mq_tags *tags = hctx->tags;
+	bool reserved = iter_data->reserved;
 	struct request *rq;
-	int bit, i;
 
-	for (i = 0; i < bt->map_nr; i++) {
-		struct blk_align_bitmap *bm = &bt->map[i];
+	if (!reserved)
+		bitnr += tags->nr_reserved_tags;
+	rq = tags->rqs[bitnr];
 
-		for (bit = find_first_bit(&bm->word, bm->depth);
-		     bit < bm->depth;
-		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
-			rq = hctx->tags->rqs[off + bit];
-			if (rq->q == hctx->queue)
-				fn(hctx, rq, data, reserved);
-		}
+	if (rq->q == hctx->queue)
+		iter_data->fn(hctx, rq, iter_data->data, reserved);
+	return true;
+}
 
-		off += (1 << bt->bits_per_word);
-	}
+static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
+			busy_iter_fn *fn, void *data, bool reserved)
+{
+	struct bt_iter_data iter_data = {
+		.hctx = hctx,
+		.fn = fn,
+		.data = data,
+		.reserved = reserved,
+	};
+
+	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
 }
 
-static void bt_tags_for_each(struct blk_mq_tags *tags,
-		struct blk_mq_bitmap_tags *bt, unsigned int off,
-		busy_tag_iter_fn *fn, void *data, bool reserved)
+struct bt_tags_iter_data {
+	struct blk_mq_tags *tags;
+	busy_tag_iter_fn *fn;
+	void *data;
+	bool reserved;
+};
+
+static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
+	struct bt_tags_iter_data *iter_data = data;
+	struct blk_mq_tags *tags = iter_data->tags;
+	bool reserved = iter_data->reserved;
 	struct request *rq;
-	int bit, i;
 
-	if (!tags->rqs)
-		return;
-	for (i = 0; i < bt->map_nr; i++) {
-		struct blk_align_bitmap *bm = &bt->map[i];
-
-		for (bit = find_first_bit(&bm->word, bm->depth);
-		     bit < bm->depth;
-		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
-			rq = tags->rqs[off + bit];
-			fn(rq, data, reserved);
-		}
+	if (!reserved)
+		bitnr += tags->nr_reserved_tags;
+	rq = tags->rqs[bitnr];
 
-		off += (1 << bt->bits_per_word);
-	}
+	iter_data->fn(rq, iter_data->data, reserved);
+	return true;
+}
+
+static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
+			     busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct bt_tags_iter_data iter_data = {
+		.tags = tags,
+		.fn = fn,
+		.data = data,
+		.reserved = reserved,
+	};
+
+	if (tags->rqs)
+		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
 }
 
 static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
 		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
-		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
-	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
-			false);
+		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
 }
 
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@ -529,107 +371,20 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 			continue;
 
 		if (tags->nr_reserved_tags)
-			bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
-		bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
-		      false);
-	}
-
-}
-
-static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
-{
-	unsigned int i, used;
-
-	for (i = 0, used = 0; i < bt->map_nr; i++) {
-		struct blk_align_bitmap *bm = &bt->map[i];
-
-		used += bitmap_weight(&bm->word, bm->depth);
+			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
 	}
 
-	return bt->depth - used;
 }
 
-static void bt_update_count(struct blk_mq_bitmap_tags *bt,
-			    unsigned int depth)
+static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
 {
-	unsigned int tags_per_word = 1U << bt->bits_per_word;
-	unsigned int map_depth = depth;
-
-	if (depth) {
-		int i;
-
-		for (i = 0; i < bt->map_nr; i++) {
-			bt->map[i].depth = min(map_depth, tags_per_word);
-			map_depth -= bt->map[i].depth;
-		}
-	}
-
-	bt->wake_cnt = BT_WAIT_BATCH;
-	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
-		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
-
-	bt->depth = depth;
+	return bt->sb.depth - sbitmap_weight(&bt->sb);
 }
 
-static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
-			int node, bool reserved)
+static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node)
 {
-	int i;
-
-	bt->bits_per_word = ilog2(BITS_PER_LONG);
-
-	/*
-	 * Depth can be zero for reserved tags, that's not a failure
-	 * condition.
-	 */
-	if (depth) {
-		unsigned int nr, tags_per_word;
-
-		tags_per_word = (1 << bt->bits_per_word);
-
-		/*
-		 * If the tag space is small, shrink the number of tags
-		 * per word so we spread over a few cachelines, at least.
-		 * If less than 4 tags, just forget about it, it's not
-		 * going to work optimally anyway.
-		 */
-		if (depth >= 4) {
-			while (tags_per_word * 4 > depth) {
-				bt->bits_per_word--;
-				tags_per_word = (1 << bt->bits_per_word);
-			}
-		}
-
-		nr = ALIGN(depth, tags_per_word) / tags_per_word;
-		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
-						GFP_KERNEL, node);
-		if (!bt->map)
-			return -ENOMEM;
-
-		bt->map_nr = nr;
-	}
-
-	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
-	if (!bt->bs) {
-		kfree(bt->map);
-		bt->map = NULL;
-		return -ENOMEM;
-	}
-
-	bt_update_count(bt, depth);
-
-	for (i = 0; i < BT_WAIT_QUEUES; i++) {
-		init_waitqueue_head(&bt->bs[i].wait);
-		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
-	}
-
-	return 0;
-}
-
-static void bt_free(struct blk_mq_bitmap_tags *bt)
-{
-	kfree(bt->map);
-	kfree(bt->bs);
+	return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node);
 }
 
 static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
@@ -639,14 +394,15 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
 
 	tags->alloc_policy = alloc_policy;
 
-	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
-		goto enomem;
-	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
-		goto enomem;
+	if (bt_alloc(&tags->bitmap_tags, depth, node))
+		goto free_tags;
+	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node))
+		goto free_bitmap_tags;
 
 	return tags;
-enomem:
-	bt_free(&tags->bitmap_tags);
+free_bitmap_tags:
+	sbitmap_queue_free(&tags->bitmap_tags);
+free_tags:
 	kfree(tags);
 	return NULL;
 }
@@ -679,8 +435,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	bt_free(&tags->bitmap_tags);
-	bt_free(&tags->breserved_tags);
+	sbitmap_queue_free(&tags->bitmap_tags);
+	sbitmap_queue_free(&tags->breserved_tags);
 	free_cpumask_var(tags->cpumask);
 	kfree(tags);
 }
@@ -702,7 +458,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
 	 * Don't need (or can't) update reserved tags here, they remain
 	 * static and should never need resizing.
 	 */
-	bt_update_count(&tags->bitmap_tags, tdepth);
+	sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+
 	blk_mq_tag_wakeup_all(tags, false);
 	return 0;
 }
@@ -746,7 +503,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
 			"bits_per_word=%u\n",
 			tags->nr_tags, tags->nr_reserved_tags,
-			tags->bitmap_tags.bits_per_word);
+			1U << tags->bitmap_tags.sb.shift);
 
 	free = bt_unused_tags(&tags->bitmap_tags);
 	res = bt_unused_tags(&tags->breserved_tags);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d468a79f2c4a..3215c08c63cc 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -3,31 +3,6 @@
 
 #include "blk-mq.h"
 
-enum {
-	BT_WAIT_QUEUES	= 8,
-	BT_WAIT_BATCH	= 8,
-};
-
-struct bt_wait_state {
-	atomic_t wait_cnt;
-	wait_queue_head_t wait;
-} ____cacheline_aligned_in_smp;
-
-#define TAG_TO_INDEX(bt, tag)	((tag) >> (bt)->bits_per_word)
-#define TAG_TO_BIT(bt, tag)	((tag) & ((1 << (bt)->bits_per_word) - 1))
-
-struct blk_mq_bitmap_tags {
-	unsigned int depth;
-	unsigned int wake_cnt;
-	unsigned int bits_per_word;
-
-	unsigned int map_nr;
-	struct blk_align_bitmap *map;
-
-	atomic_t wake_index;
-	struct bt_wait_state *bs;
-};
-
 /*
  * Tag address space map.
  */
@@ -37,8 +12,8 @@ struct blk_mq_tags {
 
 	atomic_t active_queues;
 
-	struct blk_mq_bitmap_tags bitmap_tags;
-	struct blk_mq_bitmap_tags breserved_tags;
+	struct sbitmap_queue bitmap_tags;
+	struct sbitmap_queue breserved_tags;
 
 	struct request **rqs;
 	struct list_head page_list;
@@ -61,6 +36,14 @@ extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
 
+static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
+						 struct blk_mq_hw_ctx *hctx)
+{
+	if (!hctx)
+		return &bt->ws[0];
+	return sbq_wait_ptr(bt, &hctx->wait_index);
+}
+
 enum {
 	BLK_MQ_TAG_CACHE_MIN	= 1,
 	BLK_MQ_TAG_CACHE_MAX	= 64,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0cb93625d812..6603be18064e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,42 +41,23 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-	unsigned int i;
-
-	for (i = 0; i < hctx->ctx_map.size; i++)
-		if (hctx->ctx_map.map[i].word)
-			return true;
-
-	return false;
-}
-
-static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
-					      struct blk_mq_ctx *ctx)
-{
-	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+	return sbitmap_any_bit_set(&hctx->ctx_map);
 }
 
-#define CTX_TO_BIT(hctx, ctx)	\
-	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
-
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
-	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
-		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
+		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 				      struct blk_mq_ctx *ctx)
 {
-	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
-	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
 }
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
@@ -755,38 +736,36 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 	return false;
 }
 
+struct flush_busy_ctx_data {
+	struct blk_mq_hw_ctx *hctx;
+	struct list_head *list;
+};
+
+static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
+{
+	struct flush_busy_ctx_data *flush_data = data;
+	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
+	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+	sbitmap_clear_bit(sb, bitnr);
+	spin_lock(&ctx->lock);
+	list_splice_tail_init(&ctx->rq_list, flush_data->list);
+	spin_unlock(&ctx->lock);
+	return true;
+}
+
 /*
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
-	struct blk_mq_ctx *ctx;
-	int i;
-
-	for (i = 0; i < hctx->ctx_map.size; i++) {
-		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
-		unsigned int off, bit;
-
-		if (!bm->word)
-			continue;
-
-		bit = 0;
-		off = i * hctx->ctx_map.bits_per_word;
-		do {
-			bit = find_next_bit(&bm->word, bm->depth, bit);
-			if (bit >= bm->depth)
-				break;
-
-			ctx = hctx->ctxs[bit + off];
-			clear_bit(bit, &bm->word);
-			spin_lock(&ctx->lock);
-			list_splice_tail_init(&ctx->rq_list, list);
-			spin_unlock(&ctx->lock);
+	struct flush_busy_ctx_data data = {
+		.hctx = hctx,
+		.list = list,
+	};
 
-			bit++;
-		} while (1);
-	}
+	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 }
 
 static inline unsigned int queued_to_index(unsigned int queued)
@@ -1609,32 +1588,6 @@ fail:
 	return NULL;
 }
 
-static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
-{
-	kfree(bitmap->map);
-}
-
-static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
-{
-	unsigned int bpw = 8, total, num_maps, i;
-
-	bitmap->bits_per_word = bpw;
-
-	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
-	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
-					GFP_KERNEL, node);
-	if (!bitmap->map)
-		return -ENOMEM;
-
-	total = nr_cpu_ids;
-	for (i = 0; i < num_maps; i++) {
-		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
-		total -= bitmap->map[i].depth;
-	}
-
-	return 0;
-}
-
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -1700,7 +1653,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 
 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 	blk_free_flush_queue(hctx->fq);
-	blk_mq_free_bitmap(&hctx->ctx_map);
+	sbitmap_free(&hctx->ctx_map);
 }
 
 static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -1760,7 +1713,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (!hctx->ctxs)
 		goto unregister_cpu_notifier;
 
-	if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
+			      node))
 		goto free_ctxs;
 
 	hctx->nr_ctx = 0;
@@ -1787,7 +1741,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
  free_bitmap:
-	blk_mq_free_bitmap(&hctx->ctx_map);
+	sbitmap_free(&hctx->ctx_map);
  free_ctxs:
 	kfree(hctx->ctxs);
  unregister_cpu_notifier:
@@ -1863,8 +1817,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	mutex_unlock(&q->sysfs_lock);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		struct blk_mq_ctxmap *map = &hctx->ctx_map;
-
 		/*
 		 * If no software queues are mapped to this hardware queue,
 		 * disable it and free the request entries.
@@ -1890,7 +1842,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 		 * This is more accurate and more efficient than looping
 		 * over all possibly mapped software queues.
 		 */
-		map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);
+		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
 
 		/*
 		 * Initialize batch roundrobin counts
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11037b7..71831f970fd3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -63,15 +63,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 void blk_mq_release(struct request_queue *q);
 
-/*
- * Basic implementation of sparser bitmap, allowing the user to spread
- * the bits over more cachelines.
- */
-struct blk_align_bitmap {
-	unsigned long word;
-	unsigned long depth;
-} ____cacheline_aligned_in_smp;
-
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
 					   unsigned int cpu)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 60ef14cbcd2d..2575779cf13f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -2,6 +2,7 @@
 #define BLK_MQ_H
 
 #include <linux/blkdev.h>
+#include <linux/sbitmap.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -12,12 +13,6 @@ struct blk_mq_cpu_notifier {
 	int (*notify)(void *data, unsigned long action, unsigned int cpu);
 };
 
-struct blk_mq_ctxmap {
-	unsigned int size;
-	unsigned int bits_per_word;
-	struct blk_align_bitmap *map;
-};
-
 struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
@@ -37,7 +32,7 @@ struct blk_mq_hw_ctx {
 
 	void			*driver_data;
 
-	struct blk_mq_ctxmap	ctx_map;
+	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
new file mode 100644
index 000000000000..1a3b836042e1
--- /dev/null
+++ b/include/linux/sbitmap.h
@@ -0,0 +1,327 @@
+/*
+ * Fast and scalable bitmaps.
+ *
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LINUX_SCALE_BITMAP_H
+#define __LINUX_SCALE_BITMAP_H
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+/**
+ * struct sbitmap_word - Word in a &struct sbitmap.
+ */
+struct sbitmap_word {
+	/**
+	 * @word: The bitmap word itself.
+	 */
+	unsigned long word;
+
+	/**
+	 * @depth: Number of bits being used in @word.
+	 */
+	unsigned long depth;
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct sbitmap - Scalable bitmap.
+ *
+ * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
+ * trades off higher memory usage for better scalability.
+ */
+struct sbitmap {
+	/**
+	 * @depth: Number of bits used in the whole bitmap.
+	 */
+	unsigned int depth;
+
+	/**
+	 * @shift: log2(number of bits used per word)
+	 */
+	unsigned int shift;
+
+	/**
+	 * @map_nr: Number of words (cachelines) being used for the bitmap.
+	 */
+	unsigned int map_nr;
+
+	/**
+	 * @map: Allocated bitmap.
+	 */
+	struct sbitmap_word *map;
+};
+
+#define SBQ_WAIT_QUEUES 8
+#define SBQ_WAKE_BATCH 8
+
+/**
+ * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
+ */
+struct sbq_wait_state {
+	/**
+	 * @wait_cnt: Number of frees remaining before we wake up.
+	 */
+	atomic_t wait_cnt;
+
+	/**
+	 * @wait: Wait queue.
+	 */
+	wait_queue_head_t wait;
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
+ * bits.
+ *
+ * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
+ * avoid contention on the wait queue spinlock. This ensures that we don't hit a
+ * scalability wall when we run out of free bits and have to start putting tasks
+ * to sleep.
+ */
+struct sbitmap_queue {
+	/**
+	 * @sb: Scalable bitmap.
+	 */
+	struct sbitmap sb;
+
+	/**
+	 * @wake_batch: Number of bits which must be freed before we wake up any
+	 * waiters.
+	 */
+	unsigned int wake_batch;
+
+	/**
+	 * @wake_index: Next wait queue in @ws to wake up.
+	 */
+	atomic_t wake_index;
+
+	/**
+	 * @ws: Wait queues.
+	 */
+	struct sbq_wait_state *ws;
+};
+
+/**
+ * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
+ * @sb: Bitmap to initialize.
+ * @depth: Number of bits to allocate.
+ * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if
+ *         given, a good default is chosen.
+ * @flags: Allocation flags.
+ * @node: Memory node to allocate on.
+ *
+ * Return: Zero on success or negative errno on failure.
+ */
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+		      gfp_t flags, int node);
+
+/**
+ * sbitmap_free() - Free memory used by a &struct sbitmap.
+ * @sb: Bitmap to free.
+ */
+static inline void sbitmap_free(struct sbitmap *sb)
+{
+	kfree(sb->map);
+	sb->map = NULL;
+}
+
+/**
+ * sbitmap_resize() - Resize a &struct sbitmap.
+ * @sb: Bitmap to resize.
+ * @depth: New number of bits to resize to.
+ *
+ * Doesn't reallocate anything. It's up to the caller to ensure that the new
+ * depth doesn't exceed the depth that the sb was initialized with.
+ */
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
+
+/**
+ * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @round_robin: If true, be stricter about allocation order; always allocate
+ *               starting from the last allocated bit. This is less efficient
+ *               than the default behavior (false).
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
+
+/**
+ * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
+ * @sb: Bitmap to check.
+ *
+ * Return: true if any bit in the bitmap is set, false otherwise.
+ */
+bool sbitmap_any_bit_set(const struct sbitmap *sb);
+
+/**
+ * sbitmap_any_bit_clear() - Check for an unset bit in a &struct
+ * sbitmap.
+ * @sb: Bitmap to check.
+ *
+ * Return: true if any bit in the bitmap is clear, false otherwise.
+ */
+bool sbitmap_any_bit_clear(const struct sbitmap *sb);
+
+typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
+
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ *
+ * This is inline even though it's non-trivial so that the function calls to the
+ * callback will hopefully get optimized away.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+					void *data)
+{
+	unsigned int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		struct sbitmap_word *word = &sb->map[i];
+		unsigned int off, nr;
+
+		if (!word->word)
+			continue;
+
+		nr = 0;
+		off = i << sb->shift;
+		while (1) {
+			nr = find_next_bit(&word->word, word->depth, nr);
+			if (nr >= word->depth)
+				break;
+
+			if (!fn(sb, off + nr, data))
+				return;
+
+			nr++;
+		}
+	}
+}
+
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
+static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
+					    unsigned int bitnr)
+{
+	return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word;
+}
+
+/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */
+
+static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+unsigned int sbitmap_weight(const struct sbitmap *sb);
+
+/**
+ * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
+ * memory node.
+ * @sbq: Bitmap queue to initialize.
+ * @depth: See sbitmap_init_node().
+ * @shift: See sbitmap_init_node().
+ * @flags: Allocation flags.
+ * @node: Memory node to allocate on.
+ *
+ * Return: Zero on success or negative errno on failure.
+ */
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+			    int shift, gfp_t flags, int node);
+
+/**
+ * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
+ *
+ * @sbq: Bitmap queue to free.
+ */
+static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
+{
+	kfree(sbq->ws);
+	sbitmap_free(&sbq->sb);
+}
+
+/**
+ * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
+ * @sbq: Bitmap queue to resize.
+ * @depth: New number of bits to resize to.
+ *
+ * Like sbitmap_resize(), this doesn't reallocate anything. It has to do
+ * some extra work on the &struct sbitmap_queue, so it's not safe to just
+ * resize the underlying &struct sbitmap.
+ */
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
+
+/**
+ * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @nr: Bit number to free.
+ */
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr);
+
+static inline int sbq_index_inc(int index)
+{
+	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
+}
+
+static inline void sbq_index_atomic_inc(atomic_t *index)
+{
+	int old = atomic_read(index);
+	int new = sbq_index_inc(old);
+	atomic_cmpxchg(index, old, new);
+}
+
+/**
+ * sbq_wait_ptr() - Get the next wait queue to use for a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to wait on.
+ * @wait_index: A counter per "user" of @sbq.
+ */
+static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
+						  atomic_t *wait_index)
+{
+	struct sbq_wait_state *ws;
+
+	ws = &sbq->ws[atomic_read(wait_index)];
+	sbq_index_atomic_inc(wait_index);
+	return ws;
+}
+
+/**
+ * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to wake up.
+ */
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
+
+#endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index d79909dc01ec..942fb8091a86 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -550,4 +550,7 @@ config STACKDEPOT
 	bool
 	select STACKTRACE
 
+config SBITMAP
+	bool
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index cfa68eb269e4..2cbfd2904994 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -228,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
 obj-$(CONFIG_UBSAN) += ubsan.o
 
 UBSAN_SANITIZE_ubsan.o := n
+
+obj-$(CONFIG_SBITMAP) += sbitmap.o
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
new file mode 100644
index 000000000000..dfc084ac6937
--- /dev/null
+++ b/lib/sbitmap.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/sbitmap.h>
+
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+		      gfp_t flags, int node)
+{
+	unsigned int bits_per_word;
+	unsigned int i;
+
+	if (shift < 0) {
+		shift = ilog2(BITS_PER_LONG);
+		/*
+		 * If the bitmap is small, shrink the number of bits per word so
+		 * we spread over a few cachelines, at least. If less than 4
+		 * bits, just forget about it, it's not going to work optimally
+		 * anyway.
+		 */
+		if (depth >= 4) {
+			while ((4U << shift) > depth)
+				shift--;
+		}
+	}
+	bits_per_word = 1U << shift;
+	if (bits_per_word > BITS_PER_LONG)
+		return -EINVAL;
+
+	sb->shift = shift;
+	sb->depth = depth;
+	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+	if (depth == 0) {
+		sb->map = NULL;
+		return 0;
+	}
+
+	sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
+	if (!sb->map)
+		return -ENOMEM;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		sb->map[i].depth = min(depth, bits_per_word);
+		depth -= sb->map[i].depth;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_init_node);
+
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
+{
+	unsigned int bits_per_word = 1U << sb->shift;
+	unsigned int i;
+
+	sb->depth = depth;
+	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		sb->map[i].depth = min(depth, bits_per_word);
+		depth -= sb->map[i].depth;
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_resize);
+
+static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
+			      bool wrap)
+{
+	unsigned int orig_hint = hint;
+	int nr;
+
+	while (1) {
+		nr = find_next_zero_bit(&word->word, word->depth, hint);
+		if (unlikely(nr >= word->depth)) {
+			/*
+			 * We started with an offset, and we didn't reset the
+			 * offset to 0 in a failure case, so start from 0 to
+			 * exhaust the map.
+			 */
+			if (orig_hint && hint && wrap) {
+				hint = orig_hint = 0;
+				continue;
+			}
+			return -1;
+		}
+
+		if (!test_and_set_bit(nr, &word->word))
+			break;
+
+		hint = nr + 1;
+		if (hint >= word->depth - 1)
+			hint = 0;
+	}
+
+	return nr;
+}
+
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
+{
+	unsigned int i, index;
+	int nr = -1;
+
+	index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		nr = __sbitmap_get_word(&sb->map[index],
+					SB_NR_TO_BIT(sb, alloc_hint),
+					!round_robin);
+		if (nr != -1) {
+			nr += index << sb->shift;
+			break;
+		}
+
+		/* Jump to next index. */
+		index++;
+		alloc_hint = index << sb->shift;
+
+		if (index >= sb->map_nr) {
+			index = 0;
+			alloc_hint = 0;
+		}
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get);
+
+bool sbitmap_any_bit_set(const struct sbitmap *sb)
+{
+	unsigned int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		if (sb->map[i].word)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
+
+bool sbitmap_any_bit_clear(const struct sbitmap *sb)
+{
+	unsigned int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		const struct sbitmap_word *word = &sb->map[i];
+		unsigned long ret;
+
+		ret = find_first_zero_bit(&word->word, word->depth);
+		if (ret < word->depth)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	unsigned int i, weight;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		const struct sbitmap_word *word = &sb->map[i];
+
+		weight += bitmap_weight(&word->word, word->depth);
+	}
+	return weight;
+}
+EXPORT_SYMBOL_GPL(sbitmap_weight);
+
+static unsigned int sbq_calc_wake_batch(unsigned int depth)
+{
+	unsigned int wake_batch;
+
+	/*
+	 * For each batch, we wake up one queue. We need to make sure that our
+	 * batch size is small enough that the full depth of the bitmap is
+	 * enough to wake up all of the queues.
+	 */
+	wake_batch = SBQ_WAKE_BATCH;
+	if (wake_batch > depth / SBQ_WAIT_QUEUES)
+		wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+
+	return wake_batch;
+}
+
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+			    int shift, gfp_t flags, int node)
+{
+	int ret;
+	int i;
+
+	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
+	if (ret)
+		return ret;
+
+	sbq->wake_batch = sbq_calc_wake_batch(depth);
+	atomic_set(&sbq->wake_index, 0);
+
+	sbq->ws = kzalloc(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags);
+	if (!sbq->ws) {
+		sbitmap_free(&sbq->sb);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		init_waitqueue_head(&sbq->ws[i].wait);
+		atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+	sbq->wake_batch = sbq_calc_wake_batch(depth);
+	sbitmap_resize(&sbq->sb, depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
+
+static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+{
+	int i, wake_index;
+
+	wake_index = atomic_read(&sbq->wake_index);
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+		if (waitqueue_active(&ws->wait)) {
+			int o = atomic_read(&sbq->wake_index);
+
+			if (wake_index != o)
+				atomic_cmpxchg(&sbq->wake_index, o, wake_index);
+			return ws;
+		}
+
+		wake_index = sbq_index_inc(wake_index);
+	}
+
+	return NULL;
+}
+
+static void sbq_wake_up(struct sbitmap_queue *sbq)
+{
+	struct sbq_wait_state *ws;
+	int wait_cnt;
+
+	/* Ensure that the wait list checks occur after clear_bit(). */
+	smp_mb();
+
+	ws = sbq_wake_ptr(sbq);
+	if (!ws)
+		return;
+
+	wait_cnt = atomic_dec_return(&ws->wait_cnt);
+	if (unlikely(wait_cnt < 0))
+		wait_cnt = atomic_inc_return(&ws->wait_cnt);
+	if (wait_cnt == 0) {
+		atomic_add(sbq->wake_batch, &ws->wait_cnt);
+		sbq_index_atomic_inc(&sbq->wake_index);
+		wake_up(&ws->wait);
+	}
+}
+
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr)
+{
+	sbitmap_clear_bit(&sbq->sb, nr);
+	sbq_wake_up(sbq);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
+
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
+{
+	int i, wake_index;
+
+	/*
+	 * Make sure all changes prior to this are visible from other CPUs.
+	 */
+	smp_mb();
+	wake_index = atomic_read(&sbq->wake_index);
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+		if (waitqueue_active(&ws->wait))
+			wake_up(&ws->wait);
+
+		wake_index = sbq_index_inc(wake_index);
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
-- 
cgit v1.2.3


From 40aabb67464d5aad9ca3d2a5fedee56e2ff45aa0 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Sat, 17 Sep 2016 01:28:23 -0700
Subject: sbitmap: push per-cpu last_tag into sbitmap_queue

Allocating your own per-cpu allocation hint separately makes for an
awkward API. Instead, allocate the per-cpu hint as part of the struct
sbitmap_queue. There's no point for a struct sbitmap_queue without the
cache, but you can still use a bare struct sbitmap.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c      | 53 ++++++++++++++++---------------------------------
 block/blk-mq-tag.h      |  3 ++-
 block/blk-mq.c          |  2 +-
 block/blk-mq.h          |  2 --
 include/linux/sbitmap.h | 45 ++++++++++++++++++++++++++++++++++++++++-
 lib/sbitmap.c           | 35 +++++++++++++++++++++++++++++++-
 6 files changed, 98 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2cbdecd594e9..c9a22dbbbda1 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -94,39 +94,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
 
 static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
-		    unsigned int *tag_cache, struct blk_mq_tags *tags)
+		    struct blk_mq_tags *tags)
 {
-	unsigned int last_tag;
-	int tag;
-
 	if (!hctx_may_queue(hctx, bt))
 		return -1;
-
-	last_tag = *tag_cache;
-	tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags));
-
-	if (tag == -1) {
-		*tag_cache = 0;
-	} else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) {
-		last_tag = tag + 1;
-		if (last_tag >= bt->sb.depth - 1)
-			last_tag = 0;
-		*tag_cache = last_tag;
-	}
-
-	return tag;
+	return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags));
 }
 
-static int bt_get(struct blk_mq_alloc_data *data,
-		  struct sbitmap_queue *bt,
-		  struct blk_mq_hw_ctx *hctx,
-		  unsigned int *last_tag, struct blk_mq_tags *tags)
+static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
+		  struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
 {
 	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(hctx, bt, last_tag, tags);
+	tag = __bt_get(hctx, bt, tags);
 	if (tag != -1)
 		return tag;
 
@@ -137,7 +119,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 	do {
 		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(hctx, bt, last_tag, tags);
+		tag = __bt_get(hctx, bt, tags);
 		if (tag != -1)
 			break;
 
@@ -154,7 +136,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __bt_get(hctx, bt, last_tag, tags);
+		tag = __bt_get(hctx, bt, tags);
 		if (tag != -1)
 			break;
 
@@ -168,7 +150,6 @@ static int bt_get(struct blk_mq_alloc_data *data,
 		if (data->flags & BLK_MQ_REQ_RESERVED) {
 			bt = &data->hctx->tags->breserved_tags;
 		} else {
-			last_tag = &data->ctx->last_tag;
 			hctx = data->hctx;
 			bt = &hctx->tags->bitmap_tags;
 		}
@@ -185,7 +166,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	int tag;
 
 	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
-			&data->ctx->last_tag, data->hctx->tags);
+		     data->hctx->tags);
 	if (tag >= 0)
 		return tag + data->hctx->tags->nr_reserved_tags;
 
@@ -194,15 +175,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
 
 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
 {
-	int tag, zero = 0;
+	int tag;
 
 	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
-		data->hctx->tags);
+	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
+		     data->hctx->tags);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
 
@@ -216,8 +197,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return __blk_mq_get_tag(data);
 }
 
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
-		    unsigned int *last_tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+		    unsigned int tag)
 {
 	struct blk_mq_tags *tags = hctx->tags;
 
@@ -225,12 +206,12 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);
-		sbitmap_queue_clear(&tags->bitmap_tags, real_tag);
-		if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
-			*last_tag = real_tag;
+		sbitmap_queue_clear(&tags->bitmap_tags, real_tag,
+				    BT_ALLOC_RR(tags), ctx->cpu);
 	} else {
 		BUG_ON(tag >= tags->nr_reserved_tags);
-		sbitmap_queue_clear(&tags->breserved_tags, tag);
+		sbitmap_queue_clear(&tags->breserved_tags, tag,
+				    BT_ALLOC_RR(tags), ctx->cpu);
 	}
 }
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 3215c08c63cc..2b1d52ed82e0 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -27,7 +27,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			   unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6603be18064e..e0a69daddbd8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -303,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	rq->cmd_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
+	blk_mq_put_tag(hctx, ctx, tag);
 	blk_queue_exit(q);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 71831f970fd3..9b15d2ef7f7b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -12,8 +12,6 @@ struct blk_mq_ctx {
 	unsigned int		cpu;
 	unsigned int		index_hw;
 
-	unsigned int		last_tag ____cacheline_aligned_in_smp;
-
 	/* incremented at dispatch time */
 	unsigned long		rq_dispatched[2];
 	unsigned long		rq_merged;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 1a3b836042e1..6745545e0b22 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -99,6 +99,14 @@ struct sbitmap_queue {
 	 */
 	struct sbitmap sb;
 
+	/*
+	 * @alloc_hint: Cache of last successfully allocated or freed bit.
+	 *
+	 * This is per-cpu, which allows multiple users to stick to different
+	 * cachelines until the map is exhausted.
+	 */
+	unsigned int __percpu *alloc_hint;
+
 	/**
 	 * @wake_batch: Number of bits which must be freed before we wake up any
 	 * waiters.
@@ -267,6 +275,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
 {
 	kfree(sbq->ws);
+	free_percpu(sbq->alloc_hint);
 	sbitmap_free(&sbq->sb);
 }
 
@@ -281,13 +290,47 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
  */
 void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 
+/**
+ * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
+ * sbitmap_queue with preemption already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @round_robin: See sbitmap_get().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin);
+
+/**
+ * sbitmap_queue_get() - Try to allocate a free bit from a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to allocate from.
+ * @round_robin: See sbitmap_get().
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ *       sbitmap_queue_clear()).
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin,
+				    unsigned int *cpu)
+{
+	int nr;
+
+	*cpu = get_cpu();
+	nr = __sbitmap_queue_get(sbq, round_robin);
+	put_cpu();
+	return nr;
+}
+
 /**
  * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
  * &struct sbitmap_queue.
  * @sbq: Bitmap to free from.
  * @nr: Bit number to free.
+ * @round_robin: See sbitmap_get().
+ * @cpu: CPU the bit was allocated on.
  */
-void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr);
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+			 bool round_robin, unsigned int cpu);
 
 static inline int sbq_index_inc(int index)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 4d8e97e470ee..1651ad9d5530 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -205,11 +205,18 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	if (ret)
 		return ret;
 
+	sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
+	if (!sbq->alloc_hint) {
+		sbitmap_free(&sbq->sb);
+		return -ENOMEM;
+	}
+
 	sbq->wake_batch = sbq_calc_wake_batch(depth);
 	atomic_set(&sbq->wake_index, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
+		free_percpu(sbq->alloc_hint);
 		sbitmap_free(&sbq->sb);
 		return -ENOMEM;
 	}
@@ -229,6 +236,29 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
 
+int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin)
+{
+	unsigned int hint;
+	int nr;
+
+	hint = this_cpu_read(*sbq->alloc_hint);
+	nr = sbitmap_get(&sbq->sb, hint, round_robin);
+
+	if (nr == -1) {
+		/* If the map is full, a hint won't do us much good. */
+		this_cpu_write(*sbq->alloc_hint, 0);
+	} else if (nr == hint || unlikely(round_robin)) {
+		/* Only update the hint if we used it. */
+		hint = nr + 1;
+		if (hint >= sbq->sb.depth - 1)
+			hint = 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+
 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
@@ -273,10 +303,13 @@ static void sbq_wake_up(struct sbitmap_queue *sbq)
 	}
 }
 
-void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr)
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+			 bool round_robin, unsigned int cpu)
 {
 	sbitmap_clear_bit(&sbq->sb, nr);
 	sbq_wake_up(sbq);
+	if (likely(!round_robin))
+		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
 
-- 
cgit v1.2.3


From f4a644db86669d938c71f19560aebf69d4720d63 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Sat, 17 Sep 2016 01:28:24 -0700
Subject: sbitmap: push alloc policy into sbitmap_queue

Again, there's no point in passing this in every time. Make it part of
struct sbitmap_queue and clean up the API.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c      | 33 +++++++++++++++------------------
 block/blk-mq-tag.h      |  1 -
 include/linux/sbitmap.h | 19 +++++++++++--------
 lib/sbitmap.c           | 14 ++++++++------
 4 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c9a22dbbbda1..e1c2bedb0bf9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -91,14 +91,11 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
-
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
-		    struct blk_mq_tags *tags)
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
 {
 	if (!hctx_may_queue(hctx, bt))
 		return -1;
-	return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags));
+	return __sbitmap_queue_get(bt);
 }
 
 static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
@@ -108,7 +105,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(hctx, bt, tags);
+	tag = __bt_get(hctx, bt);
 	if (tag != -1)
 		return tag;
 
@@ -119,7 +116,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 	do {
 		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(hctx, bt, tags);
+		tag = __bt_get(hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -136,7 +133,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __bt_get(hctx, bt, tags);
+		tag = __bt_get(hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -206,12 +203,10 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);
-		sbitmap_queue_clear(&tags->bitmap_tags, real_tag,
-				    BT_ALLOC_RR(tags), ctx->cpu);
+		sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
 	} else {
 		BUG_ON(tag >= tags->nr_reserved_tags);
-		sbitmap_queue_clear(&tags->breserved_tags, tag,
-				    BT_ALLOC_RR(tags), ctx->cpu);
+		sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
 	}
 }
 
@@ -363,21 +358,23 @@ static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
 	return bt->sb.depth - sbitmap_weight(&bt->sb);
 }
 
-static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node)
+static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
+		    bool round_robin, int node)
 {
-	return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node);
+	return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
+				       node);
 }
 
 static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
 						   int node, int alloc_policy)
 {
 	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
 
-	tags->alloc_policy = alloc_policy;
-
-	if (bt_alloc(&tags->bitmap_tags, depth, node))
+	if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
 		goto free_tags;
-	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node))
+	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
+		     node))
 		goto free_bitmap_tags;
 
 	return tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2b1d52ed82e0..f90b850ce43d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -18,7 +18,6 @@ struct blk_mq_tags {
 	struct request **rqs;
 	struct list_head page_list;
 
-	int alloc_policy;
 	cpumask_var_t cpumask;
 };
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 6745545e0b22..f017fd6e69c4 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -122,6 +122,11 @@ struct sbitmap_queue {
 	 * @ws: Wait queues.
 	 */
 	struct sbq_wait_state *ws;
+
+	/**
+	 * @round_robin: Allocate bits in strict round-robin order.
+	 */
+	bool round_robin;
 };
 
 /**
@@ -259,13 +264,14 @@ unsigned int sbitmap_weight(const struct sbitmap *sb);
  * @sbq: Bitmap queue to initialize.
  * @depth: See sbitmap_init_node().
  * @shift: See sbitmap_init_node().
+ * @round_robin: See sbitmap_get().
  * @flags: Allocation flags.
  * @node: Memory node to allocate on.
  *
  * Return: Zero on success or negative errno on failure.
  */
 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
-			    int shift, gfp_t flags, int node);
+			    int shift, bool round_robin, gfp_t flags, int node);
 
 /**
  * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
@@ -294,29 +300,27 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
  * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue with preemption already disabled.
  * @sbq: Bitmap queue to allocate from.
- * @round_robin: See sbitmap_get().
  *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
-int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin);
+int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
 /**
  * sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue.
  * @sbq: Bitmap queue to allocate from.
- * @round_robin: See sbitmap_get().
  * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
  *       sbitmap_queue_clear()).
  *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
-static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin,
+static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
 				    unsigned int *cpu)
 {
 	int nr;
 
 	*cpu = get_cpu();
-	nr = __sbitmap_queue_get(sbq, round_robin);
+	nr = __sbitmap_queue_get(sbq);
 	put_cpu();
 	return nr;
 }
@@ -326,11 +330,10 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin,
  * &struct sbitmap_queue.
  * @sbq: Bitmap to free from.
  * @nr: Bit number to free.
- * @round_robin: See sbitmap_get().
  * @cpu: CPU the bit was allocated on.
  */
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
-			 bool round_robin, unsigned int cpu);
+			 unsigned int cpu);
 
 static inline int sbq_index_inc(int index)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 1651ad9d5530..be55f744b771 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -196,7 +196,7 @@ static unsigned int sbq_calc_wake_batch(unsigned int depth)
 }
 
 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
-			    int shift, gfp_t flags, int node)
+			    int shift, bool round_robin, gfp_t flags, int node)
 {
 	int ret;
 	int i;
@@ -225,6 +225,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 		init_waitqueue_head(&sbq->ws[i].wait);
 		atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
 	}
+
+	sbq->round_robin = round_robin;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
@@ -236,18 +238,18 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
 
-int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin)
+int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 {
 	unsigned int hint;
 	int nr;
 
 	hint = this_cpu_read(*sbq->alloc_hint);
-	nr = sbitmap_get(&sbq->sb, hint, round_robin);
+	nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
 
 	if (nr == -1) {
 		/* If the map is full, a hint won't do us much good. */
 		this_cpu_write(*sbq->alloc_hint, 0);
-	} else if (nr == hint || unlikely(round_robin)) {
+	} else if (nr == hint || unlikely(sbq->round_robin)) {
 		/* Only update the hint if we used it. */
 		hint = nr + 1;
 		if (hint >= sbq->sb.depth - 1)
@@ -304,11 +306,11 @@ static void sbq_wake_up(struct sbitmap_queue *sbq)
 }
 
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
-			 bool round_robin, unsigned int cpu)
+			 unsigned int cpu)
 {
 	sbitmap_clear_bit(&sbq->sb, nr);
 	sbq_wake_up(sbq);
-	if (likely(!round_robin))
+	if (likely(!sbq->round_robin))
 		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
-- 
cgit v1.2.3


From b21d5b301794ae332eaa6e177d71fe8b77d3664c Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 16 Sep 2016 14:25:06 +0200
Subject: blk-mq: register device instead of disk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable devices without a gendisk instance to register itself with blk-mq
and expose the associated multi-queue sysfs entries.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c   | 17 +++++++----------
 block/blk-sysfs.c      |  4 ++--
 drivers/md/dm-rq.c     |  2 +-
 include/linux/blk-mq.h |  4 ++--
 4 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 3c385b196bc7..01fb455d3377 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -393,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	return ret;
 }
 
-static void __blk_mq_unregister_disk(struct gendisk *disk)
+static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
-	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	int i, j;
@@ -413,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk)
 	kobject_del(&q->mq_kobj);
 	kobject_put(&q->mq_kobj);
 
-	kobject_put(&disk_to_dev(disk)->kobj);
+	kobject_put(&dev->kobj);
 
 	q->mq_sysfs_init_done = false;
 }
 
-void blk_mq_unregister_disk(struct gendisk *disk)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
 	blk_mq_disable_hotplug();
-	__blk_mq_unregister_disk(disk);
+	__blk_mq_unregister_dev(dev, q);
 	blk_mq_enable_hotplug();
 }
 
@@ -443,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 	}
 }
 
-int blk_mq_register_disk(struct gendisk *disk)
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 {
-	struct device *dev = disk_to_dev(disk);
-	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
 	int ret, i;
 
@@ -467,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 	}
 
 	if (ret)
-		__blk_mq_unregister_disk(disk);
+		__blk_mq_unregister_dev(dev, q);
 	else
 		q->mq_sysfs_init_done = true;
 out:
@@ -475,7 +472,7 @@ out:
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blk_mq_register_disk);
+EXPORT_SYMBOL_GPL(blk_mq_register_dev);
 
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f87a7e747d36..9cc8d7c5439a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk)
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
 	if (q->mq_ops)
-		blk_mq_register_disk(disk);
+		blk_mq_register_dev(dev, q);
 
 	if (!q->request_fn)
 		return 0;
@@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk)
 		return;
 
 	if (q->mq_ops)
-		blk_mq_unregister_disk(disk);
+		blk_mq_unregister_dev(disk_to_dev(disk), q);
 
 	if (q->request_fn)
 		elv_unregister_queue(q);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1ca7463e8bb2..ee48230a2952 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -955,7 +955,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	dm_init_md_queue(md);
 
 	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
-	blk_mq_register_disk(md->disk);
+	blk_mq_register_dev(disk_to_dev(md->disk), q);
 
 	return 0;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2575779cf13f..fbcfdf323243 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -175,8 +175,8 @@ enum {
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q);
-int blk_mq_register_disk(struct gendisk *);
-void blk_mq_unregister_disk(struct gendisk *);
+int blk_mq_register_dev(struct device *, struct request_queue *);
+void blk_mq_unregister_dev(struct device *, struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
-- 
cgit v1.2.3


From b0b4e09c1ae71c4ec33df0616b830ae050006e9b Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 16 Sep 2016 14:25:07 +0200
Subject: lightnvm: control life of nvm_dev in driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LightNVM compatible device drivers does not have a method to expose
LightNVM specific sysfs entries.

To enable LightNVM sysfs entries to be exposed, lightnvm device
drivers require a struct device to attach it to. To allow both the
actual device driver and lightnvm sysfs entries to coexist, the device
driver tracks the lifetime of the nvm_dev structure.

This patch refactors NVMe and null_blk to handle the lifetime of struct
nvm_dev, which eliminates the need for struct gendisk when a lightnvm
compatible device is provided.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c     | 22 ++++++++++++++++++++--
 drivers/lightnvm/core.c      | 35 ++++++++---------------------------
 drivers/nvme/host/core.c     | 36 +++++++++++++++---------------------
 drivers/nvme/host/lightnvm.c | 31 ++++++++++++++++++++++++-------
 drivers/nvme/host/nvme.h     | 12 +++++++-----
 include/linux/lightnvm.h     | 15 +++++++++------
 6 files changed, 83 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 895867a8a783..91e1de898daf 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -34,6 +34,7 @@ struct nullb {
 	unsigned int index;
 	struct request_queue *q;
 	struct gendisk *disk;
+	struct nvm_dev *ndev;
 	struct blk_mq_tag_set tag_set;
 	struct hrtimer timer;
 	unsigned int queue_depth;
@@ -550,12 +551,29 @@ static struct nvm_dev_ops null_lnvm_dev_ops = {
 
 static int null_nvm_register(struct nullb *nullb)
 {
-	return nvm_register(nullb->q, nullb->disk_name, &null_lnvm_dev_ops);
+	struct nvm_dev *dev;
+	int rv;
+
+	dev = nvm_alloc_dev(0);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->q = nullb->q;
+	memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
+	dev->ops = &null_lnvm_dev_ops;
+
+	rv = nvm_register(dev);
+	if (rv) {
+		kfree(dev);
+		return rv;
+	}
+	nullb->ndev = dev;
+	return 0;
 }
 
 static void null_nvm_unregister(struct nullb *nullb)
 {
-	nvm_unregister(nullb->disk_name);
+	nvm_unregister(nullb->ndev);
 }
 #else
 static int null_nvm_register(struct nullb *nullb)
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 25c5df920326..a99b59d1eb36 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -660,22 +660,15 @@ static void nvm_exit(struct nvm_dev *dev)
 	pr_info("nvm: successfully unloaded\n");
 }
 
-int nvm_register(struct request_queue *q, char *disk_name,
-							struct nvm_dev_ops *ops)
+struct nvm_dev *nvm_alloc_dev(int node)
 {
-	struct nvm_dev *dev;
-	int ret;
-
-	if (!ops->identity)
-		return -EINVAL;
-
-	dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
+	return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node);
+}
+EXPORT_SYMBOL(nvm_alloc_dev);
 
-	dev->q = q;
-	dev->ops = ops;
-	strncpy(dev->name, disk_name, DISK_NAME_LEN);
+int nvm_register(struct nvm_dev *dev)
+{
+	int ret;
 
 	ret = nvm_init(dev);
 	if (ret)
@@ -714,29 +707,17 @@ int nvm_register(struct request_queue *q, char *disk_name,
 	return 0;
 err_init:
 	kfree(dev->lun_map);
-	kfree(dev);
 	return ret;
 }
 EXPORT_SYMBOL(nvm_register);
 
-void nvm_unregister(char *disk_name)
+void nvm_unregister(struct nvm_dev *dev)
 {
-	struct nvm_dev *dev;
-
 	down_write(&nvm_lock);
-	dev = nvm_find_nvm_dev(disk_name);
-	if (!dev) {
-		pr_err("nvm: could not find device %s to unregister\n",
-								disk_name);
-		up_write(&nvm_lock);
-		return;
-	}
-
 	list_del(&dev->devices);
 	up_write(&nvm_lock);
 
 	nvm_exit(dev);
-	kfree(dev);
 }
 EXPORT_SYMBOL(nvm_unregister);
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2c3da3315a02..3c707d83b1da 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -156,12 +156,14 @@ static void nvme_free_ns(struct kref *kref)
 {
 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 
-	if (ns->type == NVME_NS_LIGHTNVM)
-		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+	if (ns->ndev)
+		nvme_nvm_unregister(ns);
 
-	spin_lock(&dev_list_lock);
-	ns->disk->private_data = NULL;
-	spin_unlock(&dev_list_lock);
+	if (ns->disk) {
+		spin_lock(&dev_list_lock);
+		ns->disk->private_data = NULL;
+		spin_unlock(&dev_list_lock);
+	}
 
 	put_disk(ns->disk);
 	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
@@ -891,8 +893,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
 static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
 {
 	if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) {
-		dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n",
-				__func__);
+		dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__);
 		return -ENODEV;
 	}
 
@@ -1683,18 +1684,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		goto out_free_queue;
 
 	if (nvme_nvm_ns_supported(ns, id)) {
-		if (nvme_nvm_register(ns->queue, disk_name)) {
-			dev_warn(ctrl->dev,
-				"%s: LightNVM init failure\n", __func__);
+		if (nvme_nvm_register(ns, disk_name, node)) {
+			dev_warn(ctrl->dev, "%s: LightNVM init failure\n",
+								__func__);
 			goto out_free_id;
 		}
-
-		disk = alloc_disk_node(0, node);
-		if (!disk)
-			goto out_free_id;
-		memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
-		ns->disk = disk;
-		ns->type = NVME_NS_LIGHTNVM;
 	} else {
 		disk = alloc_disk_node(0, node);
 		if (!disk)
@@ -1718,7 +1712,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	kfree(id);
 
-	if (ns->type == NVME_NS_LIGHTNVM)
+	if (ns->ndev)
 		return;
 
 	device_add_disk(ctrl->device, ns->disk);
@@ -1742,7 +1736,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
-	if (ns->disk->flags & GENHD_FL_UP) {
+	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 		if (blk_get_integrity(ns->disk))
 			blk_integrity_unregister(ns->disk);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
@@ -1765,7 +1759,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
-		if (revalidate_disk(ns->disk))
+		if (ns->disk && revalidate_disk(ns->disk))
 			nvme_ns_remove(ns);
 		nvme_put_ns(ns);
 	} else
@@ -2070,7 +2064,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		 * Revalidating a dead namespace sets capacity to 0. This will
 		 * end buffered writers dirtying pages that can't be synced.
 		 */
-		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+		if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
 			revalidate_disk(ns->disk);
 
 		blk_set_queue_dying(ns->queue);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 7268a7a1a19a..798fcd9f5d1f 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -474,9 +474,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 	c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
 
 	if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
-		/* momentarily hardcode the shift configuration. lba_shift from
-		 * nvm_dev will be available in a follow-up patch */
-		c->hb_rw.slba = cpu_to_le64(rqd->bio->bi_iter.bi_sector >> 3);
+		c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
+					rqd->bio->bi_iter.bi_sector));
 }
 
 static void nvme_nvm_end_io(struct request *rq, int error)
@@ -593,14 +592,32 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.max_phys_sect		= 64,
 };
 
-int nvme_nvm_register(struct request_queue *q, char *disk_name)
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 {
-	return nvm_register(q, disk_name, &nvme_nvm_dev_ops);
+	struct request_queue *q = ns->queue;
+	struct nvm_dev *dev;
+	int ret;
+
+	dev = nvm_alloc_dev(node);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->q = q;
+	memcpy(dev->name, disk_name, DISK_NAME_LEN);
+	dev->ops = &nvme_nvm_dev_ops;
+	ns->ndev = dev;
+
+	ret = nvm_register(dev);
+
+	ns->lba_shift = ilog2(dev->sec_size) - 9;
+
+	return ret;
 }
 
-void nvme_nvm_unregister(struct request_queue *q, char *disk_name)
+void nvme_nvm_unregister(struct nvme_ns *ns)
 {
-	nvm_unregister(disk_name);
+	nvm_unregister(ns->ndev);
+	kfree(ns->ndev);
 }
 
 /* move to shared place when used in multiple places. */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab18b78102bf..e0535c14e538 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -18,6 +18,7 @@
 #include <linux/pci.h>
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
+#include <linux/lightnvm.h>
 
 enum {
 	/*
@@ -154,6 +155,7 @@ struct nvme_ns {
 	struct nvme_ctrl *ctrl;
 	struct request_queue *queue;
 	struct gendisk *disk;
+	struct nvm_dev *ndev;
 	struct kref kref;
 	int instance;
 
@@ -165,7 +167,6 @@ struct nvme_ns {
 	u16 ms;
 	bool ext;
 	u8 pi_type;
-	int type;
 	unsigned long flags;
 
 #define NVME_NS_REMOVING 0
@@ -307,15 +308,16 @@ int nvme_sg_get_version_num(int __user *ip);
 
 #ifdef CONFIG_NVM
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
-int nvme_nvm_register(struct request_queue *q, char *disk_name);
-void nvme_nvm_unregister(struct request_queue *q, char *disk_name);
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
+void nvme_nvm_unregister(struct nvme_ns *ns);
 #else
-static inline int nvme_nvm_register(struct request_queue *q, char *disk_name)
+static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
+								int node)
 {
 	return 0;
 }
 
-static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {};
+static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
 
 static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ba78b8306674..5afc2634f332 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -524,9 +524,9 @@ extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
 								unsigned long);
 extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
 
-extern int nvm_register(struct request_queue *, char *,
-						struct nvm_dev_ops *);
-extern void nvm_unregister(char *);
+extern struct nvm_dev *nvm_alloc_dev(int);
+extern int nvm_register(struct nvm_dev *);
+extern void nvm_unregister(struct nvm_dev *);
 
 void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
 
@@ -575,11 +575,14 @@ extern int nvm_dev_factory(struct nvm_dev *, int flags);
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
 
-static inline int nvm_register(struct request_queue *q, char *disk_name,
-							struct nvm_dev_ops *ops)
+static inline struct nvm_dev *nvm_alloc_dev(int node)
+{
+	return ERR_PTR(-EINVAL);
+}
+static inline int nvm_register(struct nvm_dev *dev)
 {
 	return -EINVAL;
 }
-static inline void nvm_unregister(char *disk_name) {}
+static inline void nvm_unregister(struct nvm_dev *dev) {}
 #endif /* CONFIG_NVM */
 #endif /* LIGHTNVM.H */
-- 
cgit v1.2.3


From 40267efddc296190d50c61d96daf277151447cf6 Mon Sep 17 00:00:00 2001
From: "Simon A. F. Lund" <slund@cnexlabs.com>
Date: Fri, 16 Sep 2016 14:25:08 +0200
Subject: lightnvm: expose device geometry through sysfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For a host to access an Open-Channel SSD, it has to know its geometry,
so that it writes and reads at the appropriate device bounds.

Currently, the geometry information is kept within the kernel, and not
exported to user-space for consumption. This patch exposes the
configuration through sysfs and enables user-space libraries, such as
liblightnvm, to use the sysfs implementation to get the geometry of an
Open-Channel SSD.

The sysfs entries are stored within the device hierarchy, and can be
found using the "lightnvm" device type.

An example configuration looks like this:

/sys/class/nvme/
└── nvme0n1
   ├── capabilities: 3
   ├── device_mode: 1
   ├── erase_max: 1000000
   ├── erase_typ: 1000000
   ├── flash_media_type: 0
   ├── media_capabilities: 0x00000001
   ├── media_type: 0
   ├── multiplane: 0x00010101
   ├── num_blocks: 1022
   ├── num_channels: 1
   ├── num_luns: 4
   ├── num_pages: 64
   ├── num_planes: 1
   ├── page_size: 4096
   ├── prog_max: 100000
   ├── prog_typ: 100000
   ├── read_max: 10000
   ├── read_typ: 10000
   ├── sector_oob_size: 0
   ├── sector_size: 4096
   ├── media_manager: gennvm
   ├── ppa_format: 0x380830082808001010102008
   ├── vendor_opcode: 0
   ├── max_phys_secs: 64
   └── version: 1

Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/Makefile    |   2 +-
 drivers/lightnvm/core.c      |  20 +++--
 drivers/lightnvm/lightnvm.h  |  35 ++++++++
 drivers/lightnvm/sysfs.c     | 195 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c     |  13 +--
 drivers/nvme/host/lightnvm.c |   9 +-
 drivers/nvme/host/nvme.h     |  18 +++-
 include/linux/lightnvm.h     |   3 +
 8 files changed, 278 insertions(+), 17 deletions(-)
 create mode 100644 drivers/lightnvm/lightnvm.h
 create mode 100644 drivers/lightnvm/sysfs.c

(limited to 'include')

diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index a7a0a22cf1a5..1f6b6521016a 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -2,6 +2,6 @@
 # Makefile for Open-Channel SSDs.
 #
 
-obj-$(CONFIG_NVM)		:= core.o sysblk.o
+obj-$(CONFIG_NVM)		:= core.o sysblk.o sysfs.o
 obj-$(CONFIG_NVM_GENNVM) 	+= gennvm.o
 obj-$(CONFIG_NVM_RRPC)		+= rrpc.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index a99b59d1eb36..a2393e1ef82e 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -27,6 +27,8 @@
 #include <linux/lightnvm.h>
 #include <linux/sched/sysctl.h>
 
+#include "lightnvm.h"
+
 static LIST_HEAD(nvm_tgt_types);
 static DECLARE_RWSEM(nvm_tgtt_lock);
 static LIST_HEAD(nvm_mgrs);
@@ -598,15 +600,19 @@ static void nvm_free_mgr(struct nvm_dev *dev)
 	dev->mt = NULL;
 }
 
-static void nvm_free(struct nvm_dev *dev)
+void nvm_free(struct nvm_dev *dev)
 {
 	if (!dev)
 		return;
 
 	nvm_free_mgr(dev);
 
+	if (dev->dma_pool)
+		dev->ops->destroy_dma_pool(dev->dma_pool);
+
 	kfree(dev->lptbl);
 	kfree(dev->lun_map);
+	kfree(dev);
 }
 
 static int nvm_init(struct nvm_dev *dev)
@@ -653,11 +659,7 @@ err:
 
 static void nvm_exit(struct nvm_dev *dev)
 {
-	if (dev->dma_pool)
-		dev->ops->destroy_dma_pool(dev->dma_pool);
-	nvm_free(dev);
-
-	pr_info("nvm: successfully unloaded\n");
+	nvm_sysfs_unregister_dev(dev);
 }
 
 struct nvm_dev *nvm_alloc_dev(int node)
@@ -689,6 +691,10 @@ int nvm_register(struct nvm_dev *dev)
 		}
 	}
 
+	ret = nvm_sysfs_register_dev(dev);
+	if (ret)
+		goto err_ppalist;
+
 	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
 		ret = nvm_get_sysblock(dev, &dev->sb);
 		if (!ret)
@@ -705,6 +711,8 @@ int nvm_register(struct nvm_dev *dev)
 	up_write(&nvm_lock);
 
 	return 0;
+err_ppalist:
+	dev->ops->destroy_dma_pool(dev->dma_pool);
 err_init:
 	kfree(dev->lun_map);
 	return ret;
diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h
new file mode 100644
index 000000000000..93f1aacc9f02
--- /dev/null
+++ b/drivers/lightnvm/lightnvm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 CNEX Labs. All rights reserved.
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#ifndef LIGHTNVM_H
+#define LIGHTNVM_H
+
+#include <linux/lightnvm.h>
+
+/* core -> sysfs.c */
+int nvm_sysfs_register_dev(struct nvm_dev *);
+void nvm_sysfs_unregister_dev(struct nvm_dev *);
+int nvm_sysfs_register(void);
+void nvm_sysfs_unregister(void);
+
+/* sysfs > core */
+void nvm_free(struct nvm_dev *);
+
+#endif
diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c
new file mode 100644
index 000000000000..72ad089c0269
--- /dev/null
+++ b/drivers/lightnvm/sysfs.c
@@ -0,0 +1,195 @@
+#include <linux/kernel.h>
+#include <linux/lightnvm.h>
+#include <linux/miscdevice.h>
+#include <linux/kobject.h>
+#include <linux/blk-mq.h>
+
+#include "lightnvm.h"
+
+static ssize_t nvm_dev_attr_show(struct device *dev,
+				 struct device_attribute *dattr, char *page)
+{
+	struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev);
+	struct nvm_id *id = &ndev->identity;
+	struct nvm_id_group *grp = &id->groups[0];
+	struct attribute *attr = &dattr->attr;
+
+	if (strcmp(attr->name, "version") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id);
+	} else if (strcmp(attr->name, "vendor_opcode") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt);
+	} else if (strcmp(attr->name, "capabilities") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
+	} else if (strcmp(attr->name, "device_mode") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+	} else if (strcmp(attr->name, "media_manager") == 0) {
+		if (!ndev->mt)
+			return scnprintf(page, PAGE_SIZE, "%s\n", "none");
+		return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
+	} else if (strcmp(attr->name, "ppa_format") == 0) {
+		return scnprintf(page, PAGE_SIZE,
+			"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+			id->ppaf.ch_offset, id->ppaf.ch_len,
+			id->ppaf.lun_offset, id->ppaf.lun_len,
+			id->ppaf.pln_offset, id->ppaf.pln_len,
+			id->ppaf.blk_offset, id->ppaf.blk_len,
+			id->ppaf.pg_offset, id->ppaf.pg_len,
+			id->ppaf.sect_offset, id->ppaf.sect_len);
+	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype);
+	} else if (strcmp(attr->name, "flash_media_type") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype);
+	} else if (strcmp(attr->name, "num_channels") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch);
+	} else if (strcmp(attr->name, "num_luns") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun);
+	} else if (strcmp(attr->name, "num_planes") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
+	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
+	} else if (strcmp(attr->name, "num_pages") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
+	} else if (strcmp(attr->name, "page_size") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz);
+	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs);
+	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos);
+	} else if (strcmp(attr->name, "read_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt);
+	} else if (strcmp(attr->name, "read_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm);
+	} else if (strcmp(attr->name, "prog_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt);
+	} else if (strcmp(attr->name, "prog_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm);
+	} else if (strcmp(attr->name, "erase_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet);
+	} else if (strcmp(attr->name, "erase_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem);
+	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos);
+	} else if (strcmp(attr->name, "media_capabilities") == 0) {
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap);
+	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n",
+				ndev->ops->max_phys_sect);
+	} else {
+		return scnprintf(page,
+				 PAGE_SIZE,
+				 "Unhandled attr(%s) in `nvm_dev_attr_show`\n",
+				 attr->name);
+	}
+}
+
+#define NVM_DEV_ATTR_RO(_name)						\
+	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
+
+static NVM_DEV_ATTR_RO(version);
+static NVM_DEV_ATTR_RO(vendor_opcode);
+static NVM_DEV_ATTR_RO(capabilities);
+static NVM_DEV_ATTR_RO(device_mode);
+static NVM_DEV_ATTR_RO(ppa_format);
+static NVM_DEV_ATTR_RO(media_manager);
+
+static NVM_DEV_ATTR_RO(media_type);
+static NVM_DEV_ATTR_RO(flash_media_type);
+static NVM_DEV_ATTR_RO(num_channels);
+static NVM_DEV_ATTR_RO(num_luns);
+static NVM_DEV_ATTR_RO(num_planes);
+static NVM_DEV_ATTR_RO(num_blocks);
+static NVM_DEV_ATTR_RO(num_pages);
+static NVM_DEV_ATTR_RO(page_size);
+static NVM_DEV_ATTR_RO(hw_sector_size);
+static NVM_DEV_ATTR_RO(oob_sector_size);
+static NVM_DEV_ATTR_RO(read_typ);
+static NVM_DEV_ATTR_RO(read_max);
+static NVM_DEV_ATTR_RO(prog_typ);
+static NVM_DEV_ATTR_RO(prog_max);
+static NVM_DEV_ATTR_RO(erase_typ);
+static NVM_DEV_ATTR_RO(erase_max);
+static NVM_DEV_ATTR_RO(multiplane_modes);
+static NVM_DEV_ATTR_RO(media_capabilities);
+static NVM_DEV_ATTR_RO(max_phys_secs);
+
+#define NVM_DEV_ATTR(_name) (dev_attr_##_name##)
+
+static struct attribute *nvm_dev_attrs[] = {
+	&dev_attr_version.attr,
+	&dev_attr_vendor_opcode.attr,
+	&dev_attr_capabilities.attr,
+	&dev_attr_device_mode.attr,
+	&dev_attr_media_manager.attr,
+
+	&dev_attr_ppa_format.attr,
+	&dev_attr_media_type.attr,
+	&dev_attr_flash_media_type.attr,
+	&dev_attr_num_channels.attr,
+	&dev_attr_num_luns.attr,
+	&dev_attr_num_planes.attr,
+	&dev_attr_num_blocks.attr,
+	&dev_attr_num_pages.attr,
+	&dev_attr_page_size.attr,
+	&dev_attr_hw_sector_size.attr,
+	&dev_attr_oob_sector_size.attr,
+	&dev_attr_read_typ.attr,
+	&dev_attr_read_max.attr,
+	&dev_attr_prog_typ.attr,
+	&dev_attr_prog_max.attr,
+	&dev_attr_erase_typ.attr,
+	&dev_attr_erase_max.attr,
+	&dev_attr_multiplane_modes.attr,
+	&dev_attr_media_capabilities.attr,
+	&dev_attr_max_phys_secs.attr,
+	NULL,
+};
+
+static struct attribute_group nvm_dev_attr_group = {
+	.name = "lightnvm",
+	.attrs = nvm_dev_attrs,
+};
+
+static const struct attribute_group *nvm_dev_attr_groups[] = {
+	&nvm_dev_attr_group,
+	NULL,
+};
+
+static void nvm_dev_release(struct device *device)
+{
+	struct nvm_dev *dev = container_of(device, struct nvm_dev, dev);
+	struct request_queue *q = dev->q;
+
+	pr_debug("nvm/sysfs: `nvm_dev_release`\n");
+
+	blk_mq_unregister_dev(device, q);
+
+	nvm_free(dev);
+}
+
+static struct device_type nvm_type = {
+	.name		= "lightnvm",
+	.groups		= nvm_dev_attr_groups,
+	.release	= nvm_dev_release,
+};
+
+int nvm_sysfs_register_dev(struct nvm_dev *dev)
+{
+	if (!dev->parent_dev)
+		return 0;
+
+	dev->dev.parent = dev->parent_dev;
+	dev_set_name(&dev->dev, "%s", dev->name);
+	dev->dev.type = &nvm_type;
+	device_initialize(&dev->dev);
+	device_add(&dev->dev);
+
+	blk_mq_register_dev(&dev->dev, dev->q);
+
+	return 0;
+}
+
+void nvm_sysfs_unregister_dev(struct nvm_dev *dev)
+{
+	if (dev && dev->parent_dev)
+		kobject_put(&dev->dev.kobj);
+}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3c707d83b1da..bd2156cbfc6c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1435,7 +1435,7 @@ static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
 								char *buf)
 {
-	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	int serial_len = sizeof(ctrl->serial);
 	int model_len = sizeof(ctrl->model);
@@ -1459,7 +1459,7 @@ static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
 								char *buf)
 {
-	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	return sprintf(buf, "%pU\n", ns->uuid);
 }
 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
@@ -1467,7 +1467,7 @@ static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
 								char *buf)
 {
-	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	return sprintf(buf, "%8phd\n", ns->eui);
 }
 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
@@ -1475,7 +1475,7 @@ static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
 								char *buf)
 {
-	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	return sprintf(buf, "%d\n", ns->ns_id);
 }
 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
@@ -1492,7 +1492,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
 		struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
 	if (a == &dev_attr_uuid.attr) {
 		if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
@@ -1684,7 +1684,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		goto out_free_queue;
 
 	if (nvme_nvm_ns_supported(ns, id)) {
-		if (nvme_nvm_register(ns, disk_name, node)) {
+		if (nvme_nvm_register(ns, disk_name, node,
+							&nvme_ns_attr_group)) {
 			dev_warn(ctrl->dev, "%s: LightNVM init failure\n",
 								__func__);
 			goto out_free_id;
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 798fcd9f5d1f..f5e3011e31fc 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -592,7 +592,8 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.max_phys_sect		= 64,
 };
 
-int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
+		      const struct attribute_group *attrs)
 {
 	struct request_queue *q = ns->queue;
 	struct nvm_dev *dev;
@@ -605,19 +606,23 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	dev->q = q;
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
+	dev->parent_dev = ns->ctrl->device;
+	dev->private_data = ns;
 	ns->ndev = dev;
 
 	ret = nvm_register(dev);
 
 	ns->lba_shift = ilog2(dev->sec_size) - 9;
 
+	if (sysfs_create_group(&dev->dev.kobj, attrs))
+		pr_warn("%s: failed to create sysfs group for identification\n",
+			disk_name);
 	return ret;
 }
 
 void nvme_nvm_unregister(struct nvme_ns *ns)
 {
 	nvm_unregister(ns->ndev);
-	kfree(ns->ndev);
 }
 
 /* move to shared place when used in multiple places. */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index e0535c14e538..bfd25dd73bca 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -308,11 +308,21 @@ int nvme_sg_get_version_num(int __user *ip);
 
 #ifdef CONFIG_NVM
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
-int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
+		      const struct attribute_group *attrs);
 void nvme_nvm_unregister(struct nvme_ns *ns);
+
+static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
+{
+	if (dev->type->devnode)
+		return dev_to_disk(dev)->private_data;
+
+	return (container_of(dev, struct nvm_dev, dev))->private_data;
+}
 #else
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
-								int node)
+				    int node,
+				    const struct attribute_group *attrs)
 {
 	return 0;
 }
@@ -323,6 +333,10 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i
 {
 	return 0;
 }
+static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
+{
+	return dev_to_disk(dev)->private_data;
+}
 #endif /* CONFIG_NVM */
 
 int __init nvme_core_init(void);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 5afc2634f332..d190786e4ad8 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -352,7 +352,10 @@ struct nvm_dev {
 
 	/* Backend device */
 	struct request_queue *q;
+	struct device dev;
+	struct device *parent_dev;
 	char name[DISK_NAME_LEN];
+	void *private_data;
 
 	struct mutex mlock;
 	spinlock_t lock;
-- 
cgit v1.2.3


From 491221f88d00651e449c9caf7415b6453c8a77b7 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Thu, 22 Sep 2016 03:10:01 -0400
Subject: block: export bio_free_pages to other modules

bio_free_pages is introduced in commit 1dfa0f68c040
("block: add a helper to free bio bounce buffer pages"),
we can reuse the func in other modules after it was
imported.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                   | 3 ++-
 drivers/md/bcache/btree.c     | 6 +-----
 drivers/md/bcache/debug.c     | 6 ++----
 drivers/md/bcache/movinggc.c  | 5 +----
 drivers/md/bcache/request.c   | 9 ++-------
 drivers/md/bcache/writeback.c | 5 +----
 drivers/md/dm-log-writes.c    | 6 +-----
 drivers/md/raid1.c            | 8 ++------
 include/linux/bio.h           | 1 +
 9 files changed, 13 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index a6d279e1ea9e..db85c5753a76 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
 	return 0;
 }
 
-static void bio_free_pages(struct bio *bio)
+void bio_free_pages(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
@@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio)
 	bio_for_each_segment_all(bvec, bio, i)
 		__free_page(bvec->bv_page);
 }
+EXPORT_SYMBOL(bio_free_pages);
 
 /**
  *	bio_uncopy_user	-	finish previously mapped bio
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 76f7534d1dd1..81d3db40cd7b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl)
 static void btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io);
-	struct bio_vec *bv;
-	int n;
-
-	bio_for_each_segment_all(bv, b->bio, n)
-		__free_page(bv->bv_page);
 
+	bio_free_pages(b->bio);
 	__btree_node_write_done(cl);
 }
 
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c28df164701e..333a1e5f6ae6 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 {
 	char name[BDEVNAME_SIZE];
 	struct bio *check;
-	struct bio_vec bv, *bv2;
+	struct bio_vec bv;
 	struct bvec_iter iter;
-	int i;
 
 	check = bio_clone(bio, GFP_NOIO);
 	if (!check)
@@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 		kunmap_atomic(p1);
 	}
 
-	bio_for_each_segment_all(bv2, check, i)
-		__free_page(bv2->bv_page);
+	bio_free_pages(check);
 out_put:
 	bio_put(check);
 }
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1881319f2298..5c4bddecfaf0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;
-	struct bio_vec *bv;
-	int i;
 
-	bio_for_each_segment_all(bv, bio, i)
-		__free_page(bv->bv_page);
+	bio_free_pages(bio);
 
 	if (io->op.replace_collision)
 		trace_bcache_gc_copy_collision(&io->w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4b177fe11ebb..40ffe5e424b3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl)
 	if (s->iop.replace_collision)
 		bch_mark_cache_miss_collision(s->iop.c, s->d);
 
-	if (s->iop.bio) {
-		int i;
-		struct bio_vec *bv;
-
-		bio_for_each_segment_all(bv, s->iop.bio, i)
-			__free_page(bv->bv_page);
-	}
+	if (s->iop.bio)
+		bio_free_pages(s->iop.bio);
 
 	cached_dev_bio_complete(cl);
 }
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d9fd2a62e5f6..e51644e503a5 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -128,11 +128,8 @@ static void write_dirty_finish(struct closure *cl)
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;
-	struct bio_vec *bv;
-	int i;
 
-	bio_for_each_segment_all(bv, &io->bio, i)
-		__free_page(bv->bv_page);
+	bio_free_pages(&io->bio);
 
 	/* This is kind of a dumb way of signalling errors. */
 	if (KEY_DIRTY(&w->key)) {
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 4ab68033f9d1..b52404159ccf 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc)
 static void log_end_io(struct bio *bio)
 {
 	struct log_writes_c *lc = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
 
 	if (bio->bi_error) {
 		unsigned long flags;
@@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio)
 		spin_unlock_irqrestore(&lc->blocks_lock, flags);
 	}
 
-	bio_for_each_segment_all(bvec, bio, i)
-		__free_page(bvec->bv_page);
-
+	bio_free_pages(bio);
 	put_io_block(lc);
 	bio_put(bio);
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 21dc00eb1989..1961d827dbd1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	return r1_bio;
 
 out_free_pages:
-	while (--j >= 0) {
-		struct bio_vec *bv;
-
-		bio_for_each_segment_all(bv, r1_bio->bios[j], i)
-			__free_page(bv->bv_page);
-	}
+	while (--j >= 0)
+		bio_free_pages(r1_bio->bios[j]);
 
 out_free_bio:
 	while (++j < pi->raid_disks)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e00721a2dce1..97cb48f03dc7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -459,6 +459,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+extern void bio_free_pages(struct bio *bio);
 
 extern struct bio *bio_copy_user_iov(struct request_queue *,
 				     struct rq_map_data *,
-- 
cgit v1.2.3


From 1b792f2f92784c00db2e6431496e437855d6f12a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 21 Sep 2016 10:12:13 -0600
Subject: blk-mq: add flag for drivers wanting blocking ->queue_rq()

If a driver sets BLK_MQ_F_BLOCKING, it is allowed to block in its
->queue_rq() handler. For that case, blk-mq ensures that we always
calls it from a safe context.

Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-by: Josef Bacik <jbacik@fb.com>
---
 block/blk-mq.c         | 2 +-
 include/linux/blk-mq.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 80d483864247..e9ebe9864cc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -908,7 +908,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	    !blk_mq_hw_queue_mapped(hctx)))
 		return;
 
-	if (!async) {
+	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		int cpu = get_cpu();
 		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
 			__blk_mq_run_hw_queue(hctx);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fbcfdf323243..5daa0ef756dd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -155,6 +155,7 @@ enum {
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
+	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
-- 
cgit v1.2.3


From 55679c8d23d191c24ad133abc5647e3054ca8de1 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 23 Sep 2016 09:07:56 -0700
Subject: blkcg: Annotate blkg_hint correctly

Avoid that sparse complains about blkg_hint manipulations.

Fixes: a637120e4902 ("blkcg: use radix tree to index blkgs from blkcg")
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 10648e300c93..cbdbf34de5b6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -45,7 +45,7 @@ struct blkcg {
 	spinlock_t			lock;
 
 	struct radix_tree_root		blkg_tree;
-	struct blkcg_gq			*blkg_hint;
+	struct blkcg_gq	__rcu		*blkg_hint;
 	struct hlist_head		blkg_list;
 
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
-- 
cgit v1.2.3