From ee5242360424b9b967454e9183767323d10cf985 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 21 Feb 2017 10:04:55 +0100
Subject: scsi: zero per-cmd driver data before each I/O

Without this drivers that don't clear the state themselves can see off
effects.  For example Hyper-V VMs using the storvsc driver will often
hang during boot due to uncleared Test Unit Ready failures.

Fixes: e9c787e6 ("scsi: allocate scsi_cmnd structures as part of struct request")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Dexuan Cui <decui@microsoft.com>
Tested-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/scsi_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 912fbc3b4543..3e32dc954c3c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1167,7 +1167,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 
 	/* zero out the cmd, except for the embedded scsi_request */
 	memset((char *)cmd + sizeof(cmd->req), 0,
-		sizeof(*cmd) - sizeof(cmd->req));
+		sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
 
 	cmd->device = dev;
 	cmd->sense_buffer = buf;
-- 
cgit v1.2.3


From 9442b739207aab6b1053abf858a238e7642fbcd1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Feb 2017 17:10:22 -0500
Subject: nbd: cleanup ioctl handling

Break the ioctl handling out into helper functions, some of these things
are getting pretty big and unwieldy.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 269 ++++++++++++++++++++++++++--------------------------
 1 file changed, 132 insertions(+), 137 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0be84a3cb6d7..36b559605e18 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -96,6 +96,10 @@ static int max_part;
 static struct workqueue_struct *recv_workqueue;
 static int part_shift;
 
+static int nbd_dev_dbg_init(struct nbd_device *nbd);
+static void nbd_dev_dbg_close(struct nbd_device *nbd);
+
+
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
 	return disk_to_dev(nbd->disk);
@@ -571,10 +575,17 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
+static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
+			  unsigned long arg)
 {
+	struct socket *sock;
 	struct nbd_sock **socks;
 	struct nbd_sock *nsock;
+	int err;
+
+	sock = sockfd_lookup(arg, &err);
+	if (!sock)
+		return err;
 
 	if (!nbd->task_setup)
 		nbd->task_setup = current;
@@ -598,26 +609,20 @@ static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
 	nsock->sock = sock;
 	socks[nbd->num_connections++] = nsock;
 
+	if (max_part)
+		bdev->bd_invalidated = 1;
 	return 0;
 }
 
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
-	int i;
-
-	for (i = 0; i < nbd->num_connections; i++)
-		kfree(nbd->socks[i]);
-	kfree(nbd->socks);
-	nbd->socks = NULL;
 	nbd->runtime_flags = 0;
 	nbd->blksize = 1024;
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
 	nbd->flags = 0;
 	nbd->tag_set.timeout = 0;
-	nbd->num_connections = 0;
-	nbd->task_setup = NULL;
 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
 
@@ -659,74 +664,135 @@ static void send_disconnects(struct nbd_device *nbd)
 	}
 }
 
-static int nbd_dev_dbg_init(struct nbd_device *nbd);
-static void nbd_dev_dbg_close(struct nbd_device *nbd);
+static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
+{
+	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+	if (!nbd->socks)
+		return -EINVAL;
 
-/* Must be called with config_lock held */
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
-		       unsigned int cmd, unsigned long arg)
+	mutex_unlock(&nbd->config_lock);
+	fsync_bdev(bdev);
+	mutex_lock(&nbd->config_lock);
+
+	/* Check again after getting mutex back.  */
+	if (!nbd->socks)
+		return -EINVAL;
+
+	if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
+			      &nbd->runtime_flags))
+		send_disconnects(nbd);
+	return 0;
+}
+
+static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
 {
-	switch (cmd) {
-	case NBD_DISCONNECT: {
-		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-		if (!nbd->socks)
-			return -EINVAL;
-
-		mutex_unlock(&nbd->config_lock);
-		fsync_bdev(bdev);
-		mutex_lock(&nbd->config_lock);
-
-		/* Check again after getting mutex back.  */
-		if (!nbd->socks)
-			return -EINVAL;
-
-		if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
-				      &nbd->runtime_flags))
-			send_disconnects(nbd);
-		return 0;
+	sock_shutdown(nbd);
+	nbd_clear_que(nbd);
+	kill_bdev(bdev);
+	nbd_bdev_reset(bdev);
+	/*
+	 * We want to give the run thread a chance to wait for everybody
+	 * to clean up and then do it's own cleanup.
+	 */
+	if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) &&
+	    nbd->num_connections) {
+		int i;
+
+		for (i = 0; i < nbd->num_connections; i++)
+			kfree(nbd->socks[i]);
+		kfree(nbd->socks);
+		nbd->socks = NULL;
+		nbd->num_connections = 0;
 	}
+	nbd->task_setup = NULL;
 
-	case NBD_CLEAR_SOCK:
-		sock_shutdown(nbd);
-		nbd_clear_que(nbd);
-		kill_bdev(bdev);
-		nbd_bdev_reset(bdev);
-		/*
-		 * We want to give the run thread a chance to wait for everybody
-		 * to clean up and then do it's own cleanup.
-		 */
-		if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
-			int i;
-
-			for (i = 0; i < nbd->num_connections; i++)
-				kfree(nbd->socks[i]);
-			kfree(nbd->socks);
-			nbd->socks = NULL;
-			nbd->num_connections = 0;
-			nbd->task_setup = NULL;
-		}
-		return 0;
+	return 0;
+}
 
-	case NBD_SET_SOCK: {
-		int err;
-		struct socket *sock = sockfd_lookup(arg, &err);
+static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
+{
+	struct recv_thread_args *args;
+	int num_connections = nbd->num_connections;
+	int error = 0, i;
+
+	if (nbd->task_recv)
+		return -EBUSY;
+	if (!nbd->socks)
+		return -EINVAL;
+	if (num_connections > 1 &&
+	    !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
+		error = -EINVAL;
+		goto out_err;
+	}
 
-		if (!sock)
-			return err;
+	set_bit(NBD_RUNNING, &nbd->runtime_flags);
+	blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
+	args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
+	if (!args) {
+		error = -ENOMEM;
+		goto out_err;
+	}
+	nbd->task_recv = current;
+	mutex_unlock(&nbd->config_lock);
 
-		err = nbd_add_socket(nbd, sock);
-		if (!err && max_part)
-			bdev->bd_invalidated = 1;
+	nbd_parse_flags(nbd, bdev);
 
-		return err;
+	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+	if (error) {
+		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+		goto out_recv;
 	}
 
-	case NBD_SET_BLKSIZE: {
-		loff_t bsize = div_s64(nbd->bytesize, arg);
+	nbd_size_update(nbd, bdev);
 
-		return nbd_size_set(nbd, bdev, arg, bsize);
+	nbd_dev_dbg_init(nbd);
+	for (i = 0; i < num_connections; i++) {
+		sk_set_memalloc(nbd->socks[i]->sock->sk);
+		atomic_inc(&nbd->recv_threads);
+		INIT_WORK(&args[i].work, recv_work);
+		args[i].nbd = nbd;
+		args[i].index = i;
+		queue_work(recv_workqueue, &args[i].work);
 	}
+	wait_event_interruptible(nbd->recv_wq,
+				 atomic_read(&nbd->recv_threads) == 0);
+	for (i = 0; i < num_connections; i++)
+		flush_work(&args[i].work);
+	nbd_dev_dbg_close(nbd);
+	nbd_size_clear(nbd, bdev);
+	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+out_recv:
+	mutex_lock(&nbd->config_lock);
+	nbd->task_recv = NULL;
+out_err:
+	clear_bit(NBD_RUNNING, &nbd->runtime_flags);
+	nbd_clear_sock(nbd, bdev);
+
+	/* user requested, ignore socket errors */
+	if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+		error = 0;
+	if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
+		error = -ETIMEDOUT;
+
+	nbd_reset(nbd);
+	return error;
+}
 
+/* Must be called with config_lock held */
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
+		       unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case NBD_DISCONNECT:
+		return nbd_disconnect(nbd, bdev);
+	case NBD_CLEAR_SOCK:
+		return nbd_clear_sock(nbd, bdev);
+	case NBD_SET_SOCK:
+		return nbd_add_socket(nbd, bdev, arg);
+	case NBD_SET_BLKSIZE:
+		return nbd_size_set(nbd, bdev, arg,
+				    div_s64(nbd->bytesize, arg));
 	case NBD_SET_SIZE:
 		return nbd_size_set(nbd, bdev, nbd->blksize,
 					div_s64(arg, nbd->blksize));
@@ -741,85 +807,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 	case NBD_SET_FLAGS:
 		nbd->flags = arg;
 		return 0;
-
-	case NBD_DO_IT: {
-		struct recv_thread_args *args;
-		int num_connections = nbd->num_connections;
-		int error = 0, i;
-
-		if (nbd->task_recv)
-			return -EBUSY;
-		if (!nbd->socks)
-			return -EINVAL;
-		if (num_connections > 1 &&
-		    !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
-			dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
-			error = -EINVAL;
-			goto out_err;
-		}
-
-		set_bit(NBD_RUNNING, &nbd->runtime_flags);
-		blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
-		args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
-		if (!args) {
-			error = -ENOMEM;
-			goto out_err;
-		}
-		nbd->task_recv = current;
-		mutex_unlock(&nbd->config_lock);
-
-		nbd_parse_flags(nbd, bdev);
-
-		error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
-		if (error) {
-			dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-			goto out_recv;
-		}
-
-		nbd_size_update(nbd, bdev);
-
-		nbd_dev_dbg_init(nbd);
-		for (i = 0; i < num_connections; i++) {
-			sk_set_memalloc(nbd->socks[i]->sock->sk);
-			atomic_inc(&nbd->recv_threads);
-			INIT_WORK(&args[i].work, recv_work);
-			args[i].nbd = nbd;
-			args[i].index = i;
-			queue_work(recv_workqueue, &args[i].work);
-		}
-		wait_event_interruptible(nbd->recv_wq,
-					 atomic_read(&nbd->recv_threads) == 0);
-		for (i = 0; i < num_connections; i++)
-			flush_work(&args[i].work);
-		nbd_dev_dbg_close(nbd);
-		nbd_size_clear(nbd, bdev);
-		device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-out_recv:
-		mutex_lock(&nbd->config_lock);
-		nbd->task_recv = NULL;
-out_err:
-		sock_shutdown(nbd);
-		nbd_clear_que(nbd);
-		kill_bdev(bdev);
-		nbd_bdev_reset(bdev);
-
-		/* user requested, ignore socket errors */
-		if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
-			error = 0;
-		if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
-			error = -ETIMEDOUT;
-
-		nbd_reset(nbd);
-		return error;
-	}
-
+	case NBD_DO_IT:
+		return nbd_start_device(nbd, bdev);
 	case NBD_CLEAR_QUE:
 		/*
 		 * This is for compatibility only.  The queue is always cleared
 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
 		 */
 		return 0;
-
 	case NBD_PRINT_DEBUG:
 		/*
 		 * For compatibility only, we no longer keep a list of
-- 
cgit v1.2.3


From e544541b0765c341174613b416d4b074fa7571c2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 13 Feb 2017 10:39:47 -0500
Subject: nbd: set the logical and physical blocksize properly

We noticed when trying to do O_DIRECT to an export on the server side
that we were getting requests smaller than the 4k sectorsize of the
device.  This is because the client isn't setting the logical and
physical blocksizes properly for the underlying device.  Fix this up by
setting the queue blocksizes and then calling bd_set_size.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 36b559605e18..cdbeb309a105 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -124,7 +124,7 @@ static const char *nbdcmd_to_ascii(int cmd)
 
 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 {
-	bdev->bd_inode->i_size = 0;
+	bd_set_size(bdev, 0);
 	set_capacity(nbd->disk, 0);
 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 
@@ -133,29 +133,20 @@ static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 
 static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
 {
-	if (!nbd_is_connected(nbd))
-		return;
-
-	bdev->bd_inode->i_size = nbd->bytesize;
+	blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize);
+	blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize);
+	bd_set_size(bdev, nbd->bytesize);
 	set_capacity(nbd->disk, nbd->bytesize >> 9);
 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 }
 
-static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
+static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
 			loff_t blocksize, loff_t nr_blocks)
 {
-	int ret;
-
-	ret = set_blocksize(bdev, blocksize);
-	if (ret)
-		return ret;
-
 	nbd->blksize = blocksize;
 	nbd->bytesize = blocksize * nr_blocks;
-
-	nbd_size_update(nbd, bdev);
-
-	return 0;
+	if (nbd_is_connected(nbd))
+		nbd_size_update(nbd, bdev);
 }
 
 static void nbd_end_request(struct nbd_cmd *cmd)
@@ -791,15 +782,16 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 	case NBD_SET_SOCK:
 		return nbd_add_socket(nbd, bdev, arg);
 	case NBD_SET_BLKSIZE:
-		return nbd_size_set(nbd, bdev, arg,
-				    div_s64(nbd->bytesize, arg));
+		nbd_size_set(nbd, bdev, arg,
+			     div_s64(nbd->bytesize, arg));
+		return 0;
 	case NBD_SET_SIZE:
-		return nbd_size_set(nbd, bdev, nbd->blksize,
-					div_s64(arg, nbd->blksize));
-
+		nbd_size_set(nbd, bdev, nbd->blksize,
+			     div_s64(arg, nbd->blksize));
+		return 0;
 	case NBD_SET_SIZE_BLOCKS:
-		return nbd_size_set(nbd, bdev, nbd->blksize, arg);
-
+		nbd_size_set(nbd, bdev, nbd->blksize, arg);
+		return 0;
 	case NBD_SET_TIMEOUT:
 		nbd->tag_set.timeout = arg * HZ;
 		return 0;
-- 
cgit v1.2.3


From 6330a2d0b465527d621a9d95cad6b2fc0a959f13 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 15 Feb 2017 16:49:48 -0500
Subject: nbd: cleanup workqueue on error properly

If we fail to register the blockdev we need to make sure to destroy the
recv workqueue.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cdbeb309a105..0bf2b21a62cb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1121,8 +1121,10 @@ static int __init nbd_init(void)
 	if (!recv_workqueue)
 		return -ENOMEM;
 
-	if (register_blkdev(NBD_MAJOR, "nbd"))
+	if (register_blkdev(NBD_MAJOR, "nbd")) {
+		destroy_workqueue(recv_workqueue);
 		return -EIO;
+	}
 
 	nbd_dbg_init();
 
-- 
cgit v1.2.3


From 4b8c861a7c79806fb9ee564c87f517dc26fc2d1e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 21 Feb 2017 18:09:46 +0100
Subject: block: Move bdev_unhash_inode() after invalidate_partition()

Move bdev_unhash_inode() after invalidate_partition() as
invalidate_partition() looks up bdev and it cannot find the right bdev
inode after bdev_unhash_inode() is called. Thus invalidate_partition()
would not invalidate page cache of the previously used bdev. Also use
part_devt() when calling bdev_unhash_inode() instead of manually
creating the device number.

Tested-by: Lekshmi Pillai <lekshmicpillai@in.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 3631cd480295..15b3c18d8337 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -669,9 +669,8 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_init(&piter, disk,
 			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
 	while ((part = disk_part_iter_next(&piter))) {
-		bdev_unhash_inode(MKDEV(disk->major,
-					disk->first_minor + part->partno));
 		invalidate_partition(disk, part->partno);
+		bdev_unhash_inode(part_devt(part));
 		delete_partition(disk, part->partno);
 	}
 	disk_part_iter_exit(&piter);
-- 
cgit v1.2.3


From d06e05c026ab58121dc6ab37c11f855ff642531e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 21 Feb 2017 18:09:47 +0100
Subject: block: Unhash also block device inode for the whole device

Iteration over partitions in del_gendisk() omits part0. Add
bdev_unhash_inode() call for the whole device. Otherwise if the device
number gets reused, bdev inode will be still associated with the old
(stale) bdi.

Tested-by: Lekshmi Pillai <lekshmicpillai@in.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/genhd.c b/block/genhd.c
index 15b3c18d8337..2f444b87a5f2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -676,6 +676,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
+	bdev_unhash_inode(disk_devt(disk));
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 
-- 
cgit v1.2.3


From cccd9fb9ec960e343b2f354c4613e49f5a1d8371 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 21 Feb 2017 18:09:48 +0100
Subject: block: Revalidate i_bdev reference in bd_aquire()

When a device gets removed, block device inode unhashed so that it is not
used anymore (bdget() will not find it anymore). Later when a new device
gets created with the same device number, we create new block device
inode. However there may be file system device inodes whose i_bdev still
points to the original block device inode and thus we get two active
block device inodes for the same device. They will share the same
gendisk so the only visible differences will be that page caches will
not be coherent and BDIs will be different (the old block device inode
still points to unregistered BDI).

Fix the problem by checking in bd_acquire() whether i_bdev still points
to active block device inode and re-lookup the block device if not. That
way any open of a block device happening after the old device has been
removed will get correct block device inode.

Tested-by: Lekshmi Pillai <lekshmicpillai@in.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 73031ec54a7b..1c62845a72c7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1043,13 +1043,22 @@ static struct block_device *bd_acquire(struct inode *inode)
 
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
-	if (bdev) {
+	if (bdev && !inode_unhashed(bdev->bd_inode)) {
 		bdgrab(bdev);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
 	spin_unlock(&bdev_lock);
 
+	/*
+	 * i_bdev references block device inode that was already shut down
+	 * (corresponding device got removed).  Remove the reference and look
+	 * up block device inode again just in case new device got
+	 * reestablished under the same device number.
+	 */
+	if (bdev)
+		bd_forget(inode);
+
 	bdev = bdget(inode->i_rdev);
 	if (bdev) {
 		spin_lock(&bdev_lock);
-- 
cgit v1.2.3


From bd1599d931ca735c1081f11aa4d49006350709f1 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Feb 2017 10:03:50 -0800
Subject: scsi_transport_sas: fix BSG ioctl memory corruption

The end_device and sas_host devices support BSG ioctls, but the
request_queue allocated for them isn't set up to allocate the struct
scsi_request payload. This leads to memory corruption in the call to
scsi_req_init() in bsg_map_hdr(), since it will memset past the end of
the allocated request. Fix it by setting ->cmd_size on the allocated
request_queue.

Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/scsi_transport_sas.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 126a5ee00987..f94535130a34 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -227,27 +227,31 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 		return 0;
 	}
 
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+	q->cmd_size = sizeof(struct scsi_request);
+
 	if (rphy) {
-		q = blk_init_queue(sas_non_host_smp_request, NULL);
+		q->request_fn = sas_non_host_smp_request;
 		dev = &rphy->dev;
 		name = dev_name(dev);
 		release = NULL;
 	} else {
-		q = blk_init_queue(sas_host_smp_request, NULL);
+		q->request_fn = sas_host_smp_request;
 		dev = &shost->shost_gendev;
 		snprintf(namebuf, sizeof(namebuf),
 			 "sas_host%d", shost->host_no);
 		name = namebuf;
 		release = sas_host_release;
 	}
-	if (!q)
-		return -ENOMEM;
+	error = blk_init_allocated_queue(q);
+	if (error)
+		goto out_cleanup_queue;
 
 	error = bsg_register_queue(q, dev, name, release);
-	if (error) {
-		blk_cleanup_queue(q);
-		return -ENOMEM;
-	}
+	if (error)
+		goto out_cleanup_queue;
 
 	if (rphy)
 		rphy->q = q;
@@ -261,6 +265,10 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
 	queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
 	return 0;
+
+out_cleanup_queue:
+	blk_cleanup_queue(q);
+	return error;
 }
 
 static void sas_bsg_remove(struct Scsi_Host *shost, struct sas_rphy *rphy)
-- 
cgit v1.2.3


From aedb6e2411baf242fd0faf95bd3ff0dc62ee1fa5 Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Tue, 21 Feb 2017 11:59:13 -0700
Subject: block/sed: Use ssize_t on atom parsers to return errors

The short atom parser can return an errno from decoding but does not
currently return the error as a signed value. Convert all of the parsers
to ssize_t.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index d1c52ba4d62d..4675fd8eed7b 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -706,8 +706,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp,
 	return tok->pos[0];
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
-				  const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+				   const u8 *pos)
 {
 	tok->pos = pos;
 	tok->len = 1;
@@ -723,8 +723,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
 	return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-				   const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+				    const u8 *pos)
 {
 	tok->pos = pos;
 	tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -736,7 +736,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok,
 		tok->type = OPAL_DTA_TOKENID_SINT;
 	} else {
 		u64 u_integer = 0;
-		int i, b = 0;
+		ssize_t i, b = 0;
 
 		tok->type = OPAL_DTA_TOKENID_UINT;
 		if (tok->len > 9) {
@@ -753,8 +753,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok,
 	return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-				    const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+				     const u8 *pos)
 {
 	tok->pos = pos;
 	tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -770,8 +770,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok,
 	return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
-				  const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+				   const u8 *pos)
 {
 	tok->pos = pos;
 	tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -787,8 +787,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
 	return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-				   const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+				    const u8 *pos)
 {
 	tok->pos = pos;
 	tok->len = 1;
@@ -805,7 +805,7 @@ static int response_parse(const u8 *buf, size_t length,
 	struct opal_resp_tok *iter;
 	int num_entries = 0;
 	int total;
-	size_t token_length;
+	ssize_t token_length;
 	const u8 *pos;
 
 	if (!buf)
@@ -851,8 +851,8 @@ static int response_parse(const u8 *buf, size_t length,
 		else /* TOKEN */
 			token_length = response_parse_token(iter, pos);
 
-		if (token_length == -EINVAL)
-			return -EINVAL;
+		if (token_length < 0)
+			return token_length;
 
 		pos += token_length;
 		total -= token_length;
-- 
cgit v1.2.3


From cccb92417d447172c4653876f6cd4b04c29d7905 Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Tue, 21 Feb 2017 11:59:14 -0700
Subject: block/sed: Add helper to qualify response tokens

Add helper which verifies the response token is valid and matches the
expected value. Merges token_type and response_get_token.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 61 +++++++++++++++++++++++---------------------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 4675fd8eed7b..d3d6db2877b9 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -662,48 +662,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
 	return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-					   int n)
+static const struct opal_resp_tok *response_get_token(
+				const struct parsed_resp *resp,
+				int n)
 {
 	const struct opal_resp_tok *tok;
 
 	if (n >= resp->num) {
 		pr_err("Token number doesn't exist: %d, resp: %d\n",
 		       n, resp->num);
-		return OPAL_DTA_TOKENID_INVALID;
+		return ERR_PTR(-EINVAL);
 	}
 
 	tok = &resp->toks[n];
 	if (tok->len == 0) {
 		pr_err("Token length must be non-zero\n");
-		return OPAL_DTA_TOKENID_INVALID;
+		return ERR_PTR(-EINVAL);
 	}
 
-	return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
-					  int n)
-{
-	const struct opal_resp_tok *tok;
-
-	if (n >= resp->num) {
-		pr_err("Token number doesn't exist: %d, resp: %d\n",
-		       n, resp->num);
-		return 0;
-	}
-
-	tok = &resp->toks[n];
-	if (tok->len == 0) {
-		pr_err("Token length must be non-zero\n");
-		return 0;
-	}
-
-	return tok->pos[0];
+	return tok;
 }
 
 static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
@@ -922,20 +899,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n)
 	return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+	if (IS_ERR(token) ||
+	    token->type != OPAL_DTA_TOKENID_TOKEN ||
+	    token->pos[0] != match)
+		return false;
+	return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-	if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-	    response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+	const struct opal_resp_tok *tok;
+
+	tok = response_get_token(resp, 0);
+	if (response_token_matches(tok, OPAL_ENDOFSESSION))
 		return 0;
-	}
 
 	if (resp->num < 5)
 		return DTAERROR_NO_METHOD_STATUS;
 
-	if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-	    token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-	    response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-	    response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+	tok = response_get_token(resp, resp->num - 5);
+	if (!response_token_matches(tok, OPAL_STARTLIST))
+		return DTAERROR_NO_METHOD_STATUS;
+
+	tok = response_get_token(resp, resp->num - 1);
+	if (!response_token_matches(tok, OPAL_ENDLIST))
 		return DTAERROR_NO_METHOD_STATUS;
 
 	return response_get_u64(resp, resp->num - 4);
-- 
cgit v1.2.3


From 77039b96316d119cea23f48b617103d50e78b840 Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Tue, 21 Feb 2017 11:59:15 -0700
Subject: block/sed: Check received header lengths

Add a buffer size check against discovery and response header lengths
before we loop over their buffers.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index d3d6db2877b9..4fc4d7b441d5 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -411,10 +411,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
 	const struct d0_header *hdr = (struct d0_header *)dev->resp;
 	const u8 *epos = dev->resp, *cpos = dev->resp;
 	u16 comid = 0;
+	u32 hlen = be32_to_cpu(hdr->length);
 
-	print_buffer(dev->resp, be32_to_cpu(hdr->length));
+	print_buffer(dev->resp, hlen);
 
-	epos += be32_to_cpu(hdr->length); /* end of buffer */
+	if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+		pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+			sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+		return -EFAULT;
+	}
+
+	epos += hlen; /* end of buffer */
 	cpos += sizeof(*hdr); /* current position on buffer */
 
 	while (cpos < epos && supported) {
@@ -784,6 +791,7 @@ static int response_parse(const u8 *buf, size_t length,
 	int total;
 	ssize_t token_length;
 	const u8 *pos;
+	u32 clen, plen, slen;
 
 	if (!buf)
 		return -EFAULT;
@@ -795,17 +803,16 @@ static int response_parse(const u8 *buf, size_t length,
 	pos = buf;
 	pos += sizeof(*hdr);
 
-	pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-		 be32_to_cpu(hdr->cp.length),
-		 be32_to_cpu(hdr->pkt.length),
-		 be32_to_cpu(hdr->subpkt.length));
-
-	if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-	    hdr->subpkt.length == 0) {
-		pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-		       be32_to_cpu(hdr->cp.length),
-		       be32_to_cpu(hdr->pkt.length),
-		       be32_to_cpu(hdr->subpkt.length));
+	clen = be32_to_cpu(hdr->cp.length);
+	plen = be32_to_cpu(hdr->pkt.length);
+	slen = be32_to_cpu(hdr->subpkt.length);
+	pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+		 clen, plen, slen);
+
+	if (clen == 0 || plen == 0 || slen == 0 ||
+	    slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+		pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+		       clen, plen, slen);
 		print_buffer(pos, sizeof(*hdr));
 		return -EINVAL;
 	}
@@ -814,7 +821,7 @@ static int response_parse(const u8 *buf, size_t length,
 		return -EFAULT;
 
 	iter = resp->toks;
-	total = be32_to_cpu(hdr->subpkt.length);
+	total = slen;
 	print_buffer(pos, total);
 	while (total > 0) {
 		if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
-- 
cgit v1.2.3


From f0889638456b2da341f35f3c841adfe4bdb88e5c Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Wed, 22 Feb 2017 17:12:36 +1100
Subject: cciss: Fix checkpatch TRAILING_WHITESPACE

Checkpatch emits 85 trailing whitespace warnings.

Remove trailing whitespace.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Acked-by: Don Brace <don.brace@microsemi.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/cciss_scsi.c | 170 ++++++++++++++++++++++-----------------------
 1 file changed, 85 insertions(+), 85 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index a18de9d727b0..2b8cfd4c81ce 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -17,15 +17,15 @@
  *    02111-1307, USA.
  *
  *    Questions/Comments/Bugfixes to iss_storagedev@hp.com
- *    
+ *
  *    Author: Stephen M. Cameron
  */
 #ifdef CONFIG_CISS_SCSI_TAPE
 
-/* Here we have code to present the driver as a scsi driver 
-   as it is simultaneously presented as a block driver.  The 
+/* Here we have code to present the driver as a scsi driver
+   as it is simultaneously presented as a block driver.  The
    reason for doing this is to allow access to SCSI tape drives
-   through the array controller.  Note in particular, neither 
+   through the array controller.  Note in particular, neither
    physical nor logical disks are presented through the scsi layer. */
 
 #include <linux/timer.h>
@@ -37,7 +37,7 @@
 
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
-#include <scsi/scsi_host.h> 
+#include <scsi/scsi_host.h>
 
 #include "cciss_scsi.h"
 
@@ -120,7 +120,7 @@ struct cciss_scsi_adapter_data_t {
 	struct cciss_scsi_cmd_stack_t cmd_stack;
 	SGDescriptor_struct **cmd_sg_list;
 	int registered;
-	spinlock_t lock; // to protect ccissscsi[ctlr]; 
+	spinlock_t lock; // to protect ccissscsi[ctlr];
 };
 
 #define CPQ_TAPE_LOCK(h, flags) spin_lock_irqsave( \
@@ -143,36 +143,36 @@ scsi_cmd_alloc(ctlr_info_t *h)
 	u64bit temp64;
 
 	sa = h->scsi_ctlr;
-	stk = &sa->cmd_stack; 
+	stk = &sa->cmd_stack;
 
-	if (stk->top < 0) 
+	if (stk->top < 0)
 		return NULL;
-	c = stk->elem[stk->top]; 	
+	c = stk->elem[stk->top];
 	/* memset(c, 0, sizeof(*c)); */
 	memset(&c->cmd, 0, sizeof(c->cmd));
 	memset(&c->Err, 0, sizeof(c->Err));
 	/* set physical addr of cmd and addr of scsi parameters */
-	c->cmd.busaddr = c->busaddr; 
+	c->cmd.busaddr = c->busaddr;
 	c->cmd.cmdindex = c->cmdindex;
-	/* (__u32) (stk->cmd_pool_handle + 
+	/* (__u32) (stk->cmd_pool_handle +
 		(sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */
 
 	temp64.val = (__u64) (c->busaddr + sizeof(CommandList_struct));
-	/* (__u64) (stk->cmd_pool_handle + 
+	/* (__u64) (stk->cmd_pool_handle +
 		(sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top) +
 		 sizeof(CommandList_struct)); */
 	stk->top--;
 	c->cmd.ErrDesc.Addr.lower = temp64.val32.lower;
 	c->cmd.ErrDesc.Addr.upper = temp64.val32.upper;
 	c->cmd.ErrDesc.Len = sizeof(ErrorInfo_struct);
-	
+
 	c->cmd.ctlr = h->ctlr;
 	c->cmd.err_info = &c->Err;
 
 	return (CommandList_struct *) c;
 }
 
-static void 
+static void
 scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
 {
 	/* assume only one process in here at a time, locking done by caller. */
@@ -183,7 +183,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
 	struct cciss_scsi_cmd_stack_t *stk;
 
 	sa = h->scsi_ctlr;
-	stk = &sa->cmd_stack; 
+	stk = &sa->cmd_stack;
 	stk->top++;
 	if (stk->top >= stk->nelems) {
 		dev_err(&h->pdev->dev,
@@ -228,7 +228,7 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
 	}
 	for (i = 0; i < stk->nelems; i++) {
 		stk->elem[i] = &stk->pool[i];
-		stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 
+		stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
 			(sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
 		stk->elem[i]->cmdindex = i;
 	}
@@ -244,7 +244,7 @@ scsi_cmd_stack_free(ctlr_info_t *h)
 	size_t size;
 
 	sa = h->scsi_ctlr;
-	stk = &sa->cmd_stack; 
+	stk = &sa->cmd_stack;
 	if (stk->top != stk->nelems-1) {
 		dev_warn(&h->pdev->dev,
 			"bug: %d scsi commands are still outstanding.\n",
@@ -266,7 +266,7 @@ print_cmd(CommandList_struct *cp)
 	printk("queue:%d\n", cp->Header.ReplyQueue);
 	printk("sglist:%d\n", cp->Header.SGList);
 	printk("sgtot:%d\n", cp->Header.SGTotal);
-	printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper, 
+	printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper,
 			cp->Header.Tag.lower);
 	printk("LUN:0x%8phN\n", cp->Header.LUN.LunAddrBytes);
 	printk("CDBLen:%d\n", cp->Request.CDBLen);
@@ -275,8 +275,8 @@ print_cmd(CommandList_struct *cp)
 	printk(" Dir:%d\n",cp->Request.Type.Direction);
 	printk("Timeout:%d\n",cp->Request.Timeout);
 	printk("CDB: %16ph\n", cp->Request.CDB);
-	printk("edesc.Addr: 0x%08x/0%08x, Len  = %d\n", 
-		cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower, 
+	printk("edesc.Addr: 0x%08x/0%08x, Len  = %d\n",
+		cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower,
 			cp->ErrDesc.Len);
 	printk("sgs..........Errorinfo:\n");
 	printk("scsistatus:%d\n", cp->err_info->ScsiStatus);
@@ -289,7 +289,7 @@ print_cmd(CommandList_struct *cp)
 }
 #endif
 
-static int 
+static int
 find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
 {
 	/* finds an unused bus, target, lun for a new device */
@@ -299,24 +299,24 @@ find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
 
 	memset(&target_taken[0], 0, CCISS_MAX_SCSI_DEVS_PER_HBA);
 
-	target_taken[SELF_SCSI_ID] = 1;	
+	target_taken[SELF_SCSI_ID] = 1;
 	for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++)
 		target_taken[ccissscsi[h->ctlr].dev[i].target] = 1;
-	
+
 	for (i = 0; i < CCISS_MAX_SCSI_DEVS_PER_HBA; i++) {
 		if (!target_taken[i]) {
 			*bus = 0; *target=i; *lun = 0; found=1;
 			break;
 		}
 	}
-	return (!found);	
+	return (!found);
 }
 struct scsi2map {
 	char scsi3addr[8];
 	int bus, target, lun;
 };
 
-static int 
+static int
 cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
 		struct cciss_scsi_dev_t *device,
 		struct scsi2map *added, int *nadded)
@@ -381,8 +381,8 @@ cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
 
 	ccissscsi[h->ctlr].ndevices++;
 
-	/* initially, (before registering with scsi layer) we don't 
-	   know our hostno and we don't want to print anything first 
+	/* initially, (before registering with scsi layer) we don't
+	   know our hostno and we don't want to print anything first
 	   time anyway (the scsi layer's inquiries will show that info) */
 	if (hostno != -1)
 		dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d added.\n",
@@ -467,7 +467,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
 	/* sd contains scsi3 addresses and devtypes, but
 	   bus target and lun are not filled in.  This funciton
 	   takes what's in sd to be the current and adjusts
-	   ccissscsi[] to be in line with what's in sd. */ 
+	   ccissscsi[] to be in line with what's in sd. */
 
 	int i,j, found, changes=0;
 	struct cciss_scsi_dev_t *csd;
@@ -492,7 +492,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
 	if (hostno != -1)  /* if it's not the first time... */
 		sh = h->scsi_ctlr->scsi_host;
 
-	/* find any devices in ccissscsi[] that are not in 
+	/* find any devices in ccissscsi[] that are not in
 	   sd[] and remove them from ccissscsi[] */
 
 	i = 0;
@@ -512,7 +512,7 @@ adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
 			}
 		}
 
-		if (found == 0) { /* device no longer present. */ 
+		if (found == 0) { /* device no longer present. */
 			changes++;
 			cciss_scsi_remove_entry(h, hostno, i,
 				removed, &nremoved);
@@ -641,14 +641,14 @@ lookup_scsi3addr(ctlr_info_t *h, int bus, int target, int lun, char *scsi3addr)
 	return -1;
 }
 
-static void 
+static void
 cciss_scsi_setup(ctlr_info_t *h)
 {
 	struct cciss_scsi_adapter_data_t * shba;
 
 	ccissscsi[h->ctlr].ndevices = 0;
 	shba = (struct cciss_scsi_adapter_data_t *)
-		kmalloc(sizeof(*shba), GFP_KERNEL);	
+		kmalloc(sizeof(*shba), GFP_KERNEL);
 	if (shba == NULL)
 		return;
 	shba->scsi_host = NULL;
@@ -693,20 +693,20 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 
 	/* copy the sense data whether we need to or not. */
 
-	memcpy(cmd->sense_buffer, ei->SenseInfo, 
+	memcpy(cmd->sense_buffer, ei->SenseInfo,
 		ei->SenseLen > SCSI_SENSE_BUFFERSIZE ?
-			SCSI_SENSE_BUFFERSIZE : 
+			SCSI_SENSE_BUFFERSIZE :
 			ei->SenseLen);
 	scsi_set_resid(cmd, ei->ResidualCnt);
 
-	if(ei->CommandStatus != 0) 
-	{ /* an error has occurred */ 
+	if(ei->CommandStatus != 0)
+	{ /* an error has occurred */
 		switch(ei->CommandStatus)
 		{
 			case CMD_TARGET_STATUS:
 				/* Pass it up to the upper layers... */
 				if (!ei->ScsiStatus) {
-					
+
 	/* Ordinarily, this case should never happen, but there is a bug
 	   in some released firmware revisions that allows it to happen
 	   if, for example, a 4100 backplane loses power and the tape
@@ -731,7 +731,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 				print_cmd(c);
 				 */
      /* We get CMD_INVALID if you address a non-existent tape drive instead
-	of a selection timeout (no response).  You will see this if you yank 
+	of a selection timeout (no response).  You will see this if you yank
 	out a tape drive, then try to access it. This is kind of a shame
 	because it means that any other CMD_INVALID (e.g. driver bug) will
 	get interpreted as a missing target. */
@@ -780,7 +780,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 				cmd->result = DID_ERROR << 16;
 				dev_warn(&h->pdev->dev,
 					"%p returned unknown status %x\n", c,
-						ei->CommandStatus); 
+						ei->CommandStatus);
 		}
 	}
 	cmd->scsi_done(cmd);
@@ -796,15 +796,15 @@ cciss_scsi_detect(ctlr_info_t *h)
 	sh = scsi_host_alloc(&cciss_driver_template, sizeof(struct ctlr_info *));
 	if (sh == NULL)
 		goto fail;
-	sh->io_port = 0;	// good enough?  FIXME, 
+	sh->io_port = 0;	// good enough?  FIXME,
 	sh->n_io_port = 0;	// I don't think we use these two...
-	sh->this_id = SELF_SCSI_ID;  
+	sh->this_id = SELF_SCSI_ID;
 	sh->can_queue = cciss_tape_cmds;
 	sh->sg_tablesize = h->maxsgentries;
 	sh->max_cmd_len = MAX_COMMAND_SIZE;
 	sh->max_sectors = h->cciss_max_sectors;
 
-	((struct cciss_scsi_adapter_data_t *) 
+	((struct cciss_scsi_adapter_data_t *)
 		h->scsi_ctlr)->scsi_host = sh;
 	sh->hostdata[0] = (unsigned long) h;
 	sh->irq = h->intr[SIMPLE_MODE_INT];
@@ -856,7 +856,7 @@ cciss_map_one(struct pci_dev *pdev,
 static int
 cciss_scsi_do_simple_cmd(ctlr_info_t *h,
 			CommandList_struct *c,
-			unsigned char *scsi3addr, 
+			unsigned char *scsi3addr,
 			unsigned char *cdb,
 			unsigned char cdblen,
 			unsigned char *buf, int bufsize,
@@ -871,7 +871,7 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
 	c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
 	// Fill in the request block...
 
-	/* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n", 
+	/* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n",
 		scsi3addr[0], scsi3addr[1], scsi3addr[2], scsi3addr[3],
 		scsi3addr[4], scsi3addr[5], scsi3addr[6], scsi3addr[7]); */
 
@@ -885,7 +885,7 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
 
 	/* Fill in the SG list and do dma mapping */
 	cciss_map_one(h->pdev, c, (unsigned char *) buf,
-			bufsize, DMA_FROM_DEVICE); 
+			bufsize, DMA_FROM_DEVICE);
 
 	c->waiting = &wait;
 	enqueue_cmd_and_start_io(h, c);
@@ -896,7 +896,7 @@ cciss_scsi_do_simple_cmd(ctlr_info_t *h,
 	return(0);
 }
 
-static void 
+static void
 cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
 {
 	ErrorInfo_struct *ei;
@@ -1005,7 +1005,7 @@ cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
 
 	if (rc != 0) return rc; /* something went wrong */
 
-	if (ei->CommandStatus != 0 && 
+	if (ei->CommandStatus != 0 &&
 	    ei->CommandStatus != CMD_DATA_UNDERRUN) {
 		cciss_scsi_interpret_error(h, c);
 		rc = -1;
@@ -1013,7 +1013,7 @@ cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
 	spin_lock_irqsave(&h->lock, flags);
 	scsi_cmd_free(h, c);
 	spin_unlock_irqrestore(&h->lock, flags);
-	return rc;	
+	return rc;
 }
 
 /* Get the device id from inquiry page 0x83 */
@@ -1042,7 +1042,7 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
 	int rc;
 	CommandList_struct *c;
 	unsigned char cdb[12];
-	unsigned char scsi3addr[8]; 
+	unsigned char scsi3addr[8];
 	ErrorInfo_struct *ei;
 	unsigned long flags;
 
@@ -1069,14 +1069,14 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
 	cdb[11] = 0;
 
 	rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr,
-				cdb, 12, 
-				(unsigned char *) buf, 
+				cdb, 12,
+				(unsigned char *) buf,
 				bufsize, XFER_READ);
 
 	if (rc != 0) return rc; /* something went wrong */
 
 	ei = c->err_info;
-	if (ei->CommandStatus != 0 && 
+	if (ei->CommandStatus != 0 &&
 	    ei->CommandStatus != CMD_DATA_UNDERRUN) {
 		cciss_scsi_interpret_error(h, c);
 		rc = -1;
@@ -1084,36 +1084,36 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
 	spin_lock_irqsave(&h->lock, flags);
 	scsi_cmd_free(h, c);
 	spin_unlock_irqrestore(&h->lock, flags);
-	return rc;	
+	return rc;
 }
 
 static void
 cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 {
 	/* the idea here is we could get notified from /proc
-	   that some devices have changed, so we do a report 
-	   physical luns cmd, and adjust our list of devices 
+	   that some devices have changed, so we do a report
+	   physical luns cmd, and adjust our list of devices
 	   accordingly.  (We can't rely on the scsi-mid layer just
-	   doing inquiries, because the "busses" that the scsi 
+	   doing inquiries, because the "busses" that the scsi
 	   mid-layer probes are totally fabricated by this driver,
 	   so new devices wouldn't show up.
 
-	   the scsi3addr's of devices won't change so long as the 
-	   adapter is not reset.  That means we can rescan and 
-	   tell which devices we already know about, vs. new 
+	   the scsi3addr's of devices won't change so long as the
+	   adapter is not reset.  That means we can rescan and
+	   tell which devices we already know about, vs. new
 	   devices, vs.  disappearing devices.
 
 	   Also, if you yank out a tape drive, then put in a disk
-	   in it's place, (say, a configured volume from another 
-	   array controller for instance)  _don't_ poke this driver 
-           (so it thinks it's still a tape, but _do_ poke the scsi 
-           mid layer, so it does an inquiry... the scsi mid layer 
+	   in it's place, (say, a configured volume from another
+	   array controller for instance)  _don't_ poke this driver
+           (so it thinks it's still a tape, but _do_ poke the scsi
+           mid layer, so it does an inquiry... the scsi mid layer
            will see the physical disk.  This would be bad.  Need to
-	   think about how to prevent that.  One idea would be to 
+	   think about how to prevent that.  One idea would be to
 	   snoop all scsi responses and if an inquiry repsonse comes
 	   back that reports a disk, chuck it an return selection
 	   timeout instead and adjust our table...  Not sure i like
-	   that though.  
+	   that though.
 
 	 */
 #define OBDR_TAPE_INQ_SIZE 49
@@ -1141,9 +1141,9 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 		ch = &ld_buff->LUNListLength[0];
 		num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8;
 		if (num_luns > CISS_MAX_PHYS_LUN) {
-			printk(KERN_WARNING 
+			printk(KERN_WARNING
 				"cciss: Maximum physical LUNs (%d) exceeded.  "
-				"%d LUNs ignored.\n", CISS_MAX_PHYS_LUN, 
+				"%d LUNs ignored.\n", CISS_MAX_PHYS_LUN,
 				num_luns - CISS_MAX_PHYS_LUN);
 			num_luns = CISS_MAX_PHYS_LUN;
 		}
@@ -1154,7 +1154,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 	}
 
 
-	/* adjust our table of devices */	
+	/* adjust our table of devices */
 	for (i = 0; i < num_luns; i++) {
 		/* for each physical lun, do an inquiry */
 		if (ld_buff->LUN[i][3] & 0xC0) continue;
@@ -1213,7 +1213,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 			currentsd[ncurrent] = *this_device;
 			ncurrent++;
 			break;
-		  default: 
+		  default:
 			break;
 		}
 	}
@@ -1258,8 +1258,8 @@ cciss_scsi_write_info(struct Scsi_Host *sh,
 		return -EINVAL;
 
 	return cciss_scsi_user_command(h, sh->host_no,
-			buffer, length);	
-} 
+			buffer, length);
+}
 
 static int
 cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
@@ -1297,8 +1297,8 @@ cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
 	return 0;
 }
 
-/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci 
-   dma mapping  and fills in the scatter gather entries of the 
+/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci
+   dma mapping  and fills in the scatter gather entries of the
    cciss command, c. */
 
 static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
@@ -1394,7 +1394,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 
 	// Fill in the command list header
 
-	cmd->scsi_done = done;    // save this for use by completion code 
+	cmd->scsi_done = done;    // save this for use by completion code
 
 	/* save c in case we have to abort it */
 	cmd->host_scribble = (unsigned char *) c;
@@ -1404,7 +1404,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 	c->Header.ReplyQueue = 0;  /* unused in simple mode */
 	memcpy(&c->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8);
 	c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
-	
+
 	// Fill in the request block...
 
 	c->Request.Timeout = 0;
@@ -1432,15 +1432,15 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 
 		c->Request.Type.Direction = XFER_RSVD;
 		// This is technically wrong, and cciss controllers should
-		// reject it with CMD_INVALID, which is the most correct 
-		// response, but non-fibre backends appear to let it 
+		// reject it with CMD_INVALID, which is the most correct
+		// response, but non-fibre backends appear to let it
 		// slide by, and give the same results as if this field
 		// were set correctly.  Either way is acceptable for
 		// our purposes here.
 
 		break;
 
-	  default: 
+	  default:
 		dev_warn(&h->pdev->dev, "unknown data direction: %d\n",
 			cmd->sc_data_direction);
 		BUG();
@@ -1464,9 +1464,9 @@ static void cciss_unregister_scsi(ctlr_info_t *h)
 
 	spin_lock_irqsave(&h->lock, flags);
 	sa = h->scsi_ctlr;
-	stk = &sa->cmd_stack; 
+	stk = &sa->cmd_stack;
 
-	/* if we weren't ever actually registered, don't unregister */ 
+	/* if we weren't ever actually registered, don't unregister */
 	if (sa->registered) {
 		spin_unlock_irqrestore(&h->lock, flags);
 		scsi_remove_host(sa->scsi_host);
@@ -1474,7 +1474,7 @@ static void cciss_unregister_scsi(ctlr_info_t *h)
 		spin_lock_irqsave(&h->lock, flags);
 	}
 
-	/* set scsi_host to NULL so our detect routine will 
+	/* set scsi_host to NULL so our detect routine will
 	   find us on register */
 	sa->scsi_host = NULL;
 	spin_unlock_irqrestore(&h->lock, flags);
@@ -1490,7 +1490,7 @@ static int cciss_engage_scsi(ctlr_info_t *h)
 
 	spin_lock_irqsave(&h->lock, flags);
 	sa = h->scsi_ctlr;
-	stk = &sa->cmd_stack; 
+	stk = &sa->cmd_stack;
 
 	if (sa->registered) {
 		dev_info(&h->pdev->dev, "SCSI subsystem already engaged.\n");
@@ -1586,13 +1586,13 @@ retry_tur:
 	return rc;
 }
 
-/* Need at least one of these error handlers to keep ../scsi/hosts.c from 
- * complaining.  Doing a host- or bus-reset can't do anything good here. 
+/* Need at least one of these error handlers to keep ../scsi/hosts.c from
+ * complaining.  Doing a host- or bus-reset can't do anything good here.
  * Despite what it might say in scsi_error.c, there may well be commands
  * on the controller, as the cciss driver registers twice, once as a block
  * device for the logical drives, and once as a scsi device, for any tape
  * drives.  So we know there are no commands out on the tape drives, but we
- * don't know there are no commands on the controller, and it is likely 
+ * don't know there are no commands on the controller, and it is likely
  * that there probably are, as the cciss block device is most commonly used
  * as a boot device (embedded controller on HP/Compaq systems.)
 */
-- 
cgit v1.2.3


From 19a5e10c3b53772cfe47a23963ebcb941d81e695 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Wed, 22 Feb 2017 17:12:37 +1100
Subject: cciss: Fix checkpatch OPEN_BRACE

Checkpatch emits ERROR:OPEN_BRACE: that open brace { should be on the
previous line.

Move open brace to new line. Also add space after if/switch statement
since we introduce more checkpatch errors if not fixed at the same
time.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Acked-by: Don Brace <don.brace@microsemi.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/cciss_scsi.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 2b8cfd4c81ce..f5c21f3edca4 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -699,10 +699,8 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 			ei->SenseLen);
 	scsi_set_resid(cmd, ei->ResidualCnt);
 
-	if(ei->CommandStatus != 0)
-	{ /* an error has occurred */
-		switch(ei->CommandStatus)
-		{
+	if (ei->CommandStatus != 0) { /* an error has occurred */
+		switch (ei->CommandStatus) {
 			case CMD_TARGET_STATUS:
 				/* Pass it up to the upper layers... */
 				if (!ei->ScsiStatus) {
@@ -902,8 +900,7 @@ cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
 	ErrorInfo_struct *ei;
 
 	ei = c->err_info;
-	switch(ei->CommandStatus)
-	{
+	switch (ei->CommandStatus) {
 		case CMD_TARGET_STATUS:
 			dev_warn(&h->pdev->dev,
 				"cmd %p has completed with errors\n", c);
@@ -1182,8 +1179,7 @@ cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 		cciss_scsi_get_device_id(h, scsi3addr,
 			this_device->device_id, sizeof(this_device->device_id));
 
-		switch (this_device->devtype)
-		{
+		switch (this_device->devtype) {
 		  case 0x05: /* CD-ROM */ {
 
 			/* We don't *really* support actual CD-ROM devices,
@@ -1414,8 +1410,7 @@ cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 	memcpy(c->Request.CDB, cmd->cmnd, cmd->cmd_len);
 	c->Request.Type.Type = TYPE_CMD;
 	c->Request.Type.Attribute = ATTR_SIMPLE;
-	switch(cmd->sc_data_direction)
-	{
+	switch (cmd->sc_data_direction) {
 	  case DMA_TO_DEVICE:
 		c->Request.Type.Direction = XFER_WRITE;
 		break;
-- 
cgit v1.2.3


From 48efbfbf6353af15d97ef37216146e550403c773 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Wed, 22 Feb 2017 17:12:38 +1100
Subject: cciss: Remove kmalloc cast

Coccinelle emits a warning about casting the return value of
kmalloc(). Coccinelle suggests removing the cast as do
kerneljanitors.

Remove cast from kmalloc() call.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Acked-by: Don Brace <don.brace@microsemi.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/cciss_scsi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index f5c21f3edca4..01a1f7e24978 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -647,8 +647,7 @@ cciss_scsi_setup(ctlr_info_t *h)
 	struct cciss_scsi_adapter_data_t * shba;
 
 	ccissscsi[h->ctlr].ndevices = 0;
-	shba = (struct cciss_scsi_adapter_data_t *)
-		kmalloc(sizeof(*shba), GFP_KERNEL);
+	shba = kmalloc(sizeof(*shba), GFP_KERNEL);
 	if (shba == NULL)
 		return;
 	shba->scsi_host = NULL;
-- 
cgit v1.2.3


From eed64951f15d63beac75e0f848ac03c14f8a4a6c Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Wed, 22 Feb 2017 07:55:13 -0700
Subject: block/sed: Embed function data into the function sequence

By embedding the function data with the function sequence, we can
eliminate the external function data and state variable code. It also
made obvious some other small cleanups.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 418 ++++++++++++++++++++++---------------------------------
 1 file changed, 163 insertions(+), 255 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 4fc4d7b441d5..893557554cc5 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -34,7 +34,11 @@
 #define IO_BUFFER_LENGTH 2048
 #define MAX_TOKS 64
 
-typedef int (*opal_step)(struct opal_dev *dev);
+struct opal_step {
+	int (*fn)(struct opal_dev *dev, void *data);
+	void *data;
+};
+typedef int (cont_fn)(struct opal_dev *dev);
 
 enum opal_atom_width {
 	OPAL_WIDTH_TINY,
@@ -80,9 +84,7 @@ struct opal_dev {
 	void *data;
 	sec_send_recv *send_recv;
 
-	const opal_step *funcs;
-	void **func_data;
-	int state;
+	const struct opal_step *steps;
 	struct mutex dev_lock;
 	u16 comid;
 	u32 hsn;
@@ -213,8 +215,6 @@ static const u8 opalmethod[][OPAL_UID_LENGTH] = {
 		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
 };
 
-typedef int (cont_fn)(struct opal_dev *dev);
-
 static int end_opal_session_error(struct opal_dev *dev);
 
 struct opal_suspend_data {
@@ -375,18 +375,18 @@ static void check_geometry(struct opal_dev *dev, const void *data)
 
 static int next(struct opal_dev *dev)
 {
-	opal_step func;
-	int error = 0;
+	const struct opal_step *step;
+	int state = 0, error = 0;
 
 	do {
-		func = dev->funcs[dev->state];
-		if (!func)
+		step = &dev->steps[state];
+		if (!step->fn)
 			break;
 
-		error = func(dev);
+		error = step->fn(dev, step->data);
 		if (error) {
 			pr_err("Error on step function: %d with error %d: %s\n",
-			       dev->state, error,
+			       state, error,
 			       opal_error_to_human(error));
 
 			/* For each OPAL command we do a discovery0 then we
@@ -396,10 +396,10 @@ static int next(struct opal_dev *dev)
 			 * session. Therefore we shouldn't attempt to terminate
 			 * a session, as one has not yet been created.
 			 */
-			if (dev->state > 1)
+			if (state > 1)
 				return end_opal_session_error(dev);
 		}
-		dev->state++;
+		state++;
 	} while (!error);
 
 	return error;
@@ -483,7 +483,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
 	return 0;
 }
 
-static int opal_discovery0(struct opal_dev *dev)
+static int opal_discovery0(struct opal_dev *dev, void *data)
 {
 	int ret;
 
@@ -1018,7 +1018,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
 	return opal_send_recv(dev, cont);
 }
 
-static int gen_key(struct opal_dev *dev)
+static int gen_key(struct opal_dev *dev, void *data)
 {
 	const u8 *method;
 	u8 uid[OPAL_UID_LENGTH];
@@ -1072,15 +1072,14 @@ static int get_active_key_cont(struct opal_dev *dev)
 	return 0;
 }
 
-static int get_active_key(struct opal_dev *dev)
+static int get_active_key(struct opal_dev *dev, void *data)
 {
 	u8 uid[OPAL_UID_LENGTH];
 	int err = 0;
-	u8 *lr;
+	u8 *lr = data;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
-	lr = dev->func_data[dev->state];
 
 	err = build_locking_range(uid, sizeof(uid), *lr);
 	if (err)
@@ -1163,17 +1162,16 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
 	return err;
 }
 
-static int setup_locking_range(struct opal_dev *dev)
+static int setup_locking_range(struct opal_dev *dev, void *data)
 {
 	u8 uid[OPAL_UID_LENGTH];
-	struct opal_user_lr_setup *setup;
+	struct opal_user_lr_setup *setup = data;
 	u8 lr;
 	int err = 0;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-	setup = dev->func_data[dev->state];
 	lr = setup->session.opal_key.lr;
 	err = build_locking_range(uid, sizeof(uid), lr);
 	if (err)
@@ -1286,20 +1284,19 @@ static int start_generic_opal_session(struct opal_dev *dev,
 	return finalize_and_send(dev, start_opal_session_cont);
 }
 
-static int start_anybodyASP_opal_session(struct opal_dev *dev)
+static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data)
 {
 	return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
 					  OPAL_ADMINSP_UID, NULL, 0);
 }
 
-static int start_SIDASP_opal_session(struct opal_dev *dev)
+static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
 {
 	int ret;
 	const u8 *key = dev->prev_data;
-	struct opal_key *okey;
 
 	if (!key) {
-		okey = dev->func_data[dev->state];
+		const struct opal_key *okey = data;
 		ret = start_generic_opal_session(dev, OPAL_SID_UID,
 						 OPAL_ADMINSP_UID,
 						 okey->key,
@@ -1314,22 +1311,21 @@ static int start_SIDASP_opal_session(struct opal_dev *dev)
 	return ret;
 }
 
-static inline int start_admin1LSP_opal_session(struct opal_dev *dev)
+static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
 {
-	struct opal_key *key = dev->func_data[dev->state];
-
+	struct opal_key *key = data;
 	return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
 					  OPAL_LOCKINGSP_UID,
 					  key->key, key->key_len);
 }
 
-static int start_auth_opal_session(struct opal_dev *dev)
+static int start_auth_opal_session(struct opal_dev *dev, void *data)
 {
+	struct opal_session_info *session = data;
 	u8 lk_ul_user[OPAL_UID_LENGTH];
+	size_t keylen = session->opal_key.key_len;
 	int err = 0;
 
-	struct opal_session_info *session = dev->func_data[dev->state];
-	size_t keylen = session->opal_key.key_len;
 	u8 *key = session->opal_key.key;
 	u32 hsn = GENERIC_HOST_SESSION_NUM;
 
@@ -1379,7 +1375,7 @@ static int start_auth_opal_session(struct opal_dev *dev)
 	return finalize_and_send(dev, start_opal_session_cont);
 }
 
-static int revert_tper(struct opal_dev *dev)
+static int revert_tper(struct opal_dev *dev, void *data)
 {
 	int err = 0;
 
@@ -1401,9 +1397,9 @@ static int revert_tper(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int internal_activate_user(struct opal_dev *dev)
+static int internal_activate_user(struct opal_dev *dev, void *data)
 {
-	struct opal_session_info *session = dev->func_data[dev->state];
+	struct opal_session_info *session = data;
 	u8 uid[OPAL_UID_LENGTH];
 	int err = 0;
 
@@ -1436,15 +1432,14 @@ static int internal_activate_user(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int erase_locking_range(struct opal_dev *dev)
+static int erase_locking_range(struct opal_dev *dev, void *data)
 {
-	struct opal_session_info *session;
+	struct opal_session_info *session = data;
 	u8 uid[OPAL_UID_LENGTH];
 	int err = 0;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
-	session = dev->func_data[dev->state];
 
 	if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
 		return -ERANGE;
@@ -1463,9 +1458,9 @@ static int erase_locking_range(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_mbr_done(struct opal_dev *dev)
+static int set_mbr_done(struct opal_dev *dev, void *data)
 {
-	u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state];
+	u8 *mbr_done_tf = data;
 	int err = 0;
 
 	clear_opal_cmd(dev);
@@ -1481,7 +1476,7 @@ static int set_mbr_done(struct opal_dev *dev)
 	add_token_u8(&err, dev, OPAL_STARTLIST);
 	add_token_u8(&err, dev, OPAL_STARTNAME);
 	add_token_u8(&err, dev, 2); /* Done */
-	add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */
+	add_token_u8(&err, dev, *mbr_done_tf); /* Done T or F */
 	add_token_u8(&err, dev, OPAL_ENDNAME);
 	add_token_u8(&err, dev, OPAL_ENDLIST);
 	add_token_u8(&err, dev, OPAL_ENDNAME);
@@ -1495,9 +1490,9 @@ static int set_mbr_done(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_mbr_enable_disable(struct opal_dev *dev)
+static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
 {
-	u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state];
+	u8 *mbr_en_dis = data;
 	int err = 0;
 
 	clear_opal_cmd(dev);
@@ -1513,7 +1508,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev)
 	add_token_u8(&err, dev, OPAL_STARTLIST);
 	add_token_u8(&err, dev, OPAL_STARTNAME);
 	add_token_u8(&err, dev, 1);
-	add_token_u8(&err, dev, mbr_en_dis);
+	add_token_u8(&err, dev, *mbr_en_dis);
 	add_token_u8(&err, dev, OPAL_ENDNAME);
 	add_token_u8(&err, dev, OPAL_ENDLIST);
 	add_token_u8(&err, dev, OPAL_ENDNAME);
@@ -1554,11 +1549,10 @@ static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
 	return err;
 }
 
-static int set_new_pw(struct opal_dev *dev)
+static int set_new_pw(struct opal_dev *dev, void *data)
 {
 	u8 cpin_uid[OPAL_UID_LENGTH];
-	struct opal_session_info *usr = dev->func_data[dev->state];
-
+	struct opal_session_info *usr = data;
 
 	memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH);
 
@@ -1579,10 +1573,10 @@ static int set_new_pw(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int set_sid_cpin_pin(struct opal_dev *dev)
+static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
 {
 	u8 cpin_uid[OPAL_UID_LENGTH];
-	struct opal_key *key = dev->func_data[dev->state];
+	struct opal_key *key = data;
 
 	memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
 
@@ -1593,18 +1587,16 @@ static int set_sid_cpin_pin(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int add_user_to_lr(struct opal_dev *dev)
+static int add_user_to_lr(struct opal_dev *dev, void *data)
 {
 	u8 lr_buffer[OPAL_UID_LENGTH];
 	u8 user_uid[OPAL_UID_LENGTH];
-	struct opal_lock_unlock *lkul;
+	struct opal_lock_unlock *lkul = data;
 	int err = 0;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-	lkul = dev->func_data[dev->state];
-
 	memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
 	       OPAL_UID_LENGTH);
 
@@ -1671,11 +1663,11 @@ static int add_user_to_lr(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int lock_unlock_locking_range(struct opal_dev *dev)
+static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
 {
 	u8 lr_buffer[OPAL_UID_LENGTH];
 	const u8 *method;
-	struct opal_lock_unlock *lkul;
+	struct opal_lock_unlock *lkul = data;
 	u8 read_locked = 1, write_locked = 1;
 	int err = 0;
 
@@ -1683,7 +1675,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev)
 	set_comid(dev, dev->comid);
 
 	method = opalmethod[OPAL_SET];
-	lkul = dev->func_data[dev->state];
 	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
 				lkul->session.opal_key.lr) < 0)
 		return -ERANGE;
@@ -1735,19 +1726,18 @@ static int lock_unlock_locking_range(struct opal_dev *dev)
 }
 
 
-static int lock_unlock_locking_range_sum(struct opal_dev *dev)
+static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
 {
 	u8 lr_buffer[OPAL_UID_LENGTH];
 	u8 read_locked = 1, write_locked = 1;
 	const u8 *method;
-	struct opal_lock_unlock *lkul;
+	struct opal_lock_unlock *lkul = data;
 	int ret;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
 	method = opalmethod[OPAL_SET];
-	lkul = dev->func_data[dev->state];
 	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
 				lkul->session.opal_key.lr) < 0)
 		return -ERANGE;
@@ -1778,9 +1768,9 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev)
 	return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int activate_lsp(struct opal_dev *dev)
+static int activate_lsp(struct opal_dev *dev, void *data)
 {
-	struct opal_lr_act *opal_act;
+	struct opal_lr_act *opal_act = data;
 	u8 user_lr[OPAL_UID_LENGTH];
 	u8 uint_3 = 0x83;
 	int err = 0, i;
@@ -1788,8 +1778,6 @@ static int activate_lsp(struct opal_dev *dev)
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-	opal_act = dev->func_data[dev->state];
-
 	add_token_u8(&err, dev, OPAL_CALL);
 	add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
 			     OPAL_UID_LENGTH);
@@ -1854,7 +1842,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
 }
 
 /* Determine if we're in the Manufactured Inactive or Active state */
-static int get_lsp_lifecycle(struct opal_dev *dev)
+static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
 {
 	int err = 0;
 
@@ -1915,14 +1903,13 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
 	return 0;
 }
 
-static int get_msid_cpin_pin(struct opal_dev *dev)
+static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
 {
 	int err = 0;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-
 	add_token_u8(&err, dev, OPAL_CALL);
 	add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
 			     OPAL_UID_LENGTH);
@@ -1952,58 +1939,48 @@ static int get_msid_cpin_pin(struct opal_dev *dev)
 	return finalize_and_send(dev, get_msid_cpin_pin_cont);
 }
 
-static int build_end_opal_session(struct opal_dev *dev)
+static int end_opal_session(struct opal_dev *dev, void *data)
 {
 	int err = 0;
 
 	clear_opal_cmd(dev);
-
 	set_comid(dev, dev->comid);
 	add_token_u8(&err, dev, OPAL_ENDOFSESSION);
-	return err;
-}
-
-static int end_opal_session(struct opal_dev *dev)
-{
-	int ret = build_end_opal_session(dev);
 
-	if (ret < 0)
-		return ret;
+	if (err < 0)
+		return err;
 	return finalize_and_send(dev, end_session_cont);
 }
 
 static int end_opal_session_error(struct opal_dev *dev)
 {
-	const opal_step error_end_session[] = {
-		end_opal_session,
-		NULL,
+	const struct opal_step error_end_session[] = {
+		{ end_opal_session, },
+		{ NULL, }
 	};
-	dev->funcs = error_end_session;
-	dev->state = 0;
+	dev->steps = error_end_session;
 	return next(dev);
 }
 
 static inline void setup_opal_dev(struct opal_dev *dev,
-				  const opal_step *funcs)
+				  const struct opal_step *steps)
 {
-	dev->state = 0;
-	dev->funcs = funcs;
+	dev->steps = steps;
 	dev->tsn = 0;
 	dev->hsn = 0;
-	dev->func_data = NULL;
 	dev->prev_data = NULL;
 }
 
 static int check_opal_support(struct opal_dev *dev)
 {
-	static const opal_step funcs[] = {
-		opal_discovery0,
-		NULL
+	const struct opal_step steps[] = {
+		{ opal_discovery0, },
+		{ NULL, }
 	};
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, funcs);
+	setup_opal_dev(dev, steps);
 	ret = next(dev);
 	dev->supported = !ret;
 	mutex_unlock(&dev->dev_lock);
@@ -2034,24 +2011,18 @@ EXPORT_SYMBOL(init_opal_dev);
 static int opal_secure_erase_locking_range(struct opal_dev *dev,
 					   struct opal_session_info *opal_session)
 {
-	void *data[3] = { NULL };
-	static const opal_step erase_funcs[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		get_active_key,
-		gen_key,
-		end_opal_session,
-		NULL,
+	const struct opal_step erase_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, opal_session },
+		{ get_active_key, &opal_session->opal_key.lr },
+		{ gen_key, },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, erase_funcs);
-
-	dev->func_data = data;
-	dev->func_data[1] = opal_session;
-	dev->func_data[2] = &opal_session->opal_key.lr;
-
+	setup_opal_dev(dev, erase_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2060,23 +2031,17 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
 static int opal_erase_locking_range(struct opal_dev *dev,
 				    struct opal_session_info *opal_session)
 {
-	void *data[3] = { NULL };
-	static const opal_step erase_funcs[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		erase_locking_range,
-		end_opal_session,
-		NULL,
+	const struct opal_step erase_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, opal_session },
+		{ erase_locking_range, opal_session },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, erase_funcs);
-
-	dev->func_data = data;
-	dev->func_data[1] = opal_session;
-	dev->func_data[2] = opal_session;
-
+	setup_opal_dev(dev, erase_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2085,16 +2050,15 @@ static int opal_erase_locking_range(struct opal_dev *dev,
 static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
 					  struct opal_mbr_data *opal_mbr)
 {
-	void *func_data[6] = { NULL };
-	static const opal_step mbr_funcs[] = {
-		opal_discovery0,
-		start_admin1LSP_opal_session,
-		set_mbr_done,
-		end_opal_session,
-		start_admin1LSP_opal_session,
-		set_mbr_enable_disable,
-		end_opal_session,
-		NULL,
+	const struct opal_step mbr_steps[] = {
+		{ opal_discovery0, },
+		{ start_admin1LSP_opal_session, &opal_mbr->key },
+		{ set_mbr_done, &opal_mbr->enable_disable },
+		{ end_opal_session, },
+		{ start_admin1LSP_opal_session, &opal_mbr->key },
+		{ set_mbr_enable_disable, &opal_mbr->enable_disable },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
@@ -2103,12 +2067,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
 		return -EINVAL;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, mbr_funcs);
-	dev->func_data = func_data;
-	dev->func_data[1] = &opal_mbr->key;
-	dev->func_data[2] = &opal_mbr->enable_disable;
-	dev->func_data[4] = &opal_mbr->key;
-	dev->func_data[5] = &opal_mbr->enable_disable;
+	setup_opal_dev(dev, mbr_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2135,13 +2094,12 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 static int opal_add_user_to_lr(struct opal_dev *dev,
 			       struct opal_lock_unlock *lk_unlk)
 {
-	void *func_data[3] = { NULL };
-	static const opal_step funcs[] = {
-		opal_discovery0,
-		start_admin1LSP_opal_session,
-		add_user_to_lr,
-		end_opal_session,
-		NULL
+	const struct opal_step steps[] = {
+		{ opal_discovery0, },
+		{ start_admin1LSP_opal_session, &lk_unlk->session.opal_key },
+		{ add_user_to_lr, lk_unlk },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
@@ -2163,10 +2121,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 	}
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, funcs);
-	dev->func_data = func_data;
-	dev->func_data[1] = &lk_unlk->session.opal_key;
-	dev->func_data[2] = lk_unlk;
+	setup_opal_dev(dev, steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2174,55 +2129,46 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 
 static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
 {
-	void *data[2] = { NULL };
-	static const opal_step revert_funcs[] = {
-		opal_discovery0,
-		start_SIDASP_opal_session,
-		revert_tper, /* controller will terminate session */
-		NULL,
+	const struct opal_step revert_steps[] = {
+		{ opal_discovery0, },
+		{ start_SIDASP_opal_session, opal },
+		{ revert_tper, }, /* controller will terminate session */
+		{ NULL, }
 	};
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, revert_funcs);
-	dev->func_data = data;
-	dev->func_data[1] = opal;
+	setup_opal_dev(dev, revert_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
 }
 
-static int __opal_lock_unlock_sum(struct opal_dev *dev)
+static int __opal_lock_unlock(struct opal_dev *dev,
+			      struct opal_lock_unlock *lk_unlk)
 {
-	static const opal_step ulk_funcs_sum[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		lock_unlock_locking_range_sum,
-		end_opal_session,
-		NULL
+	const struct opal_step unlock_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, &lk_unlk->session },
+		{ lock_unlock_locking_range, lk_unlk },
+		{ end_opal_session, },
+		{ NULL, }
 	};
-
-	dev->funcs = ulk_funcs_sum;
-	return next(dev);
-}
-
-static int __opal_lock_unlock(struct opal_dev *dev)
-{
-	static const opal_step _unlock_funcs[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		lock_unlock_locking_range,
-		end_opal_session,
-		NULL
+	const struct opal_step unlock_sum_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, &lk_unlk->session },
+		{ lock_unlock_locking_range_sum, lk_unlk },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 
-	dev->funcs = _unlock_funcs;
+	dev->steps = lk_unlk->session.sum ? unlock_sum_steps : unlock_steps;
 	return next(dev);
 }
 
-static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+static int opal_lock_unlock(struct opal_dev *dev,
+			    struct opal_lock_unlock *lk_unlk)
 {
-	void *func_data[3] = { NULL };
 	int ret;
 
 	if (lk_unlk->session.who < OPAL_ADMIN1 ||
@@ -2230,43 +2176,30 @@ static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_un
 		return -EINVAL;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, NULL);
-	dev->func_data = func_data;
-	dev->func_data[1] = &lk_unlk->session;
-	dev->func_data[2] = lk_unlk;
-
-	if (lk_unlk->session.sum)
-		ret = __opal_lock_unlock_sum(dev);
-	else
-		ret = __opal_lock_unlock(dev);
-
+	ret = __opal_lock_unlock(dev, lk_unlk);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
 }
 
 static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
 {
-	static const opal_step owner_funcs[] = {
-		opal_discovery0,
-		start_anybodyASP_opal_session,
-		get_msid_cpin_pin,
-		end_opal_session,
-		start_SIDASP_opal_session,
-		set_sid_cpin_pin,
-		end_opal_session,
-		NULL
+	const struct opal_step owner_steps[] = {
+		{ opal_discovery0, },
+		{ start_anybodyASP_opal_session, },
+		{ get_msid_cpin_pin, },
+		{ end_opal_session, },
+		{ start_SIDASP_opal_session, opal },
+		{ set_sid_cpin_pin, opal },
+		{ end_opal_session, },
+		{ NULL, }
 	};
-	void *data[6] = { NULL };
 	int ret;
 
 	if (!dev)
 		return -ENODEV;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, owner_funcs);
-	dev->func_data = data;
-	dev->func_data[4] = opal;
-	dev->func_data[5] = opal;
+	setup_opal_dev(dev, owner_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2274,14 +2207,13 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
 
 static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act)
 {
-	void *data[4] = { NULL };
-	static const opal_step active_funcs[] = {
-		opal_discovery0,
-		start_SIDASP_opal_session, /* Open session as SID auth */
-		get_lsp_lifecycle,
-		activate_lsp,
-		end_opal_session,
-		NULL
+	const struct opal_step active_steps[] = {
+		{ opal_discovery0, },
+		{ start_SIDASP_opal_session, &opal_lr_act->key },
+		{ get_lsp_lifecycle, },
+		{ activate_lsp, opal_lr_act },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
@@ -2289,10 +2221,7 @@ static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_a
 		return -EINVAL;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, active_funcs);
-	dev->func_data = data;
-	dev->func_data[1] = &opal_lr_act->key;
-	dev->func_data[3] = opal_lr_act;
+	setup_opal_dev(dev, active_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2301,21 +2230,17 @@ static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_a
 static int opal_setup_locking_range(struct opal_dev *dev,
 				    struct opal_user_lr_setup *opal_lrs)
 {
-	void *data[3] = { NULL };
-	static const opal_step lr_funcs[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		setup_locking_range,
-		end_opal_session,
-		NULL,
+	const struct opal_step lr_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, &opal_lrs->session },
+		{ setup_locking_range, opal_lrs },
+		{ end_opal_session, },
+		{ NULL, }
 	};
 	int ret;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, lr_funcs);
-	dev->func_data = data;
-	dev->func_data[1] = &opal_lrs->session;
-	dev->func_data[2] = opal_lrs;
+	setup_opal_dev(dev, lr_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2323,14 +2248,13 @@ static int opal_setup_locking_range(struct opal_dev *dev,
 
 static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 {
-	static const opal_step pw_funcs[] = {
-		opal_discovery0,
-		start_auth_opal_session,
-		set_new_pw,
-		end_opal_session,
-		NULL
+	const struct opal_step pw_steps[] = {
+		{ opal_discovery0, },
+		{ start_auth_opal_session, &opal_pw->session },
+		{ set_new_pw, &opal_pw->new_user_pw },
+		{ end_opal_session, },
+		{ NULL }
 	};
-	void *data[3] = { NULL };
 	int ret;
 
 	if (opal_pw->session.who < OPAL_ADMIN1 ||
@@ -2340,11 +2264,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 		return -EINVAL;
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, pw_funcs);
-	dev->func_data = data;
-	dev->func_data[1] = (void *) &opal_pw->session;
-	dev->func_data[2] = (void *) &opal_pw->new_user_pw;
-
+	setup_opal_dev(dev, pw_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2353,14 +2273,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 static int opal_activate_user(struct opal_dev *dev,
 			      struct opal_session_info *opal_session)
 {
-	static const opal_step act_funcs[] = {
-		opal_discovery0,
-		start_admin1LSP_opal_session,
-		internal_activate_user,
-		end_opal_session,
-		NULL
+	const struct opal_step act_steps[] = {
+		{ opal_discovery0, },
+		{ start_admin1LSP_opal_session, &opal_session->opal_key },
+		{ internal_activate_user, opal_session },
+		{ end_opal_session, },
+		{ NULL, }
 	};
-	void *data[3] = { NULL };
 	int ret;
 
 	/* We can't activate Admin1 it's active as manufactured */
@@ -2371,10 +2290,7 @@ static int opal_activate_user(struct opal_dev *dev,
 	}
 
 	mutex_lock(&dev->dev_lock);
-	setup_opal_dev(dev, act_funcs);
-	dev->func_data = data;
-	dev->func_data[1] = &opal_session->opal_key;
-	dev->func_data[2] = opal_session;
+	setup_opal_dev(dev, act_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
 	return ret;
@@ -2383,7 +2299,6 @@ static int opal_activate_user(struct opal_dev *dev,
 bool opal_unlock_from_suspend(struct opal_dev *dev)
 {
 	struct opal_suspend_data *suspend;
-	void *func_data[3] = { NULL };
 	bool was_failure = false;
 	int ret = 0;
 
@@ -2394,19 +2309,12 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 
 	mutex_lock(&dev->dev_lock);
 	setup_opal_dev(dev, NULL);
-	dev->func_data = func_data;
 
 	list_for_each_entry(suspend, &dev->unlk_lst, node) {
-		dev->state = 0;
-		dev->func_data[1] = &suspend->unlk.session;
-		dev->func_data[2] = &suspend->unlk;
 		dev->tsn = 0;
 		dev->hsn = 0;
 
-		if (suspend->unlk.session.sum)
-			ret = __opal_lock_unlock_sum(dev);
-		else
-			ret = __opal_lock_unlock(dev);
+		ret = __opal_lock_unlock(dev, &suspend->unlk);
 		if (ret) {
 			pr_warn("Failed to unlock LR %hhu with sum %d\n",
 				suspend->unlk.session.opal_key.lr,
@@ -2433,7 +2341,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 		return -ENOTSUPP;
 	}
 
-	p = memdup_user(arg,  _IOC_SIZE(cmd));
+	p = memdup_user(arg, _IOC_SIZE(cmd));
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
-- 
cgit v1.2.3


From b86dd815ff74ab9eda474d1c28428ac0db2c3032 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 22 Feb 2017 13:19:45 -0700
Subject: block: get rid of blk-mq default scheduler choice Kconfig entries

The wording in the entries were poor and not understandable
by even deities. Kill the selection for default block scheduler,
and impose a policy with sane defaults.

Architected-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig.iosched | 44 --------------------------------------------
 block/blk-mq-sched.c  |  9 ---------
 block/elevator.c      | 19 +++++++++++++------
 3 files changed, 13 insertions(+), 59 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 0715ce93daef..58fc8684788d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -69,50 +69,6 @@ config MQ_IOSCHED_DEADLINE
 	---help---
 	  MQ version of the deadline IO scheduler.
 
-config MQ_IOSCHED_NONE
-	bool
-	default y
-
-choice
-	prompt "Default single-queue blk-mq I/O scheduler"
-	default DEFAULT_SQ_NONE
-	help
-	  Select the I/O scheduler which will be used by default for blk-mq
-	  managed block devices with a single queue.
-
-	config DEFAULT_SQ_DEADLINE
-		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
-
-	config DEFAULT_SQ_NONE
-		bool "None"
-
-endchoice
-
-config DEFAULT_SQ_IOSCHED
-	string
-	default "mq-deadline" if DEFAULT_SQ_DEADLINE
-	default "none" if DEFAULT_SQ_NONE
-
-choice
-	prompt "Default multi-queue blk-mq I/O scheduler"
-	default DEFAULT_MQ_NONE
-	help
-	  Select the I/O scheduler which will be used by default for blk-mq
-	  managed block devices with multiple queues.
-
-	config DEFAULT_MQ_DEADLINE
-		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
-
-	config DEFAULT_MQ_NONE
-		bool "None"
-
-endchoice
-
-config DEFAULT_MQ_IOSCHED
-	string
-	default "mq-deadline" if DEFAULT_MQ_DEADLINE
-	default "none" if DEFAULT_MQ_NONE
-
 endmenu
 
 endif
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 9e8d6795a8c1..c14f92308244 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -498,15 +498,6 @@ int blk_mq_sched_init(struct request_queue *q)
 {
 	int ret;
 
-#if defined(CONFIG_DEFAULT_SQ_NONE)
-	if (q->nr_hw_queues == 1)
-		return 0;
-#endif
-#if defined(CONFIG_DEFAULT_MQ_NONE)
-	if (q->nr_hw_queues > 1)
-		return 0;
-#endif
-
 	mutex_lock(&q->sysfs_lock);
 	ret = elevator_init(q, NULL);
 	mutex_unlock(&q->sysfs_lock);
diff --git a/block/elevator.c b/block/elevator.c
index 699d10f71a2c..ac1c9f481a98 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -220,17 +220,24 @@ int elevator_init(struct request_queue *q, char *name)
 	}
 
 	if (!e) {
-		if (q->mq_ops && q->nr_hw_queues == 1)
-			e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
-		else if (q->mq_ops)
-			e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
-		else
+		/*
+		 * For blk-mq devices, we default to using mq-deadline,
+		 * if available, for single queue devices. If deadline
+		 * isn't available OR we have multiple queues, default
+		 * to "none".
+		 */
+		if (q->mq_ops) {
+			if (q->nr_hw_queues == 1)
+				e = elevator_get("mq-deadline", false);
+			if (!e)
+				return 0;
+		} else
 			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
 
 		if (!e) {
 			printk(KERN_ERR
 				"Default I/O scheduler not found. " \
-				"Using noop/none.\n");
+				"Using noop.\n");
 			e = elevator_get("noop", false);
 		}
 	}
-- 
cgit v1.2.3


From b38b9054989f33d2e06f1979f90973030ddc17db Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 27 Dec 2016 16:04:09 +0000
Subject: nvme: admin-cmd: fix spelling mistake: "Counld" -> "Could"

trivial fix to spelling mistake in pr_err message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/admin-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 95ae52390478..204e85efebab 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -41,7 +41,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 	ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
 	if (!ns) {
 		status = NVME_SC_INVALID_NS;
-		pr_err("nvmet : Counld not find namespace id : %d\n",
+		pr_err("nvmet : Could not find namespace id : %d\n",
 				le32_to_cpu(req->cmd->get_log_page.nsid));
 		goto out;
 	}
-- 
cgit v1.2.3


From f64935abbfcedf47a23ad0345f625b1903d965d2 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Sat, 24 Dec 2016 09:46:44 -0800
Subject: nvmet_fc: cleanup of abort flag processing in fcp_op_done

Cleanup of abort flag processing in fcp_op_done.
References were unnecessary

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/fc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index ba57f9852bde..8f483ee7868c 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1817,16 +1817,14 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 		/* data no longer needed */
 		nvmet_fc_free_tgt_pgs(fod);
 
-		if (fcpreq->fcp_error || abort)
-			nvmet_req_complete(&fod->req, fcpreq->fcp_error);
-
+		nvmet_req_complete(&fod->req, fcpreq->fcp_error);
 		return;
 	}
 
 	switch (fcpreq->op) {
 
 	case NVMET_FCOP_WRITEDATA:
-		if (abort || fcpreq->fcp_error ||
+		if (fcpreq->fcp_error ||
 		    fcpreq->transferred_length != fcpreq->transfer_length) {
 			nvmet_req_complete(&fod->req,
 					NVME_SC_FC_TRANSPORT_ERROR);
@@ -1849,7 +1847,7 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 
 	case NVMET_FCOP_READDATA:
 	case NVMET_FCOP_READDATA_RSP:
-		if (abort || fcpreq->fcp_error ||
+		if (fcpreq->fcp_error ||
 		    fcpreq->transferred_length != fcpreq->transfer_length) {
 			/* data no longer needed */
 			nvmet_fc_free_tgt_pgs(fod);
-- 
cgit v1.2.3


From 15fbad96fc5fb1c5a0502e6a10dea671f510de3d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 14 Nov 2016 14:24:21 +0200
Subject: nvmet: Make cntlid globally unique

We usually log the cntlid which is confusing in case
we have multiple subsystems each with it's own cntlid ida.
Instead make cntlid ida globally unique and log the initial
association.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/core.c        | 10 ++++------
 drivers/nvme/target/fabrics-cmd.c |  4 ++--
 drivers/nvme/target/nvmet.h       |  1 -
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index fc5ba2f9e15f..5267ce20c12d 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -17,6 +17,7 @@
 #include "nvmet.h"
 
 static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
+static DEFINE_IDA(cntlid_ida);
 
 /*
  * This read/write semaphore is used to synchronize access to configuration
@@ -749,7 +750,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	if (!ctrl->sqs)
 		goto out_free_cqs;
 
-	ret = ida_simple_get(&subsys->cntlid_ida,
+	ret = ida_simple_get(&cntlid_ida,
 			     NVME_CNTLID_MIN, NVME_CNTLID_MAX,
 			     GFP_KERNEL);
 	if (ret < 0) {
@@ -819,7 +820,7 @@ static void nvmet_ctrl_free(struct kref *ref)
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
-	ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid);
+	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
 	nvmet_subsys_put(subsys);
 
 	kfree(ctrl->sqs);
@@ -918,9 +919,6 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 	mutex_init(&subsys->lock);
 	INIT_LIST_HEAD(&subsys->namespaces);
 	INIT_LIST_HEAD(&subsys->ctrls);
-
-	ida_init(&subsys->cntlid_ida);
-
 	INIT_LIST_HEAD(&subsys->hosts);
 
 	return subsys;
@@ -933,7 +931,6 @@ static void nvmet_subsys_free(struct kref *ref)
 
 	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
 
-	ida_destroy(&subsys->cntlid_ida);
 	kfree(subsys->subsysnqn);
 	kfree(subsys);
 }
@@ -976,6 +973,7 @@ static void __exit nvmet_exit(void)
 {
 	nvmet_exit_configfs();
 	nvmet_exit_discovery();
+	ida_destroy(&cntlid_ida);
 
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index f4088198cd0d..18dc314601c3 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -153,8 +153,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 		goto out;
 	}
 
-	pr_info("creating controller %d for NQN %s.\n",
-			ctrl->cntlid, ctrl->hostnqn);
+	pr_info("creating controller %d for subsystem %s for NQN %s.\n",
+		ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn);
 	req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 out:
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index cc7ad06b43a7..1370eee0a3c0 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -142,7 +142,6 @@ struct nvmet_subsys {
 	unsigned int		max_nsid;
 
 	struct list_head	ctrls;
-	struct ida		cntlid_ida;
 
 	struct list_head	hosts;
 	bool			allow_any_host;
-- 
cgit v1.2.3


From 8432bdb2905713cb3ef5cbe4a72630fa575565ad Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 28 Nov 2016 01:47:40 +0200
Subject: nvme: Make controller state visible via sysfs

Easier for debugging and testing state machine
transitions.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 44a1a257e0b5..d52978038621 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1577,6 +1577,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev,
 }
 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
 
+static ssize_t nvme_sysfs_show_state(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	static const char *const state_name[] = {
+		[NVME_CTRL_NEW]		= "new",
+		[NVME_CTRL_LIVE]	= "live",
+		[NVME_CTRL_RESETTING]	= "resetting",
+		[NVME_CTRL_RECONNECTING]= "reconnecting",
+		[NVME_CTRL_DELETING]	= "deleting",
+		[NVME_CTRL_DEAD]	= "dead",
+	};
+
+	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
+	    state_name[ctrl->state])
+		return sprintf(buf, "%s\n", state_name[ctrl->state]);
+
+	return sprintf(buf, "unknown state\n");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
+
 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1609,6 +1632,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_transport.attr,
 	&dev_attr_subsysnqn.attr,
 	&dev_attr_address.attr,
+	&dev_attr_state.attr,
 	NULL
 };
 
-- 
cgit v1.2.3


From 2dbf5816bb52ee3198882c1d198440e4bac62253 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 23 Jan 2017 11:01:12 +0200
Subject: nvmet: avoid dereferencing nvmet_req

No need to dereference req twice to get the cmd when we already
have it stored in a local variable.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/fabrics-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 18dc314601c3..8bd022af3df6 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -220,7 +220,7 @@ int nvmet_parse_connect_cmd(struct nvmet_req *req)
 
 	req->ns = NULL;
 
-	if (req->cmd->common.opcode != nvme_fabrics_command) {
+	if (cmd->common.opcode != nvme_fabrics_command) {
 		pr_err("invalid command 0x%x on unconnected queue.\n",
 			cmd->fabrics.opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
-- 
cgit v1.2.3


From 778f067c185ce56d06aedb5fba5a0e98bb464f7b Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Thu, 26 Jan 2017 17:17:27 +0200
Subject: nvme: add semicolon in nvme_command setting

Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d52978038621..a68bf6954ead 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -590,8 +590,8 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
 	int error;
 
 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
-	c.identify.opcode = nvme_admin_identify,
-	c.identify.nsid = cpu_to_le32(nsid),
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
 
 	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
 	if (!*id)
-- 
cgit v1.2.3


From 986994a27587efd8ce4c595cab89b570f7475359 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 26 Jan 2017 17:17:28 +0200
Subject: nvme: Use CNS as 8-bit field and avoid endianness conversion

This patch defines CNS field as 8-bit field and avoids cpu_to/from_le
conversions.
Also initialize nvme_command cns value explicitly to NVME_ID_CNS_NS
for readability (don't rely on the fact that NVME_ID_CNS_NS = 0).

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c        | 5 +++--
 drivers/nvme/target/admin-cmd.c | 2 +-
 drivers/nvme/target/discovery.c | 4 ++--
 include/linux/nvme.h            | 4 +++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a68bf6954ead..8b1be128b66e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -560,7 +560,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL);
+	c.identify.cns = NVME_ID_CNS_CTRL;
 
 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
 	if (!*id)
@@ -578,7 +578,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
 	struct nvme_command c = { };
 
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST);
+	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
 	c.identify.nsid = cpu_to_le32(nsid);
 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
 }
@@ -592,6 +592,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.cns = NVME_ID_CNS_NS;
 
 	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
 	if (!*id)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 204e85efebab..94e524fea568 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -509,7 +509,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
 		break;
 	case nvme_admin_identify:
 		req->data_len = 4096;
-		switch (le32_to_cpu(cmd->identify.cns)) {
+		switch (cmd->identify.cns) {
 		case NVME_ID_CNS_NS:
 			req->execute = nvmet_execute_identify_ns;
 			return 0;
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 12f39eea569f..af8aabf05335 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -186,14 +186,14 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
 		}
 	case nvme_admin_identify:
 		req->data_len = 4096;
-		switch (le32_to_cpu(cmd->identify.cns)) {
+		switch (cmd->identify.cns) {
 		case NVME_ID_CNS_CTRL:
 			req->execute =
 				nvmet_execute_identify_disc_ctrl;
 			return 0;
 		default:
 			pr_err("nvmet: unsupported identify cns %d\n",
-				le32_to_cpu(cmd->identify.cns));
+				cmd->identify.cns);
 			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		}
 	default:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0b676a02cf3e..5b32521456d6 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -644,7 +644,9 @@ struct nvme_identify {
 	__le32			nsid;
 	__u64			rsvd2[2];
 	union nvme_data_ptr	dptr;
-	__le32			cns;
+	__u8			cns;
+	__u8			rsvd3;
+	__le16			ctrlid;
 	__u32			rsvd11[5];
 };
 
-- 
cgit v1.2.3


From e5a39dd8238e5d42c830bbd8d31211adf6fea6ca Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Fri, 27 Jan 2017 09:03:45 +0100
Subject: nvme: make nvmf_register_transport require a create_ctrl callback

nvmf_create_ctrl() relys on the presence of a create_crtl callback in the
registered nvmf_transport_ops, so make nvmf_register_transport require one.

Update the available call-sites as well to reflect these changes.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/fabrics.c | 7 ++++++-
 drivers/nvme/host/fabrics.h | 2 +-
 drivers/nvme/host/fc.c      | 3 +--
 drivers/nvme/host/rdma.c    | 3 +--
 drivers/nvme/target/loop.c  | 3 +--
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 916d13608059..5b7386f69f4d 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -480,11 +480,16 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
  * being implemented to the common NVMe fabrics library. Part of
  * the overall init sequence of starting up a fabrics driver.
  */
-void nvmf_register_transport(struct nvmf_transport_ops *ops)
+int nvmf_register_transport(struct nvmf_transport_ops *ops)
 {
+	if (!ops->create_ctrl)
+		return -EINVAL;
+
 	mutex_lock(&nvmf_transports_mutex);
 	list_add_tail(&ops->entry, &nvmf_transports);
 	mutex_unlock(&nvmf_transports_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nvmf_register_transport);
 
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 924145c979f1..156018182ce4 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -128,7 +128,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
 int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
 int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
-void nvmf_register_transport(struct nvmf_transport_ops *ops);
+int nvmf_register_transport(struct nvmf_transport_ops *ops);
 void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index fb51a8de9b29..6186fde718f9 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2562,8 +2562,7 @@ static int __init nvme_fc_init_module(void)
 	if (!nvme_fc_wq)
 		return -ENOMEM;
 
-	nvmf_register_transport(&nvme_fc_transport);
-	return 0;
+	return nvmf_register_transport(&nvme_fc_transport);
 }
 
 static void __exit nvme_fc_exit_module(void)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a75e95d42b3f..aedac6e28c6f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2063,8 +2063,7 @@ static int __init nvme_rdma_init_module(void)
 		return ret;
 	}
 
-	nvmf_register_transport(&nvme_rdma_transport);
-	return 0;
+	return nvmf_register_transport(&nvme_rdma_transport);
 }
 
 static void __exit nvme_rdma_cleanup_module(void)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index f3862e38f574..d1f06e7768ff 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -724,8 +724,7 @@ static int __init nvme_loop_init_module(void)
 	ret = nvmet_register_transport(&nvme_loop_ops);
 	if (ret)
 		return ret;
-	nvmf_register_transport(&nvme_loop_transport);
-	return 0;
+	return nvmf_register_transport(&nvme_loop_transport);
 }
 
 static void __exit nvme_loop_cleanup_module(void)
-- 
cgit v1.2.3


From bd4da3abaabffdd2472fb7085fcadd5d1d8c2153 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 22 Feb 2017 13:32:36 -0700
Subject: nvme: Add a quirk mechanism that uses identify_ctrl

Currently, all NVMe quirks are based on PCI IDs.  Add a mechanism to
define quirks based on identify_ctrl's vendor id, model number,
and/or firmware revision.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 65 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8b1be128b66e..38efe4f752a1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1252,6 +1252,50 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
+struct nvme_core_quirk_entry {
+	/*
+	 * NVMe model and firmware strings are padded with spaces.  For
+	 * simplicity, strings in the quirk table are padded with NULLs
+	 * instead.
+	 */
+	u16 vid;
+	const char *mn;
+	const char *fr;
+	unsigned long quirks;
+};
+
+static const struct nvme_core_quirk_entry core_quirks[] = {
+};
+
+/* match is null-terminated but idstr is space-padded. */
+static bool string_matches(const char *idstr, const char *match, size_t len)
+{
+	size_t matchlen;
+
+	if (!match)
+		return true;
+
+	matchlen = strlen(match);
+	WARN_ON_ONCE(matchlen > len);
+
+	if (memcmp(idstr, match, matchlen))
+		return false;
+
+	for (; matchlen < len; matchlen++)
+		if (idstr[matchlen] != ' ')
+			return false;
+
+	return true;
+}
+
+static bool quirk_matches(const struct nvme_id_ctrl *id,
+			  const struct nvme_core_quirk_entry *q)
+{
+	return q->vid == le16_to_cpu(id->vid) &&
+		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
+		string_matches(id->fr, q->fr, sizeof(id->fr));
+}
+
 /*
  * Initialize the cached copies of the Identify data and various controller
  * register in our nvme_ctrl structure.  This should be called as soon as
@@ -1286,6 +1330,24 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		return -EIO;
 	}
 
+	if (!ctrl->identified) {
+		/*
+		 * Check for quirks.  Quirk can depend on firmware version,
+		 * so, in principle, the set of quirks present can change
+		 * across a reset.  As a possible future enhancement, we
+		 * could re-scan for quirks every time we reinitialize
+		 * the device, but we'd have to make sure that the driver
+		 * behaves intelligently if the quirks change.
+		 */
+
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
+			if (quirk_matches(id, &core_quirks[i]))
+				ctrl->quirks |= core_quirks[i].quirks;
+		}
+	}
+
 	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->vid = le16_to_cpu(id->vid);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1329,6 +1391,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	}
 
 	kfree(id);
+
+	ctrl->identified = true;
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_identify);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 14cfc6f7facb..42ede67bfbc6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -112,6 +112,7 @@ enum nvme_ctrl_state {
 
 struct nvme_ctrl {
 	enum nvme_ctrl_state state;
+	bool identified;
 	spinlock_t lock;
 	const struct nvme_ctrl_ops *ops;
 	struct request_queue *admin_q;
-- 
cgit v1.2.3


From c5552fde102fcc3f2cf9e502b8ac90e3500d8fdf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 7 Feb 2017 10:08:45 -0800
Subject: nvme: Enable autonomous power state transitions

NVMe devices can advertise multiple power states.  These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.

The hardware configuration is a table.  For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.

This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely.  On hardware without APST support, the sysfs file will
not be exposed.

The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.

In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful.  There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset.  This can be configured from
userspace, but it doesn't seem useful to support in the driver.

On my laptop, enabling APST seems to save nearly 1W.

The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.

This feature is quirked off on a known-buggy Samsung device.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  11 ++++
 include/linux/nvme.h     |   6 ++
 3 files changed, 171 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 38efe4f752a1..849e85847ad4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
 #include <scsi/sg.h>
 #include <asm/unaligned.h>
 
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+		 "max power saving latency for new devices; use PM QOS to change per device");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
@@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * APST (Autonomous Power State Transition) lets us program a
+	 * table of power state transitions that the controller will
+	 * perform automatically.  We configure it with a simple
+	 * heuristic: we are willing to spend at most 2% of the time
+	 * transitioning between power states.  Therefore, when running
+	 * in any given state, we will enter the next lower-power
+	 * non-operational state after waiting 100 * (enlat + exlat)
+	 * microseconds, as long as that state's total latency is under
+	 * the requested maximum latency.
+	 *
+	 * We will not autonomously enter any non-operational state for
+	 * which the total latency exceeds ps_max_latency_us.  Users
+	 * can set ps_max_latency_us to zero to turn off APST.
+	 */
+
+	unsigned apste;
+	struct nvme_feat_auto_pst *table;
+	int ret;
+
+	/*
+	 * If APST isn't supported or if we haven't been initialized yet,
+	 * then don't do anything.
+	 */
+	if (!ctrl->apsta)
+		return;
+
+	if (ctrl->npss > 31) {
+		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+		return;
+	}
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return;
+
+	if (ctrl->ps_max_latency_us == 0) {
+		/* Turn off APST. */
+		apste = 0;
+	} else {
+		__le64 target = cpu_to_le64(0);
+		int state;
+
+		/*
+		 * Walk through all states from lowest- to highest-power.
+		 * According to the spec, lower-numbered states use more
+		 * power.  NPSS, despite the name, is the index of the
+		 * lowest-power state, not the number of states.
+		 */
+		for (state = (int)ctrl->npss; state >= 0; state--) {
+			u64 total_latency_us, transition_ms;
+
+			if (target)
+				table->entries[state] = target;
+
+			/*
+			 * Is this state a useful non-operational state for
+			 * higher-power states to autonomously transition to?
+			 */
+			if (!(ctrl->psd[state].flags &
+			      NVME_PS_FLAGS_NON_OP_STATE))
+				continue;
+
+			total_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+				+ le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (total_latency_us > ctrl->ps_max_latency_us)
+				continue;
+
+			/*
+			 * This state is good.  Use it as the APST idle
+			 * target for higher power states.
+			 */
+			transition_ms = total_latency_us + 19;
+			do_div(transition_ms, 20);
+			if (transition_ms > (1 << 24) - 1)
+				transition_ms = (1 << 24) - 1;
+
+			target = cpu_to_le64((state << 3) |
+					     (transition_ms << 8));
+		}
+
+		apste = 1;
+	}
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+				table, sizeof(*table), NULL);
+	if (ret)
+		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+	kfree(table);
+}
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	u64 latency;
+
+	switch (val) {
+	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+	case PM_QOS_LATENCY_ANY:
+		latency = U64_MAX;
+		break;
+
+	default:
+		latency = val;
+	}
+
+	if (ctrl->ps_max_latency_us != latency) {
+		ctrl->ps_max_latency_us = latency;
+		nvme_configure_apst(ctrl);
+	}
+}
+
 struct nvme_core_quirk_entry {
 	/*
 	 * NVMe model and firmware strings are padded with spaces.  For
@@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry {
 };
 
 static const struct nvme_core_quirk_entry core_quirks[] = {
+	/*
+	 * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+	 * the controller to go out to lunch.  It dies when the watchdog
+	 * timer reads CSTS and gets 0xffffffff.
+	 */
+	{
+		.vid = 0x144d,
+		.fr = "BXW75D0Q",
+		.quirks = NVME_QUIRK_NO_APST,
+	},
 };
 
 /* match is null-terminated but idstr is space-padded. */
@@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	u64 cap;
 	int ret, page_shift;
 	u32 max_hw_sectors;
+	u8 prev_apsta;
 
 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
 	if (ret) {
@@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
 
+	ctrl->npss = id->npss;
+	prev_apsta = ctrl->apsta;
+	ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
 	if (ctrl->ops->is_fabrics) {
 		ctrl->icdoff = le16_to_cpu(id->icdoff);
 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	kfree(id);
 
+	if (ctrl->apsta && !prev_apsta)
+		dev_pm_qos_expose_latency_tolerance(ctrl->device);
+	else if (!ctrl->apsta && prev_apsta)
+		dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+	nvme_configure_apst(ctrl);
+
 	ctrl->identified = true;
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
 	spin_unlock(&dev_list_lock);
 
+	/*
+	 * Initialize latency tolerance controls.  The sysfs files won't
+	 * be visible to userspace unless the device actually supports APST.
+	 */
+	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
 	return 0;
 out_release_instance:
 	nvme_release_instance(ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 42ede67bfbc6..a3da1e90b99d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -78,6 +78,11 @@ enum nvme_quirks {
 	 * readiness, which is done by reading the NVME_CSTS_RDY bit.
 	 */
 	NVME_QUIRK_DELAY_BEFORE_CHK_RDY		= (1 << 3),
+
+	/*
+	 * APST should not be used.
+	 */
+	NVME_QUIRK_NO_APST			= (1 << 4),
 };
 
 /*
@@ -148,13 +153,19 @@ struct nvme_ctrl {
 	u32 vs;
 	u32 sgls;
 	u16 kas;
+	u8 npss;
+	u8 apsta;
 	unsigned int kato;
 	bool subsystem;
 	unsigned long quirks;
+	struct nvme_id_power_state psd[32];
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
 
+	/* Power saving configuration */
+	u64 ps_max_latency_us;
+
 	/* Fabrics only */
 	u16 sqsize;
 	u32 ioccsz;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5b32521456d6..c43d435d4225 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd {
 	__le16			appmask;
 };
 
+/* Features */
+
+struct nvme_feat_auto_pst {
+	__le64 entries[32];
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
-- 
cgit v1.2.3


From 6db28eda266052f86a6b402422de61eeb7d2e351 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 10 Feb 2017 18:15:49 -0500
Subject: nvme/pci: Disable on removal when disconnected

If the device is not present, the driver should disable the queues
immediately. Prior to this, the driver was relying on the watchdog timer
to kill the queues if requests were outstanding to the device, and that
just delays removal up to one second.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddc51adb594d..362a1468f99d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2001,8 +2001,10 @@ static void nvme_remove(struct pci_dev *pdev)
 
 	pci_set_drvdata(pdev, NULL);
 
-	if (!pci_device_is_present(pdev))
+	if (!pci_device_is_present(pdev)) {
 		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
+		nvme_dev_disable(dev, false);
+	}
 
 	flush_work(&dev->reset_work);
 	nvme_uninit_ctrl(&dev->ctrl);
-- 
cgit v1.2.3


From f33447b90e96076483525b21cc4e0a8977cdd07c Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 10 Feb 2017 18:15:51 -0500
Subject: nvme/core: Fix race kicking freed request_queue

If a namespace has already been marked dead, we don't want to kick the
request_queue again since we may have just freed it from another thread.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 849e85847ad4..25ec4e585220 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2333,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		 * Revalidating a dead namespace sets capacity to 0. This will
 		 * end buffered writers dirtying pages that can't be synced.
 		 */
-		if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
-			revalidate_disk(ns->disk);
-
+		if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+			continue;
+		revalidate_disk(ns->disk);
 		blk_set_queue_dying(ns->queue);
 		blk_mq_abort_requeue_list(ns->queue);
 		blk_mq_start_stopped_hw_queues(ns->queue, true);
-- 
cgit v1.2.3


From 9ef3932e250f8e2e11ffbc0c1f28b3ba5dc40cd6 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 10 Feb 2017 18:15:52 -0500
Subject: nvme/pci: No special case for queue busy on IO

This driver previously required we have a special check for IO submitted
to nvme IO queues that are temporarily suspended. That is no longer
necessary since blk-mq provides a quiesce, so any IO that actually gets
submitted to such a queue must be ended since the queue isn't going to
start back up.

This is fixing a condition where we have fewer IO queues after a
controller reset. This may happen if the number of CPU's has changed,
or controller firmware update changed the queue count, for example.

While it may be possible to complete the IO on a different queue, the
block layer does not provide a way to resubmit a request on a different
hardware context once the request has entered the queue. We don't want
these requests to be stuck indefinitely either, so ending them in error
is our only option at the moment.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 362a1468f99d..d38dae9b5de9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -613,10 +613,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	spin_lock_irq(&nvmeq->q_lock);
 	if (unlikely(nvmeq->cq_vector < 0)) {
-		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
-			ret = BLK_MQ_RQ_QUEUE_BUSY;
-		else
-			ret = BLK_MQ_RQ_QUEUE_ERROR;
+		ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out_cleanup_iod;
 	}
-- 
cgit v1.2.3


From faef3af69dd7976315d37316698a436b81d05363 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 17 Feb 2017 07:57:28 -0800
Subject: nvme-fc: don't bother to validate ioccsz and iorcsz

Discovery controllers don't set the values. They are in reserved
areas of the Identify Controller data structure.

Given the cmd completed, the minimal capsule sizes are supported,
so no need to check nqn to detect discovery controllers and
special case validations.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/fc.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6186fde718f9..9690beb15e69 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2353,18 +2353,6 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	/* sanity checks */
 
-	/* FC-NVME supports 64-byte SQE only */
-	if (ctrl->ctrl.ioccsz != 4) {
-		dev_err(ctrl->ctrl.device, "ioccsz %d is not supported!\n",
-				ctrl->ctrl.ioccsz);
-		goto out_remove_admin_queue;
-	}
-	/* FC-NVME supports 16-byte CQE only */
-	if (ctrl->ctrl.iorcsz != 1) {
-		dev_err(ctrl->ctrl.device, "iorcsz %d is not supported!\n",
-				ctrl->ctrl.iorcsz);
-		goto out_remove_admin_queue;
-	}
 	/* FC-NVME does not have other data in the capsule */
 	if (ctrl->ctrl.icdoff) {
 		dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
-- 
cgit v1.2.3


From 3ee80c3d15b61e61611903033df414a0590cfde9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 20 Feb 2017 13:44:28 +0200
Subject: nvme-rdma: move nvme cm status helper to .h file

This will enable the usage for nvme rdma target.
Also move from a lookup array to a switch statement.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/rdma.c  | 22 ----------------------
 include/linux/nvme-rdma.h | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index aedac6e28c6f..ee789b38bae5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -42,28 +42,6 @@
 
 #define NVME_RDMA_MAX_INLINE_SEGMENTS	1
 
-static const char *const nvme_rdma_cm_status_strs[] = {
-	[NVME_RDMA_CM_INVALID_LEN]	= "invalid length",
-	[NVME_RDMA_CM_INVALID_RECFMT]	= "invalid record format",
-	[NVME_RDMA_CM_INVALID_QID]	= "invalid queue ID",
-	[NVME_RDMA_CM_INVALID_HSQSIZE]	= "invalid host SQ size",
-	[NVME_RDMA_CM_INVALID_HRQSIZE]	= "invalid host RQ size",
-	[NVME_RDMA_CM_NO_RSC]		= "resource not found",
-	[NVME_RDMA_CM_INVALID_IRD]	= "invalid IRD",
-	[NVME_RDMA_CM_INVALID_ORD]	= "Invalid ORD",
-};
-
-static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
-{
-	size_t index = status;
-
-	if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) &&
-	    nvme_rdma_cm_status_strs[index])
-		return nvme_rdma_cm_status_strs[index];
-	else
-		return "unrecognized reason";
-};
-
 /*
  * We handle AEN commands ourselves and don't even let the
  * block layer know about them.
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index bf240a3cbf99..a72fd04aa5e1 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -29,6 +29,30 @@ enum nvme_rdma_cm_status {
 	NVME_RDMA_CM_INVALID_ORD	= 0x08,
 };
 
+static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
+{
+	switch (status) {
+	case NVME_RDMA_CM_INVALID_LEN:
+		return "invalid length";
+	case NVME_RDMA_CM_INVALID_RECFMT:
+		return "invalid record format";
+	case NVME_RDMA_CM_INVALID_QID:
+		return "invalid queue ID";
+	case NVME_RDMA_CM_INVALID_HSQSIZE:
+		return "invalid host SQ size";
+	case NVME_RDMA_CM_INVALID_HRQSIZE:
+		return "invalid host RQ size";
+	case NVME_RDMA_CM_NO_RSC:
+		return "resource not found";
+	case NVME_RDMA_CM_INVALID_IRD:
+		return "invalid IRD";
+	case NVME_RDMA_CM_INVALID_ORD:
+		return "Invalid ORD";
+	default:
+		return "unrecognized reason";
+	}
+}
+
 /**
  * struct nvme_rdma_cm_req - rdma connect request
  *
-- 
cgit v1.2.3


From 7a01a6ea2c636769f9d029b026549bebc406094a Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 20 Feb 2017 13:44:29 +0200
Subject: nvmet-rdma: use nvme cm status helper

Also remove redundant debug prints.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/rdma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 60990220bd83..5d189c012943 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1041,6 +1041,9 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
 {
 	struct nvme_rdma_cm_rej rej;
 
+	pr_debug("rejecting connect request: status %d (%s)\n",
+		 status, nvme_rdma_cm_msg(status));
+
 	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
 	rej.sts = cpu_to_le16(status);
 
@@ -1135,7 +1138,6 @@ out_destroy_sq:
 out_free_queue:
 	kfree(queue);
 out_reject:
-	pr_debug("rejecting connect request with status code %d\n", ret);
 	nvmet_rdma_cm_reject(cm_id, ret);
 	return NULL;
 }
@@ -1188,7 +1190,6 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
 	ndev = nvmet_rdma_find_get_device(cm_id);
 	if (!ndev) {
-		pr_err("no client data!\n");
 		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
 		return -ECONNREFUSED;
 	}
-- 
cgit v1.2.3


From 6ccaeb56004a14329b033968a687324c96d295bf Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 19 Feb 2017 20:04:38 +0100
Subject: nvmet-rdma: Fix error handling

According to the preceeding goto, it is likely that 'out_destroy_sq' was
expected here.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 5d189c012943..9aa1da3778b3 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1094,7 +1094,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
 	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
 	if (queue->idx < 0) {
 		ret = NVME_RDMA_CM_NO_RSC;
-		goto out_free_queue;
+		goto out_destroy_sq;
 	}
 
 	ret = nvmet_rdma_alloc_rsps(queue);
-- 
cgit v1.2.3


From 8f4e8dace3d182132c8ffbcf243dda85def6c209 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Sun, 19 Feb 2017 20:08:03 +0200
Subject: nvme-rdma: add support for host_traddr

This will enable the user to control the specific interface for
connection establishment in case the host has more than 1 interface
under the same subnet.
E.g:
Host interfaces configured as:
 - ib0 1.1.1.1/16
 - ib1 1.1.1.2/16

Target interfaces configured as:
 - ib0 1.1.1.3/16 (listener interface)
 - ib1 1.1.1.4/16

the following connect command will go through host iface ib0 (default):
nvme connect -t rdma -n testsubsystem -a 1.1.1.3 -s 1023

but the following command will go through host iface ib1:
nvme connect -t rdma -n testsubsystem -a 1.1.1.3 -s 1023 -w 1.1.1.2

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/rdma.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ee789b38bae5..49b2121af689 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -133,6 +133,10 @@ struct nvme_rdma_ctrl {
 		struct sockaddr addr;
 		struct sockaddr_in addr_in;
 	};
+	union {
+		struct sockaddr src_addr;
+		struct sockaddr_in src_addr_in;
+	};
 
 	struct nvme_ctrl	ctrl;
 };
@@ -545,6 +549,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
 		int idx, size_t queue_size)
 {
 	struct nvme_rdma_queue *queue;
+	struct sockaddr *src_addr = NULL;
 	int ret;
 
 	queue = &ctrl->queues[idx];
@@ -567,7 +572,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
 	}
 
 	queue->cm_error = -ETIMEDOUT;
-	ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
+	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
+		src_addr = &ctrl->src_addr;
+
+	ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
 			NVME_RDMA_CONNECT_TIMEOUT_MS);
 	if (ret) {
 		dev_info(ctrl->ctrl.device,
@@ -1883,6 +1891,16 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		goto out_free_ctrl;
 	}
 
+	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
+				opts->host_traddr);
+		if (ret) {
+			pr_err("malformed src IP address passed: %s\n",
+			       opts->host_traddr);
+			goto out_free_ctrl;
+		}
+	}
+
 	if (opts->mask & NVMF_OPT_TRSVCID) {
 		u16 port;
 
@@ -1994,7 +2012,8 @@ out_free_ctrl:
 static struct nvmf_transport_ops nvme_rdma_transport = {
 	.name		= "rdma",
 	.required_opts	= NVMF_OPT_TRADDR,
-	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
+	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+			  NVMF_OPT_HOST_TRADDR,
 	.create_ctrl	= nvme_rdma_create_ctrl,
 };
 
-- 
cgit v1.2.3


From 124298bd03acebd9c9da29a794718aca31bec1f7 Mon Sep 17 00:00:00 2001
From: Daniel Roschka <danielroschka@phoenitydawn.de>
Date: Wed, 22 Feb 2017 15:17:29 -0700
Subject: nvme: detect NVMe controller in recent MacBooks

Adds support for detection of the NVMe controller found in the
following recent MacBooks:
- Retina MacBook 2016 (MacBook9,1)
- 13" MacBook Pro 2016 without Touch Bar (MacBook13,1)
- 13" MacBook Pro 2016 with Touch Bar (MacBook13,2)

Signed-off-by: Daniel Roschka <danielroschka@phoenitydawn.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d38dae9b5de9..319f5c5b241a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2120,6 +2120,7 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, nvme_id_table);
-- 
cgit v1.2.3


From 7d6d15789d69856f1c5405e106a773c87277eb8c Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Wed, 22 Feb 2017 10:15:06 -0700
Subject: block/sed-opal: Introduce free_opal_dev to free the structure and
 clean up state

Before we free the opal structure we need to clean up any saved
locking ranges that the user had told us to unlock from a suspend.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c         | 30 ++++++++++++++++++++++++++++++
 include/linux/sed-opal.h |  5 +++++
 2 files changed, 35 insertions(+)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 893557554cc5..020bf3e28e38 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1987,6 +1987,28 @@ static int check_opal_support(struct opal_dev *dev)
 	return ret;
 }
 
+static void clean_opal_dev(struct opal_dev *dev)
+{
+
+	struct opal_suspend_data *suspend, *next;
+
+	mutex_lock(&dev->dev_lock);
+	list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) {
+		list_del(&suspend->node);
+		kfree(suspend);
+	}
+	mutex_unlock(&dev->dev_lock);
+}
+
+void free_opal_dev(struct opal_dev *dev)
+{
+	if (!dev)
+		return;
+	clean_opal_dev(dev);
+	kfree(dev);
+}
+EXPORT_SYMBOL(free_opal_dev);
+
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 {
 	struct opal_dev *dev;
@@ -2141,6 +2163,14 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
 	setup_opal_dev(dev, revert_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
+
+	/*
+	 * If we successfully reverted lets clean
+	 * any saved locking ranges.
+	 */
+	if (!ret)
+		clean_opal_dev(dev);
+
 	return ret;
 }
 
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index deee23d012e7..04b124fca51e 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -27,6 +27,7 @@ typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
 		size_t len, bool send);
 
 #ifdef CONFIG_BLK_SED_OPAL
+void free_opal_dev(struct opal_dev *dev);
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
@@ -51,6 +52,10 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	return false;
 }
 #else
+static inline void free_opal_dev(struct opal_dev *dev)
+{
+}
+
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
 	return false;
-- 
cgit v1.2.3


From e286bcfc59e01c2c57f2d79a46063c0bddb7a0d1 Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Wed, 22 Feb 2017 10:15:07 -0700
Subject: nvme/pci: re-check security protocol support after reset

A device may change capabilities after each reset, e.g. due to a firmware
upgrade.  We should thus check for Security Send/Receive and OPAL support
after each reset.

Based on patches from Christoph and Keith.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 319f5c5b241a..57a1af52b06e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1736,7 +1736,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	if (dev->ctrl.admin_q)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
-	kfree(dev->ctrl.opal_dev);
+	free_opal_dev(dev->ctrl.opal_dev);
 	kfree(dev);
 }
 
@@ -1786,14 +1786,17 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
-	if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) {
-		dev->ctrl.opal_dev =
-			init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
+		if (!dev->ctrl.opal_dev)
+			dev->ctrl.opal_dev =
+				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+		else if (was_suspend)
+			opal_unlock_from_suspend(dev->ctrl.opal_dev);
+	} else {
+		free_opal_dev(dev->ctrl.opal_dev);
+		dev->ctrl.opal_dev = NULL;
 	}
 
-	if (was_suspend)
-		opal_unlock_from_suspend(dev->ctrl.opal_dev);
-
 	result = nvme_setup_io_queues(dev);
 	if (result)
 		goto out;
-- 
cgit v1.2.3


From 2d19020b085eaf61d24377db27e8630cd9cff7ce Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Wed, 22 Feb 2017 10:15:08 -0700
Subject: block/sed-opal: Propagate original error message to userland.

During an error on a comannd, ex: user provides wrong pw to unlock
range, we will gracefully terminate the opal session. We want to
propagate the original error to userland instead of the result of
the session termination, which is almost always a success.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 020bf3e28e38..1e18dca360fc 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -396,8 +396,11 @@ static int next(struct opal_dev *dev)
 			 * session. Therefore we shouldn't attempt to terminate
 			 * a session, as one has not yet been created.
 			 */
-			if (state > 1)
-				return end_opal_session_error(dev);
+			if (state > 1) {
+				end_opal_session_error(dev);
+				return error;
+			}
+
 		}
 		state++;
 	} while (!error);
-- 
cgit v1.2.3


From da55f2cc78418dee88400aafbbaed19d7ac8188e Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 22 Feb 2017 10:58:29 -0800
Subject: blk-mq: use sbq wait queues instead of restart for driver tags

Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware
queues and shared tags") fixed one starvation issue for shared tags.
However, we can still get into a situation where we fail to allocate a
tag because all tags are allocated but we don't have any pending
requests on any hardware queue.

One solution for this would be to restart all queues that share a tag
map, but that really sucks. Ideally, we could just block and wait for a
tag, but that isn't always possible from blk_mq_dispatch_rq_list().

However, we can still use the struct sbitmap_queue wait queues with a
custom callback instead of blocking. This has a few benefits:

1. It avoids iterating over all hardware queues when completing an I/O,
   which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 64 +++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/blk-mq.h |  2 ++
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309..9e6b064e5339 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+				void *key)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+	list_del(&wait->task_list);
+	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	blk_mq_run_hw_queue(hctx, true);
+	return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+	struct sbq_wait_state *ws;
+
+	/*
+	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+	 * The thread which wins the race to grab this bit adds the hardware
+	 * queue to the wait queue.
+	 */
+	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+		return false;
+
+	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+	/*
+	 * As soon as this returns, it's no longer safe to fiddle with
+	 * hctx->dispatch_wait, since a completion can wake up the wait queue
+	 * and unlock the bit.
+	 */
+	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+	return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 				continue;
 
 			/*
-			 * We failed getting a driver tag. Mark the queue(s)
-			 * as needing a restart. Retry getting a tag again,
-			 * in case the needed IO completed right before we
-			 * marked the queue as needing a restart.
+			 * The initial allocation attempt failed, so we need to
+			 * rerun the hardware queue when a tag is freed.
 			 */
-			blk_mq_sched_mark_restart(hctx);
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (blk_mq_dispatch_wait_add(hctx)) {
+				/*
+				 * It's possible that a tag was freed in the
+				 * window between the allocation failure and
+				 * adding the hardware queue to the wait queue.
+				 */
+				if (!blk_mq_get_driver_tag(rq, &hctx, false))
+					break;
+			} else {
 				break;
+			}
 		}
+
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		 *
 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
 		 *
-		 * If RESTART is set, then let completion restart the queue
-		 * instead of potentially looping here.
+		 * If RESTART or TAG_WAITING is set, then let completion restart
+		 * the queue instead of potentially looping here.
 		 */
-		if (!blk_mq_sched_needs_restart(hctx))
+		if (!blk_mq_sched_needs_restart(hctx) &&
+		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
 			blk_mq_run_hw_queue(hctx, true);
 	}
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8e4df3d6c8cd..001d30d727c5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	wait_queue_t		dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
@@ -160,6 +161,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_TAG_WAITING	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
cgit v1.2.3


From d38d35155514ceef5efb79f6d5b4f0f1638da5b4 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 22 Feb 2017 10:58:30 -0800
Subject: blk-mq-sched: separate mark hctx and queue restart operations

In blk_mq_sched_dispatch_requests(), we call blk_mq_sched_mark_restart()
after we dispatch requests left over on our hardware queue dispatch
list. This is so we'll go back and dispatch requests from the scheduler.
In this case, it's only necessary to restart the hardware queue that we
are running; there's no reason to run other hardware queues just because
we are using shared tags.

So, split out blk_mq_sched_mark_restart() into two operations, one for
just the hardware queue and one for the whole request queue. The core
code only needs the hctx variant, but I/O schedulers will want to use
both.

This also requires adjusting blk_mq_sched_restart_queues() to always
check the queue restart flag, not just when using shared tags.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sched.c | 20 ++++++++------------
 block/blk-mq-sched.h | 26 ++++++++++++++++++--------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c14f92308244..98c7b061781e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -205,7 +205,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 * needing a restart in that case.
 	 */
 	if (!list_empty(&rq_list)) {
-		blk_mq_sched_mark_restart(hctx);
+		blk_mq_sched_mark_restart_hctx(hctx);
 		did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
 	} else if (!has_sched_dispatch) {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -331,20 +331,16 @@ static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
 {
+	struct request_queue *q = hctx->queue;
 	unsigned int i;
 
-	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+	if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
+		if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
+			queue_for_each_hw_ctx(q, hctx, i)
+				blk_mq_sched_restart_hctx(hctx);
+		}
+	} else {
 		blk_mq_sched_restart_hctx(hctx);
-	else {
-		struct request_queue *q = hctx->queue;
-
-		if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
-			return;
-
-		clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
-
-		queue_for_each_hw_ctx(q, hctx, i)
-			blk_mq_sched_restart_hctx(hctx);
 	}
 }
 
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 7b5f3b95c78e..a75b16b123f7 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -122,17 +122,27 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+/*
+ * Mark a hardware queue as needing a restart.
+ */
+static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
 		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			struct request_queue *q = hctx->queue;
+}
+
+/*
+ * Mark a hardware queue and the request queue it belongs to as needing a
+ * restart.
+ */
+static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
 
-			if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
-				set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
-		}
-	}
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+		set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
 }
 
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
-- 
cgit v1.2.3


From 61febef40bfe8ab68259d8545257686e8a0d91d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 24 Feb 2017 13:19:32 -0700
Subject: dm-rq: don't dereference request payload after ending request

Bart reported a case where dm would crash with use-after-free
poison. This is due to dm_softirq_done() accessing memory
associated with a request after calling end_request on it.
This is most visible on !blk-mq, since we free the memory
immediately for that case.

Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Fixes: eb8db831be80 ("dm: always defer request allocation to the owner of the request_queue")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-rq.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 67d76f21fecd..28955b94d2b2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -328,13 +328,15 @@ static void dm_softirq_done(struct request *rq)
 	int rw;
 
 	if (!clone) {
-		rq_end_stats(tio->md, rq);
+		struct mapped_device *md = tio->md;
+
+		rq_end_stats(md, rq);
 		rw = rq_data_dir(rq);
 		if (!rq->q->mq_ops)
 			blk_end_request_all(rq, tio->error);
 		else
 			blk_mq_end_request(rq, tio->error);
-		rq_completed(tio->md, rw, false);
+		rq_completed(md, rw, false);
 		return;
 	}
 
-- 
cgit v1.2.3