From 36a106a4c1c100d55ba3d32a21ef748cfcd4fa99 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 3 Jan 2021 22:42:39 +0100
Subject: block: rsxx: select CONFIG_CRC32

Without crc32, the driver fails to link:

arm-linux-gnueabi-ld: drivers/block/rsxx/config.o: in function `rsxx_load_config':
config.c:(.text+0x124): undefined reference to `crc32_le'

Fixes: 8722ff8cdbfa ("block: IBM RamSan 70/80 device driver")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 262326973ee0..583b671b1d2d 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -445,6 +445,7 @@ config BLK_DEV_RBD
 config BLK_DEV_RSXX
 	tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
 	depends on PCI
+	select CRC32
 	help
 	  Device driver for IBM's high speed PCIe SSD
 	  storage device: Flash Adapter 900GB Full Height.
-- 
cgit v1.2.3


From 19cd3403cb0d522dd5e10188eef85817de29e26e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 3 Jan 2021 22:43:09 +0100
Subject: lightnvm: select CONFIG_CRC32

Without CRC32 support, this fails to link:

arm-linux-gnueabi-ld: drivers/lightnvm/pblk-init.o: in function `pblk_init':
pblk-init.c:(.text+0x2654): undefined reference to `crc32_le'
arm-linux-gnueabi-ld: drivers/lightnvm/pblk-init.o: in function `pblk_exit':
pblk-init.c:(.text+0x2a7c): undefined reference to `crc32_le'

Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 8f39f9ba5c80..4c2ce210c123 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -19,6 +19,7 @@ if NVM
 
 config NVM_PBLK
 	tristate "Physical Block Device Open-Channel SSD target"
+	select CRC32
 	help
 	  Allows an open-channel SSD to be exposed as a block device to the
 	  host. The target assumes the device exposes raw flash and must be
-- 
cgit v1.2.3


From d16baa3f1453c14d680c5fee01cd122a22d0e0ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 5 Jan 2021 12:37:23 -0500
Subject: blk-iocost: fix NULL iocg deref from racing against initialization

When initializing iocost for a queue, its rqos should be registered before
the blkcg policy is activated to allow policy data initiailization to lookup
the associated ioc. This unfortunately means that the rqos methods can be
called on bios before iocgs are attached to all existing blkgs.

While the race is theoretically possible on ioc_rqos_throttle(), it mostly
happened in ioc_rqos_merge() due to the difference in how they lookup ioc.
The former determines it from the passed in @rqos and then bails before
dereferencing iocg if the looked up ioc is disabled, which most likely is
the case if initialization is still in progress. The latter looked up ioc by
dereferencing the possibly NULL iocg making it a lot more prone to actually
triggering the bug.

* Make ioc_rqos_merge() use the same method as ioc_rqos_throttle() to look
  up ioc for consistency.

* Make ioc_rqos_throttle() and ioc_rqos_merge() test for NULL iocg before
  dereferencing it.

* Explain the danger of NULL iocgs in blk_iocost_init().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jonathan Lemon <bsd@fb.com>
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iocost.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index ac6078a34939..98d656bdb42b 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2551,8 +2551,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 	bool use_debt, ioc_locked;
 	unsigned long flags;
 
-	/* bypass IOs if disabled or for root cgroup */
-	if (!ioc->enabled || !iocg->level)
+	/* bypass IOs if disabled, still initializing, or for root cgroup */
+	if (!ioc->enabled || !iocg || !iocg->level)
 		return;
 
 	/* calculate the absolute vtime cost */
@@ -2679,14 +2679,14 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
 			   struct bio *bio)
 {
 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
-	struct ioc *ioc = iocg->ioc;
+	struct ioc *ioc = rqos_to_ioc(rqos);
 	sector_t bio_end = bio_end_sector(bio);
 	struct ioc_now now;
 	u64 vtime, abs_cost, cost;
 	unsigned long flags;
 
-	/* bypass if disabled or for root cgroup */
-	if (!ioc->enabled || !iocg->level)
+	/* bypass if disabled, still initializing, or for root cgroup */
+	if (!ioc->enabled || !iocg || !iocg->level)
 		return;
 
 	abs_cost = calc_vtime_cost(bio, iocg, true);
@@ -2863,6 +2863,12 @@ static int blk_iocost_init(struct request_queue *q)
 	ioc_refresh_params(ioc, true);
 	spin_unlock_irq(&ioc->lock);
 
+	/*
+	 * rqos must be added before activation to allow iocg_pd_init() to
+	 * lookup the ioc from q. This means that the rqos methods may get
+	 * called before policy activation completion, can't assume that the
+	 * target bio has an iocg associated and need to test for NULL iocg.
+	 */
 	rq_qos_add(q, rqos);
 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
 	if (ret) {
-- 
cgit v1.2.3


From 6d4d273588378c65915acaf7b2ee74e9dd9c130a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 10 Dec 2020 10:44:33 +0100
Subject: bfq: Fix computation of shallow depth

BFQ computes number of tags it allows to be allocated for each request type
based on tag bitmap. However it uses 1 << bitmap.shift as number of
available tags which is wrong. 'shift' is just an internal bitmap value
containing logarithm of how many bits bitmap uses in each bitmap word.
Thus number of tags allowed for some request types can be far to low.
Use proper bitmap.depth which has the number of tags instead.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9e81d1052091..9e4eb0fc1c16 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6332,13 +6332,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
 	 * limit 'something'.
 	 */
 	/* no more than 50% of tags for async I/O */
-	bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+	bfqd->word_depths[0][0] = max(bt->sb.depth >> 1, 1U);
 	/*
 	 * no more than 75% of tags for sync writes (25% extra tags
 	 * w.r.t. async I/O, to prevent async I/O from starving sync
 	 * writes)
 	 */
-	bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+	bfqd->word_depths[0][1] = max((bt->sb.depth * 3) >> 2, 1U);
 
 	/*
 	 * In-word depths in case some bfq_queue is being weight-
@@ -6348,9 +6348,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
 	 * shortage.
 	 */
 	/* no more than ~18% of tags for async I/O */
-	bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+	bfqd->word_depths[1][0] = max((bt->sb.depth * 3) >> 4, 1U);
 	/* no more than ~37% of tags for sync writes (~20% extra tags) */
-	bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+	bfqd->word_depths[1][1] = max((bt->sb.depth * 6) >> 4, 1U);
 
 	for (i = 0; i < 2; i++)
 		for (j = 0; j < 2; j++)
-- 
cgit v1.2.3


From aebf5db917055b38f4945ed6d621d9f07a44ff30 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 21 Dec 2020 12:33:35 +0800
Subject: block: fix use-after-free in disk_part_iter_next

Make sure that bdgrab() is done on the 'block_device' instance before
referring to it for avoiding use-after-free.

Cc: <stable@vger.kernel.org>
Reported-by: syzbot+825f0f9657d4e528046e@syzkaller.appspotmail.com
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 73faec438e49..419548e92d82 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -246,15 +246,18 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 		part = rcu_dereference(ptbl->part[piter->idx]);
 		if (!part)
 			continue;
+		piter->part = bdgrab(part);
+		if (!piter->part)
+			continue;
 		if (!bdev_nr_sectors(part) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
-		      piter->idx == 0))
+		      piter->idx == 0)) {
+			bdput(piter->part);
+			piter->part = NULL;
 			continue;
+		}
 
-		piter->part = bdgrab(part);
-		if (!piter->part)
-			continue;
 		piter->idx += inc;
 		break;
 	}
-- 
cgit v1.2.3


From 19fce0470f05031e6af36e49ce222d0f0050d432 Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Tue, 1 Dec 2020 17:52:43 -0800
Subject: nvme-fc: avoid calling _nvme_fc_abort_outstanding_ios from interrupt
 context

Recent patches changed calling sequences. nvme_fc_abort_outstanding_ios
used to be called from a timeout or work context. Now it is being called
in an io completion context, which can be an interrupt handler.
Unfortunately, the abort outstanding ios routine attempts to stop nvme
queues and nested routines that may try to sleep, which is in conflict
with the interrupt handler.

Correct replacing the direct call with a work element scheduling, and the
abort outstanding ios routine will be called in the work element.

Fixes: 95ced8a2c72d ("nvme-fc: eliminate terminate_io use by nvme_fc_error_recovery")
Signed-off-by: James Smart <james.smart@broadcom.com>
Reported-by: Daniel Wagner <dwagner@suse.de>
Tested-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fc.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 38373a0e86ef..5f36cfa8136c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -166,6 +166,7 @@ struct nvme_fc_ctrl {
 	struct blk_mq_tag_set	admin_tag_set;
 	struct blk_mq_tag_set	tag_set;
 
+	struct work_struct	ioerr_work;
 	struct delayed_work	connect_work;
 
 	struct kref		ref;
@@ -1888,6 +1889,15 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
 	}
 }
 
+static void
+nvme_fc_ctrl_ioerr_work(struct work_struct *work)
+{
+	struct nvme_fc_ctrl *ctrl =
+			container_of(work, struct nvme_fc_ctrl, ioerr_work);
+
+	nvme_fc_error_recovery(ctrl, "transport detected io error");
+}
+
 static void
 nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 {
@@ -2046,7 +2056,7 @@ done:
 
 check_error:
 	if (terminate_assoc)
-		nvme_fc_error_recovery(ctrl, "transport detected io error");
+		queue_work(nvme_reset_wq, &ctrl->ioerr_work);
 }
 
 static int
@@ -3233,6 +3243,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 
+	cancel_work_sync(&ctrl->ioerr_work);
 	cancel_delayed_work_sync(&ctrl->connect_work);
 	/*
 	 * kill the association on the link side.  this will block
@@ -3449,6 +3460,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
 	INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
+	INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work);
 	spin_lock_init(&ctrl->lock);
 
 	/* io queue count */
@@ -3540,6 +3552,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 fail_ctrl:
 	nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
+	cancel_work_sync(&ctrl->ioerr_work);
 	cancel_work_sync(&ctrl->ctrl.reset_work);
 	cancel_delayed_work_sync(&ctrl->connect_work);
 
-- 
cgit v1.2.3


From 2b54996b7d56badc563755840838614f2fa9c4de Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Mon, 7 Dec 2020 12:29:40 -0800
Subject: nvme-fcloop: Fix sscanf type and list_first_entry_or_null warnings

Kernel robot had the following warnings:

>> fcloop.c:1506:6: warning: %x in format string (no. 1) requires
>> 'unsigned int *' but the argument type is 'signed int *'.
>> [invalidScanfArgType_int]
>>    if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3)
>>        ^

Resolve by changing opcode from and int to an unsigned int

and

>>  fcloop.c:1632:32: warning: Uninitialized variable: lport [uninitvar]
>>     ret = __wait_localport_unreg(lport);
>>                                  ^

>>  fcloop.c:1615:28: warning: Uninitialized variable: nport [uninitvar]
>>     ret = __remoteport_unreg(nport, rport);
>>                              ^

These aren't actual issues as the values are assigned prior to use.
It appears the tool doesn't understand list_first_entry_or_null().
Regardless, quiet the tool by initializing the pointers to NULL at
declaration.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 733d9363900e..68213f0a052b 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1501,7 +1501,8 @@ static ssize_t
 fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr,
 		const char *buf, size_t count)
 {
-	int opcode, starting, amount;
+	unsigned int opcode;
+	int starting, amount;
 
 	if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3)
 		return -EBADRQC;
@@ -1588,8 +1589,8 @@ out_destroy_class:
 
 static void __exit fcloop_exit(void)
 {
-	struct fcloop_lport *lport;
-	struct fcloop_nport *nport;
+	struct fcloop_lport *lport = NULL;
+	struct fcloop_nport *nport = NULL;
 	struct fcloop_tport *tport;
 	struct fcloop_rport *rport;
 	unsigned long flags;
-- 
cgit v1.2.3


From 7ee5c78ca3895d44e918c38332921983ed678be0 Mon Sep 17 00:00:00 2001
From: Gopal Tiwari <gtiwari@redhat.com>
Date: Fri, 4 Dec 2020 21:46:57 +0530
Subject: nvme-pci: mark Samsung PM1725a as IGNORE_DEV_SUBNQN

A system with more than one of these SSDs will only have one usable.
Hence the kernel fails to detect nvme devices due to duplicate cntlids.

[    6.274554] nvme nvme1: Duplicate cntlid 33 with nvme0, rejecting
[    6.274566] nvme nvme1: Removing after probe failure status: -22

Adding the NVME_QUIRK_IGNORE_DEV_SUBNQN quirk to resolves the issue.

Signed-off-by: Gopal Tiwari <gtiwari@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b4385cb0ff60..553871e6962b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3196,7 +3196,8 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
-		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
 	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
 		.driver_data = NVME_QUIRK_LIGHTNVM, },
 	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
-- 
cgit v1.2.3


From 5c11f7d9f843bdd24cd29b95401938bc3f168070 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 21 Dec 2020 00:03:39 -0800
Subject: nvme-tcp: Fix possible race of io_work and direct send

We may send a request (with or without its data) from two paths:

  1. From our I/O context nvme_tcp_io_work which is triggered from:
    - queue_rq
    - r2t reception
    - socket data_ready and write_space callbacks
  2. Directly from queue_rq if the send_list is empty (because we want to
     save the context switch associated with scheduling our io_work).

However, given that now we have the send_mutex, we may run into a race
condition where none of these contexts will send the pending payload to
the controller. Both io_work send path and queue_rq send path
opportunistically attempt to acquire the send_mutex however queue_rq only
attempts to send a single request, and if io_work context fails to
acquire the send_mutex it will complete without rescheduling itself.

The race can trigger with the following sequence:

  1. queue_rq sends request (no incapsule data) and blocks
  2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU
     to the send_list and schedules io_work
  3. io_work triggers and cannot acquire the send_mutex - because of (1),
     ends without self rescheduling
  4. queue_rq completes the send, and completes

==> no context will send the h2cdata - timeout.

Fix this by having queue_rq sending as much as it can from the send_list
such that if it still has any left, its because the socket buffer is
full and the socket write_space callback will trigger, thus guaranteeing
that a context will be scheduled to send the h2cdata PDU.

Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context")
Reported-by: Potnuri Bharat Teja <bharat@chelsio.com>
Reported-by: Samuel Jones <sjones@kalrayinc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Tested-by: Potnuri Bharat Teja <bharat@chelsio.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1ba659927442..979ee31b8dd1 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -262,6 +262,16 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 	}
 }
 
+static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+{
+	int ret;
+
+	/* drain the send queue as much as we can... */
+	do {
+		ret = nvme_tcp_try_send(queue);
+	} while (ret > 0);
+}
+
 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		bool sync, bool last)
 {
@@ -279,7 +289,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	if (queue->io_cpu == smp_processor_id() &&
 	    sync && empty && mutex_trylock(&queue->send_mutex)) {
 		queue->more_requests = !last;
-		nvme_tcp_try_send(queue);
+		nvme_tcp_send_all(queue);
 		queue->more_requests = false;
 		mutex_unlock(&queue->send_mutex);
 	} else if (last) {
-- 
cgit v1.2.3


From 62df80165d7f197c9c0652e7416164f294a96661 Mon Sep 17 00:00:00 2001
From: Lalithambika Krishnakumar <lalithambika.krishnakumar@intel.com>
Date: Wed, 23 Dec 2020 14:09:00 -0800
Subject: nvme: avoid possible double fetch in handling CQE

While handling the completion queue, keep a local copy of the command id
from the DMA-accessible completion entry. This silences a time-of-check
to time-of-use (TOCTOU) warning from KF/x[1], with respect to a
Thunderclap[2] vulnerability analysis. The double-read impact appears
benign.

There may be a theoretical window for @command_id to be used as an
adversary-controlled array-index-value for mounting a speculative
execution attack, but that mitigation is saved for a potential follow-on.
A man-in-the-middle attack on the data payload is out of scope for this
analysis and is hopefully mitigated by filesystem integrity mechanisms.

[1] https://github.com/intel/kernel-fuzzer-for-xen-project
[2] http://thunderclap.io/thunderclap-paper-ndss2019.pdf
Signed-off-by: Lalithambika Krishna Kumar <lalithambika.krishnakumar@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 553871e6962b..50d9a20568a2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -967,6 +967,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
+	__u16 command_id = READ_ONCE(cqe->command_id);
 	struct request *req;
 
 	/*
@@ -975,17 +976,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	 * aborts.  We don't even bother to allocate a struct request
 	 * for them but rather special case them here.
 	 */
-	if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) {
+	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
 				cqe->status, &cqe->result);
 		return;
 	}
 
-	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id);
 	if (unlikely(!req)) {
 		dev_warn(nvmeq->dev->ctrl.device,
 			"invalid id %d completed on queue %d\n",
-			cqe->command_id, le16_to_cpu(cqe->sq_id));
+			command_id, le16_to_cpu(cqe->sq_id));
 		return;
 	}
 
-- 
cgit v1.2.3


From 9b66fc02bec0ca613bc6d4c1d0049f727a95567d Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Wed, 30 Dec 2020 20:22:44 +0900
Subject: nvme: unexport functions with no external caller

There are no callers for nvme_reset_ctrl_sync() and
nvme_alloc_request_qid() so that we keep the symbols exported.

Unexport those functions, mark them static and update the header file
respectively.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 6 ++----
 drivers/nvme/host/nvme.h | 3 ---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ce1b61519441..70a63d7c1d02 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -179,7 +179,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
 
-int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 {
 	int ret;
 
@@ -192,7 +192,6 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
 
 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 {
@@ -578,7 +577,7 @@ struct request *nvme_alloc_request(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(nvme_alloc_request);
 
-struct request *nvme_alloc_request_qid(struct request_queue *q,
+static struct request *nvme_alloc_request_qid(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
 {
 	struct request *req;
@@ -589,7 +588,6 @@ struct request *nvme_alloc_request_qid(struct request_queue *q,
 		nvme_init_request(req, cmd);
 	return req;
 }
-EXPORT_SYMBOL_GPL(nvme_alloc_request_qid);
 
 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7e49f61f81df..9c4fbfe44c00 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -610,8 +610,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags);
-struct request *nvme_alloc_request_qid(struct request_queue *q,
-		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
 void nvme_cleanup_cmd(struct request *req);
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmd);
@@ -630,7 +628,6 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
-int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
 int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 
-- 
cgit v1.2.3


From 9ceb7863537748c67fa43ac4f2f565819bbd36e4 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@nvidia.com>
Date: Tue, 5 Jan 2021 10:46:54 +0200
Subject: nvmet-rdma: Fix list_del corruption on queue establishment failure

When a queue is in NVMET_RDMA_Q_CONNECTING state, it may has some
requests at rsp_wait_list. In case a disconnect occurs at this
state, no one will empty this list and will return the requests to
free_rsps list. Normally nvmet_rdma_queue_established() free those
requests after moving the queue to NVMET_RDMA_Q_LIVE state, but in
this case __nvmet_rdma_queue_disconnect() is called before. The
crash happens at nvmet_rdma_free_rsps() when calling
list_del(&rsp->free_list), because the request exists only at
the wait list. To fix the issue, simply clear rsp_wait_list when
destroying the queue.

Signed-off-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 5c1e7cb7fe0d..bdfc22eb2a10 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1641,6 +1641,16 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
 	spin_lock_irqsave(&queue->state_lock, flags);
 	switch (queue->state) {
 	case NVMET_RDMA_Q_CONNECTING:
+		while (!list_empty(&queue->rsp_wait_list)) {
+			struct nvmet_rdma_rsp *rsp;
+
+			rsp = list_first_entry(&queue->rsp_wait_list,
+					       struct nvmet_rdma_rsp,
+					       wait_list);
+			list_del(&rsp->wait_list);
+			nvmet_rdma_put_rsp(rsp);
+		}
+		fallthrough;
 	case NVMET_RDMA_Q_LIVE:
 		queue->state = NVMET_RDMA_Q_DISCONNECTING;
 		disconnect = true;
-- 
cgit v1.2.3


From 2b59787a223b79228fed9ade1bf6936194ddb8cd Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Tue, 5 Jan 2021 10:34:02 +0000
Subject: nvme: remove the unused status argument from nvme_trace_bio_complete

The only used argument in this function is the "req".

Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 drivers/nvme/host/nvme.h | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 70a63d7c1d02..f320273fc672 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -330,7 +330,7 @@ static inline void nvme_end_req(struct request *req)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
 
-	nvme_trace_bio_complete(req, status);
+	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9c4fbfe44c00..88a6b97247f5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -672,8 +672,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 		kblockd_schedule_work(&head->requeue_work);
 }
 
-static inline void nvme_trace_bio_complete(struct request *req,
-        blk_status_t status)
+static inline void nvme_trace_bio_complete(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
 
@@ -728,8 +727,7 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 }
-static inline void nvme_trace_bio_complete(struct request *req,
-        blk_status_t status)
+static inline void nvme_trace_bio_complete(struct request *req)
 {
 }
 static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
-- 
cgit v1.2.3


From 04a6a536bc3fd1436fc78c546c6b3ecdccbfaf6d Mon Sep 17 00:00:00 2001
From: Satya Tangirala <satyat@google.com>
Date: Thu, 24 Dec 2020 04:49:54 +0000
Subject: fs: Fix freeze_bdev()/thaw_bdev() accounting of bd_fsfreeze_sb

freeze/thaw_bdev() currently use bdev->bd_fsfreeze_count to infer
whether or not bdev->bd_fsfreeze_sb is valid (it's valid iff
bd_fsfreeze_count is non-zero). thaw_bdev() doesn't nullify
bd_fsfreeze_sb.

But this means a freeze_bdev() call followed by a thaw_bdev() call can
leave bd_fsfreeze_sb with a non-null value, while bd_fsfreeze_count is
zero. If freeze_bdev() is called again, and this time
get_active_super() returns NULL (e.g. because the FS is unmounted),
we'll end up with bd_fsfreeze_count > 0, but bd_fsfreeze_sb is
*untouched* - it stays the same (now garbage) value. A subsequent
thaw_bdev() will decide that the bd_fsfreeze_sb value is legitimate
(since bd_fsfreeze_count > 0), and attempt to use it.

Fix this by always setting bd_fsfreeze_sb to NULL when
bd_fsfreeze_count is successfully decremented to 0 in thaw_sb().
Alternatively, we could set bd_fsfreeze_sb to whatever
get_active_super() returns in freeze_bdev() whenever bd_fsfreeze_count
is successfully incremented to 1 from 0 (which can be achieved cleanly
by moving the line currently setting bd_fsfreeze_sb to immediately
after the "sync:" label, but it might be a little too subtle/easily
overlooked in future).

This fixes the currently panicking xfstests generic/085.

Fixes: 040f04bd2e82 ("fs: simplify freeze_bdev/thaw_bdev")
Signed-off-by: Satya Tangirala <satyat@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3e5b02f6606c..e454c5a81043 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -605,6 +605,8 @@ int thaw_bdev(struct block_device *bdev)
 		error = thaw_super(sb);
 	if (error)
 		bdev->bd_fsfreeze_count++;
+	else
+		bdev->bd_fsfreeze_sb = NULL;
 out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
-- 
cgit v1.2.3


From 2d2f6f1b4799428d160c021dd652bc3e3593945e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Jan 2021 19:36:40 +0100
Subject: block: pre-initialize struct block_device in bdev_alloc_inode

bdev_evict_inode and bdev_free_inode are also called for the root inode
of bdevfs, for which bdev_alloc is never called.  Move the zeroing o
f struct block_device and the initialization of the bd_bdi field into
bdev_alloc_inode to make sure they are initialized for the root inode
as well.

Fixes: e6cb53827ed6 ("block: initialize struct block_device in bdev_alloc")
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e454c5a81043..3b8963e228a1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -776,8 +776,11 @@ static struct kmem_cache * bdev_cachep __read_mostly;
 static struct inode *bdev_alloc_inode(struct super_block *sb)
 {
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+
 	if (!ei)
 		return NULL;
+	memset(&ei->bdev, 0, sizeof(ei->bdev));
+	ei->bdev.bd_bdi = &noop_backing_dev_info;
 	return &ei->vfs_inode;
 }
 
@@ -871,14 +874,12 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
 
 	bdev = I_BDEV(inode);
-	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 	spin_lock_init(&bdev->bd_size_lock);
 	bdev->bd_disk = disk;
 	bdev->bd_partno = partno;
 	bdev->bd_inode = inode;
-	bdev->bd_bdi = &noop_backing_dev_info;
 #ifdef CONFIG_SYSFS
 	INIT_LIST_HEAD(&bdev->bd_holder_disks);
 #endif
-- 
cgit v1.2.3


From 74acfa996b2aec2a4ea8587104c7e2f8d4c6aec2 Mon Sep 17 00:00:00 2001
From: Jack Wang <jinpu.wang@cloud.ionos.com>
Date: Fri, 8 Jan 2021 15:36:30 +0100
Subject: block/rnbd: Select SG_POOL for RNBD_CLIENT

lkp reboot following build error:
 drivers/block/rnbd/rnbd-clt.c: In function 'rnbd_softirq_done_fn':
>> drivers/block/rnbd/rnbd-clt.c:387:2: error: implicit declaration of function 'sg_free_table_chained' [-Werror=implicit-function-declaration]
     387 |  sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
         |  ^~~~~~~~~~~~~~~~~~~~~

The reason is CONFIG_SG_POOL is not enabled in the config, to
avoid such failure, select SG_POOL in Kconfig for RNBD_CLIENT.

Fixes: 5a1328d0c3a7 ("block/rnbd-clt: Dynamically allocate sglist for rnbd_iu")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rnbd/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/rnbd/Kconfig b/drivers/block/rnbd/Kconfig
index 4b6d3d816d1f..2ff05a0d2646 100644
--- a/drivers/block/rnbd/Kconfig
+++ b/drivers/block/rnbd/Kconfig
@@ -7,6 +7,7 @@ config BLK_DEV_RNBD_CLIENT
 	tristate "RDMA Network Block Device driver client"
 	depends on INFINIBAND_RTRS_CLIENT
 	select BLK_DEV_RNBD
+	select SG_POOL
 	help
 	  RNBD client is a network block device driver using rdma transport.
 
-- 
cgit v1.2.3


From 1a84e7c629f8f288e02236bc799f9b0be1cab4a7 Mon Sep 17 00:00:00 2001
From: Jack Wang <jinpu.wang@cloud.ionos.com>
Date: Fri, 8 Jan 2021 15:36:31 +0100
Subject: block/rnbd-srv: Fix use after free in rnbd_srv_sess_dev_force_close

KASAN detect following BUG:
[  778.215311] ==================================================================
[  778.216696] BUG: KASAN: use-after-free in rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.219037] Read of size 8 at addr ffff88b1d6516c28 by task tee/8842

[  778.220500] CPU: 37 PID: 8842 Comm: tee Kdump: loaded Not tainted 5.10.0-pserver #5.10.0-1+feature+linux+next+20201214.1025+0910d71
[  778.220529] Hardware name: Supermicro Super Server/X11DDW-L, BIOS 3.3 02/21/2020
[  778.220555] Call Trace:
[  778.220609]  dump_stack+0x99/0xcb
[  778.220667]  ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.220715]  print_address_description.constprop.7+0x1e/0x230
[  778.220750]  ? freeze_kernel_threads+0x73/0x73
[  778.220896]  ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.220932]  ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.220994]  kasan_report.cold.9+0x37/0x7c
[  778.221066]  ? kobject_put+0x80/0x270
[  778.221102]  ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.221184]  rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server]
[  778.221240]  rnbd_srv_dev_session_force_close_store+0x6a/0xc0 [rnbd_server]
[  778.221304]  ? sysfs_file_ops+0x90/0x90
[  778.221353]  kernfs_fop_write+0x141/0x240
[  778.221451]  vfs_write+0x142/0x4d0
[  778.221553]  ksys_write+0xc0/0x160
[  778.221602]  ? __ia32_sys_read+0x50/0x50
[  778.221684]  ? lockdep_hardirqs_on_prepare+0x13d/0x210
[  778.221718]  ? syscall_enter_from_user_mode+0x1c/0x50
[  778.221821]  do_syscall_64+0x33/0x40
[  778.221862]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  778.221896] RIP: 0033:0x7f4affdd9504
[  778.221928] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 48 8d 05 f9 61 0d 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 49 89 d4 55 48 89 f5 53
[  778.221956] RSP: 002b:00007fffebb36b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  778.222011] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f4affdd9504
[  778.222038] RDX: 0000000000000002 RSI: 00007fffebb36c50 RDI: 0000000000000003
[  778.222066] RBP: 00007fffebb36c50 R08: 0000556a151aa600 R09: 00007f4affeb1540
[  778.222094] R10: fffffffffffffc19 R11: 0000000000000246 R12: 0000556a151aa520
[  778.222121] R13: 0000000000000002 R14: 00007f4affea6760 R15: 0000000000000002

[  778.222764] Allocated by task 3212:
[  778.223285]  kasan_save_stack+0x19/0x40
[  778.223316]  __kasan_kmalloc.constprop.7+0xc1/0xd0
[  778.223347]  kmem_cache_alloc_trace+0x186/0x350
[  778.223382]  rnbd_srv_rdma_ev+0xf16/0x1690 [rnbd_server]
[  778.223422]  process_io_req+0x4d1/0x670 [rtrs_server]
[  778.223573]  __ib_process_cq+0x10a/0x350 [ib_core]
[  778.223709]  ib_cq_poll_work+0x31/0xb0 [ib_core]
[  778.223743]  process_one_work+0x521/0xa90
[  778.223773]  worker_thread+0x65/0x5b0
[  778.223802]  kthread+0x1f2/0x210
[  778.223833]  ret_from_fork+0x22/0x30

[  778.224296] Freed by task 8842:
[  778.224800]  kasan_save_stack+0x19/0x40
[  778.224829]  kasan_set_track+0x1c/0x30
[  778.224860]  kasan_set_free_info+0x1b/0x30
[  778.224889]  __kasan_slab_free+0x108/0x150
[  778.224919]  slab_free_freelist_hook+0x64/0x190
[  778.224947]  kfree+0xe2/0x650
[  778.224982]  rnbd_destroy_sess_dev+0x2fa/0x3b0 [rnbd_server]
[  778.225011]  kobject_put+0xda/0x270
[  778.225046]  rnbd_srv_sess_dev_force_close+0x30/0x60 [rnbd_server]
[  778.225081]  rnbd_srv_dev_session_force_close_store+0x6a/0xc0 [rnbd_server]
[  778.225111]  kernfs_fop_write+0x141/0x240
[  778.225140]  vfs_write+0x142/0x4d0
[  778.225169]  ksys_write+0xc0/0x160
[  778.225198]  do_syscall_64+0x33/0x40
[  778.225227]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

[  778.226506] The buggy address belongs to the object at ffff88b1d6516c00
                which belongs to the cache kmalloc-512 of size 512
[  778.227464] The buggy address is located 40 bytes inside of
                512-byte region [ffff88b1d6516c00, ffff88b1d6516e00)

The problem is in the sess_dev release function we call
rnbd_destroy_sess_dev, and could free the sess_dev already, but we still
set the keep_id in rnbd_srv_sess_dev_force_close, which lead to use
after free.

To fix it, move the keep_id before the sysfs removal, and cache the
rnbd_srv_session for lock accessing,

Fixes: 786998050cbc ("block/rnbd-srv: close a mapped device from server side.")
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Reviewed-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rnbd/rnbd-srv.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index b8e44331e494..a6a68d44f517 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -338,10 +338,12 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs,
 
 void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev)
 {
-	mutex_lock(&sess_dev->sess->lock);
-	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
-	mutex_unlock(&sess_dev->sess->lock);
+	struct rnbd_srv_session	*sess = sess_dev->sess;
+
 	sess_dev->keep_id = true;
+	mutex_lock(&sess->lock);
+	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
+	mutex_unlock(&sess->lock);
 }
 
 static int process_msg_close(struct rtrs_srv *rtrs,
-- 
cgit v1.2.3


From 80f99093d81370c5cec37fca3b5a6bdf6bddf0f6 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Fri, 8 Jan 2021 15:36:32 +0100
Subject: block/rnbd-clt: Fix sg table use after free

Since dynamically allocate sglist is used for rnbd_iu, we can't free sg
table after send_usr_msg since the callback function (cqe.done) could
still access the sglist.

Otherwise KASAN reports UAF issue:

[ 4856.600257] BUG: KASAN: use-after-free in dma_direct_unmap_sg+0x53/0x290
[ 4856.600772] Read of size 4 at addr ffff888206af3a98 by task swapper/1/0

[ 4856.601729] CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G        W         5.10.0-pserver #5.10.0-1+feature+linux+next+20201214.1025+0910d71
[ 4856.601748] Hardware name: Supermicro Super Server/X11DDW-L, BIOS 3.3 02/21/2020
[ 4856.601766] Call Trace:
[ 4856.601785]  <IRQ>
[ 4856.601822]  dump_stack+0x99/0xcb
[ 4856.601856]  ? dma_direct_unmap_sg+0x53/0x290
[ 4856.601888]  print_address_description.constprop.7+0x1e/0x230
[ 4856.601913]  ? freeze_kernel_threads+0x73/0x73
[ 4856.601965]  ? mark_held_locks+0x29/0xa0
[ 4856.602019]  ? dma_direct_unmap_sg+0x53/0x290
[ 4856.602039]  ? dma_direct_unmap_sg+0x53/0x290
[ 4856.602079]  kasan_report.cold.9+0x37/0x7c
[ 4856.602188]  ? mlx5_ib_post_recv+0x430/0x520 [mlx5_ib]
[ 4856.602209]  ? dma_direct_unmap_sg+0x53/0x290
[ 4856.602256]  dma_direct_unmap_sg+0x53/0x290
[ 4856.602366]  complete_rdma_req+0x188/0x4b0 [rtrs_client]
[ 4856.602451]  ? rtrs_clt_close+0x80/0x80 [rtrs_client]
[ 4856.602535]  ? mlx5_ib_poll_cq+0x48b/0x16e0 [mlx5_ib]
[ 4856.602589]  ? radix_tree_insert+0x3a0/0x3a0
[ 4856.602610]  ? do_raw_spin_lock+0x119/0x1d0
[ 4856.602647]  ? rwlock_bug.part.1+0x60/0x60
[ 4856.602740]  rtrs_clt_rdma_done+0x3f7/0x670 [rtrs_client]
[ 4856.602804]  ? rtrs_clt_rdma_cm_handler+0xda0/0xda0 [rtrs_client]
[ 4856.602857]  ? check_flags.part.31+0x6c/0x1f0
[ 4856.602927]  ? rcu_read_lock_sched_held+0xaf/0xe0
[ 4856.602963]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 4856.603137]  __ib_process_cq+0x10a/0x350 [ib_core]
[ 4856.603309]  ib_poll_handler+0x41/0x1c0 [ib_core]
[ 4856.603358]  irq_poll_softirq+0xe6/0x280
[ 4856.603392]  ? lockdep_hardirqs_on_prepare+0x111/0x210
[ 4856.603446]  __do_softirq+0x10d/0x646
[ 4856.603540]  asm_call_irq_on_stack+0x12/0x20
[ 4856.603563]  </IRQ>

[ 4856.605096] Allocated by task 8914:
[ 4856.605510]  kasan_save_stack+0x19/0x40
[ 4856.605532]  __kasan_kmalloc.constprop.7+0xc1/0xd0
[ 4856.605552]  __kmalloc+0x155/0x320
[ 4856.605574]  __sg_alloc_table+0x155/0x1c0
[ 4856.605594]  sg_alloc_table+0x1f/0x50
[ 4856.605620]  send_msg_sess_info+0x119/0x2e0 [rnbd_client]
[ 4856.605646]  remap_devs+0x71/0x210 [rnbd_client]
[ 4856.605676]  init_sess+0xad8/0xe10 [rtrs_client]
[ 4856.605706]  rtrs_clt_reconnect_work+0xd6/0x170 [rtrs_client]
[ 4856.605728]  process_one_work+0x521/0xa90
[ 4856.605748]  worker_thread+0x65/0x5b0
[ 4856.605769]  kthread+0x1f2/0x210
[ 4856.605789]  ret_from_fork+0x22/0x30

[ 4856.606159] Freed by task 8914:
[ 4856.606559]  kasan_save_stack+0x19/0x40
[ 4856.606580]  kasan_set_track+0x1c/0x30
[ 4856.606601]  kasan_set_free_info+0x1b/0x30
[ 4856.606622]  __kasan_slab_free+0x108/0x150
[ 4856.606642]  slab_free_freelist_hook+0x64/0x190
[ 4856.606661]  kfree+0xe2/0x650
[ 4856.606681]  __sg_free_table+0xa4/0x100
[ 4856.606707]  send_msg_sess_info+0x1d6/0x2e0 [rnbd_client]
[ 4856.606733]  remap_devs+0x71/0x210 [rnbd_client]
[ 4856.606763]  init_sess+0xad8/0xe10 [rtrs_client]
[ 4856.606792]  rtrs_clt_reconnect_work+0xd6/0x170 [rtrs_client]
[ 4856.606813]  process_one_work+0x521/0xa90
[ 4856.606833]  worker_thread+0x65/0x5b0
[ 4856.606853]  kthread+0x1f2/0x210
[ 4856.606872]  ret_from_fork+0x22/0x30

The solution is to free iu's sgtable after the iu is not used anymore.
And also move sg_alloc_table into rnbd_get_iu accordingly.

Fixes: 5a1328d0c3a7 ("block/rnbd-clt: Dynamically allocate sglist for rnbd_iu")
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rnbd/rnbd-clt.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 96e3f9fe8241..506e9094487f 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -375,12 +375,19 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
 	init_waitqueue_head(&iu->comp.wait);
 	iu->comp.errno = INT_MAX;
 
+	if (sg_alloc_table(&iu->sgt, 1, GFP_KERNEL)) {
+		rnbd_put_permit(sess, permit);
+		kfree(iu);
+		return NULL;
+	}
+
 	return iu;
 }
 
 static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
 {
 	if (atomic_dec_and_test(&iu->refcount)) {
+		sg_free_table(&iu->sgt);
 		rnbd_put_permit(sess, iu->permit);
 		kfree(iu);
 	}
@@ -487,8 +494,6 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait)
 	iu->buf = NULL;
 	iu->dev = dev;
 
-	sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
-
 	msg.hdr.type	= cpu_to_le16(RNBD_MSG_CLOSE);
 	msg.device_id	= cpu_to_le32(device_id);
 
@@ -502,7 +507,6 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait)
 		err = errno;
 	}
 
-	sg_free_table(&iu->sgt);
 	rnbd_put_iu(sess, iu);
 	return err;
 }
@@ -575,7 +579,6 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
 	iu->buf = rsp;
 	iu->dev = dev;
 
-	sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
 	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
 
 	msg.hdr.type	= cpu_to_le16(RNBD_MSG_OPEN);
@@ -594,7 +597,6 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
 		err = errno;
 	}
 
-	sg_free_table(&iu->sgt);
 	rnbd_put_iu(sess, iu);
 	return err;
 }
@@ -622,8 +624,6 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait)
 
 	iu->buf = rsp;
 	iu->sess = sess;
-
-	sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
 	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
 
 	msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO);
@@ -650,7 +650,6 @@ put_iu:
 	} else {
 		err = errno;
 	}
-	sg_free_table(&iu->sgt);
 	rnbd_put_iu(sess, iu);
 	return err;
 }
-- 
cgit v1.2.3


From ef8048dd2345d070c41bc7df16763fd4d8fac296 Mon Sep 17 00:00:00 2001
From: Swapnil Ingle <ingleswapnil@gmail.com>
Date: Fri, 8 Jan 2021 15:36:33 +0100
Subject: block/rnbd: Adding name to the Contributors List

Adding name to the Contributors List

Signed-off-by: Swapnil Ingle <ingleswapnil@gmail.com>
Acked-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Acked-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rnbd/README | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/rnbd/README b/drivers/block/rnbd/README
index 1773c0aa0bd4..080f58a5400a 100644
--- a/drivers/block/rnbd/README
+++ b/drivers/block/rnbd/README
@@ -90,3 +90,4 @@ Kleber Souza <kleber.souza@profitbricks.com>
 Lutz Pogrell <lutz.pogrell@cloud.ionos.com>
 Milind Dumbare <Milind.dumbare@gmail.com>
 Roman Penyaev <roman.penyaev@profitbricks.com>
+Swapnil Ingle <ingleswapnil@gmail.com>
-- 
cgit v1.2.3


From 3a21777c6ee99749bac10727b3c17e5bcfebe5c1 Mon Sep 17 00:00:00 2001
From: Jack Wang <jinpu.wang@cloud.ionos.com>
Date: Fri, 8 Jan 2021 15:36:34 +0100
Subject: block/rnbd-clt: avoid module unload race with close confirmation

We had kernel panic, it is caused by unload module and last
close confirmation.

call trace:
[1196029.743127]  free_sess+0x15/0x50 [rtrs_client]
[1196029.743128]  rtrs_clt_close+0x4c/0x70 [rtrs_client]
[1196029.743129]  ? rnbd_clt_unmap_device+0x1b0/0x1b0 [rnbd_client]
[1196029.743130]  close_rtrs+0x25/0x50 [rnbd_client]
[1196029.743131]  rnbd_client_exit+0x93/0xb99 [rnbd_client]
[1196029.743132]  __x64_sys_delete_module+0x190/0x260

And in the crashdump confirmation kworker is also running.
PID: 6943   TASK: ffff9e2ac8098000  CPU: 4   COMMAND: "kworker/4:2"
 #0 [ffffb206cf337c30] __schedule at ffffffff9f93f891
 #1 [ffffb206cf337cc8] schedule at ffffffff9f93fe98
 #2 [ffffb206cf337cd0] schedule_timeout at ffffffff9f943938
 #3 [ffffb206cf337d50] wait_for_completion at ffffffff9f9410a7
 #4 [ffffb206cf337da0] __flush_work at ffffffff9f08ce0e
 #5 [ffffb206cf337e20] rtrs_clt_close_conns at ffffffffc0d5f668 [rtrs_client]
 #6 [ffffb206cf337e48] rtrs_clt_close at ffffffffc0d5f801 [rtrs_client]
 #7 [ffffb206cf337e68] close_rtrs at ffffffffc0d26255 [rnbd_client]
 #8 [ffffb206cf337e78] free_sess at ffffffffc0d262ad [rnbd_client]
 #9 [ffffb206cf337e88] rnbd_clt_put_dev at ffffffffc0d266a7 [rnbd_client]

The problem is both code path try to close same session, which lead to
panic.

To fix it, just skip the sess if the refcount already drop to 0.

Fixes: f7a7a5c228d4 ("block/rnbd: client: main functionality")
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Reviewed-by: Gioh Kim <gi-oh.kim@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rnbd/rnbd-clt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 506e9094487f..45a470076652 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1697,7 +1697,8 @@ static void rnbd_destroy_sessions(void)
 	 */
 
 	list_for_each_entry_safe(sess, sn, &sess_list, list) {
-		WARN_ON(!rnbd_clt_get_sess(sess));
+		if (!rnbd_clt_get_sess(sess))
+			continue;
 		close_rtrs(sess);
 		list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
 			/*
-- 
cgit v1.2.3


From 02f938e9fed1681791605ca8b96c2d9da9355f6a Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Fri, 8 Jan 2021 16:55:37 +0800
Subject: blk-mq-debugfs: Add decode for BLK_MQ_F_TAG_HCTX_SHARED

Showing the hctx flags for when BLK_MQ_F_TAG_HCTX_SHARED is set gives
something like:

root@debian:/home/john# more /sys/kernel/debug/block/sda/hctx0/flags
alloc_policy=FIFO SHOULD_MERGE|TAG_QUEUE_SHARED|3

Add the decoding for that flag.

Fixes: 32bc15afed04b ("blk-mq: Facilitate a shared sbitmap per tagset")
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e21eed20a155..712a95f74ccb 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -246,6 +246,7 @@ static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(BLOCKING),
 	HCTX_FLAG_NAME(NO_SCHED),
 	HCTX_FLAG_NAME(STACKING),
+	HCTX_FLAG_NAME(TAG_HCTX_SHARED),
 };
 #undef HCTX_FLAG_NAME
 
-- 
cgit v1.2.3


From e80927079fd97b4d5457e3af2400a0087b561564 Mon Sep 17 00:00:00 2001
From: Yi Li <yili@winhong.com>
Date: Mon, 4 Jan 2021 15:41:18 +0800
Subject: bcache: set pdev_set_uuid before scond loop iteration

There is no need to reassign pdev_set_uuid in the second loop iteration,
so move it to the place before second loop.

Signed-off-by: Yi Li <yili@winhong.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a4752ac410dc..6aa23a6fb394 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2644,8 +2644,8 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
 	}
 
 	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+		char *pdev_set_uuid = pdev->dc->sb.set_uuid;
 		list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
-			char *pdev_set_uuid = pdev->dc->sb.set_uuid;
 			char *set_uuid = c->set_uuid;
 
 			if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
-- 
cgit v1.2.3


From f7b4943dea48a572ad751ce1f18a245d43debe7e Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 4 Jan 2021 15:41:19 +0800
Subject: bcache: fix typo from SUUP to SUPP in features.h

This patch fixes the following typos,
from BCH_FEATURE_COMPAT_SUUP to BCH_FEATURE_COMPAT_SUPP
from BCH_FEATURE_INCOMPAT_SUUP to BCH_FEATURE_INCOMPAT_SUPP
from BCH_FEATURE_INCOMPAT_SUUP to BCH_FEATURE_RO_COMPAT_SUPP

Fixes: d721a43ff69c ("bcache: increase super block version for cache device and backing device")
Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/features.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
index a1653c478041..32c5bbda2f0d 100644
--- a/drivers/md/bcache/features.h
+++ b/drivers/md/bcache/features.h
@@ -15,9 +15,9 @@
 /* Incompat feature set */
 #define BCH_FEATURE_INCOMPAT_LARGE_BUCKET	0x0001 /* 32bit bucket size */
 
-#define BCH_FEATURE_COMPAT_SUUP		0
-#define BCH_FEATURE_RO_COMPAT_SUUP	0
-#define BCH_FEATURE_INCOMPAT_SUUP	BCH_FEATURE_INCOMPAT_LARGE_BUCKET
+#define BCH_FEATURE_COMPAT_SUPP		0
+#define BCH_FEATURE_RO_COMPAT_SUPP	0
+#define BCH_FEATURE_INCOMPAT_SUPP	BCH_FEATURE_INCOMPAT_LARGE_BUCKET
 
 #define BCH_HAS_COMPAT_FEATURE(sb, mask) \
 		((sb)->feature_compat & (mask))
-- 
cgit v1.2.3


From 1dfc0686c29a9bbd3a446a29f9ccde3dec3bc75a Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 4 Jan 2021 15:41:20 +0800
Subject: bcache: check unsupported feature sets for bcache register

This patch adds the check for features which is incompatible for
current supported feature sets.

Now if the bcache device created by bcache-tools has features that
current kernel doesn't support, read_super() will fail with error
messoage. E.g. if an unsupported incompatible feature detected,
bcache register will fail with dmesg "bcache: register_bcache() error :
Unsupported incompatible feature found".

Fixes: d721a43ff69c ("bcache: increase super block version for cache device and backing device")
Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/features.h | 15 +++++++++++++++
 drivers/md/bcache/super.c    | 14 ++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
index 32c5bbda2f0d..e73724c2b49b 100644
--- a/drivers/md/bcache/features.h
+++ b/drivers/md/bcache/features.h
@@ -79,6 +79,21 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
 
 BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET);
 
+static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
+{
+	return ((sb->feature_compat & ~BCH_FEATURE_COMPAT_SUPP) != 0);
+}
+
+static inline bool bch_has_unknown_ro_compat_features(struct cache_sb *sb)
+{
+	return ((sb->feature_ro_compat & ~BCH_FEATURE_RO_COMPAT_SUPP) != 0);
+}
+
+static inline bool bch_has_unknown_incompat_features(struct cache_sb *sb)
+{
+	return ((sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0);
+}
+
 int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size);
 int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size);
 int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 6aa23a6fb394..f4674a3298af 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -228,6 +228,20 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 		sb->feature_compat = le64_to_cpu(s->feature_compat);
 		sb->feature_incompat = le64_to_cpu(s->feature_incompat);
 		sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
+
+		/* Check incompatible features */
+		err = "Unsupported compatible feature found";
+		if (bch_has_unknown_compat_features(sb))
+			goto err;
+
+		err = "Unsupported read-only compatible feature found";
+		if (bch_has_unknown_ro_compat_features(sb))
+			goto err;
+
+		err = "Unsupported incompatible feature found";
+		if (bch_has_unknown_incompat_features(sb))
+			goto err;
+
 		err = read_super_common(sb, bdev, s);
 		if (err)
 			goto err;
-- 
cgit v1.2.3


From b16671e8f493e3df40b1fb0dff4078f391c5099a Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 4 Jan 2021 15:41:21 +0800
Subject: bcache: introduce BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE for
 large bucket

When large bucket feature was added, BCH_FEATURE_INCOMPAT_LARGE_BUCKET
was introduced into the incompat feature set. It used bucket_size_hi
(which was added at the tail of struct cache_sb_disk) to extend current
16bit bucket size to 32bit with existing bucket_size in struct
cache_sb_disk.

This is not a good idea, there are two obvious problems,
- Bucket size is always value power of 2, if store log2(bucket size) in
  existing bucket_size of struct cache_sb_disk, it is unnecessary to add
  bucket_size_hi.
- Macro csum_set() assumes d[SB_JOURNAL_BUCKETS] is the last member in
  struct cache_sb_disk, bucket_size_hi was added after d[] which makes
  csum_set calculate an unexpected super block checksum.

To fix the above problems, this patch introduces a new incompat feature
bit BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, when this bit is set, it
means bucket_size in struct cache_sb_disk stores the order of power-of-2
bucket size value. When user specifies a bucket size larger than 32768
sectors, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE will be set to
incompat feature set, and bucket_size stores log2(bucket size) more
than store the real bucket size value.

The obsoleted BCH_FEATURE_INCOMPAT_LARGE_BUCKET won't be used anymore,
it is renamed to BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET and still only
recognized by kernel driver for legacy compatible purpose. The previous
bucket_size_hi is renmaed to obso_bucket_size_hi in struct cache_sb_disk
and not used in bcache-tools anymore.

For cache device created with BCH_FEATURE_INCOMPAT_LARGE_BUCKET feature,
bcache-tools and kernel driver still recognize the feature string and
display it as "obso_large_bucket".

With this change, the unnecessary extra space extend of bcache on-disk
super block can be avoided, and csum_set() may generate expected check
sum as well.

Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/features.c |  2 +-
 drivers/md/bcache/features.h | 11 ++++++++---
 drivers/md/bcache/super.c    | 22 +++++++++++++++++++---
 include/uapi/linux/bcache.h  |  2 +-
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c
index 6469223f0b77..d636b7b2d070 100644
--- a/drivers/md/bcache/features.c
+++ b/drivers/md/bcache/features.c
@@ -17,7 +17,7 @@ struct feature {
 };
 
 static struct feature feature_list[] = {
-	{BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET,
+	{BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE,
 		"large_bucket"},
 	{0, 0, 0 },
 };
diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
index e73724c2b49b..84fc2c0f0101 100644
--- a/drivers/md/bcache/features.h
+++ b/drivers/md/bcache/features.h
@@ -13,11 +13,15 @@
 
 /* Feature set definition */
 /* Incompat feature set */
-#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET	0x0001 /* 32bit bucket size */
+/* 32bit bucket size, obsoleted */
+#define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET		0x0001
+/* real bucket size is (1 << bucket_size) */
+#define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE	0x0002
 
 #define BCH_FEATURE_COMPAT_SUPP		0
 #define BCH_FEATURE_RO_COMPAT_SUPP	0
-#define BCH_FEATURE_INCOMPAT_SUPP	BCH_FEATURE_INCOMPAT_LARGE_BUCKET
+#define BCH_FEATURE_INCOMPAT_SUPP	(BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
+					 BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE)
 
 #define BCH_HAS_COMPAT_FEATURE(sb, mask) \
 		((sb)->feature_compat & (mask))
@@ -77,7 +81,8 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
 		~BCH##_FEATURE_INCOMPAT_##flagname; \
 }
 
-BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET);
+BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET);
+BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE);
 
 static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
 {
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f4674a3298af..3999641f1775 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -64,9 +64,25 @@ static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s
 {
 	unsigned int bucket_size = le16_to_cpu(s->bucket_size);
 
-	if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES &&
-	     bch_has_feature_large_bucket(sb))
-		bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16;
+	if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
+		if (bch_has_feature_large_bucket(sb)) {
+			unsigned int max, order;
+
+			max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
+			order = le16_to_cpu(s->bucket_size);
+			/*
+			 * bcache tool will make sure the overflow won't
+			 * happen, an error message here is enough.
+			 */
+			if (order > max)
+				pr_err("Bucket size (1 << %u) overflows\n",
+					order);
+			bucket_size = 1 << order;
+		} else if (bch_has_feature_obso_large_bucket(sb)) {
+			bucket_size +=
+				le16_to_cpu(s->obso_bucket_size_hi) << 16;
+		}
+	}
 
 	return bucket_size;
 }
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 52e8bcb33981..cf7399f03b71 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -213,7 +213,7 @@ struct cache_sb_disk {
 		__le16		keys;
 	};
 	__le64			d[SB_JOURNAL_BUCKETS];	/* journal buckets */
-	__le16			bucket_size_hi;
+	__le16			obso_bucket_size_hi;	/* obsoleted */
 };
 
 /*
-- 
cgit v1.2.3


From 5342fd4255021ef0c4ce7be52eea1c4ebda11c63 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 4 Jan 2021 15:41:22 +0800
Subject: bcache: set bcache device into read-only mode for
 BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET

If BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET is set in incompat feature
set, it means the cache device is created with obsoleted layout with
obso_bucket_site_hi. Now bcache does not support this feature bit, a new
BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE incompat feature bit is added
for a better layout to support large bucket size.

For the legacy compatibility purpose, if a cache device created with
obsoleted BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET feature bit, all bcache
devices attached to this cache set should be set to read-only. Then the
dirty data can be written back to backing device before re-create the
cache device with BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE feature bit
by the latest bcache-tools.

This patch checks BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET feature bit
when running a cache set and attach a bcache device to the cache set. If
this bit is set,
- When run a cache set, print an error kernel message to indicate all
  following attached bcache device will be read-only.
- When attach a bcache device, print an error kernel message to indicate
  the attached bcache device will be read-only, and ask users to update
  to latest bcache-tools.

Such change is only for cache device whose bucket size >= 32MB, this is
for the zoned SSD and almost nobody uses such large bucket size at this
moment. If you don't explicit set a large bucket size for a zoned SSD,
such change is totally transparent to your bcache device.

Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket")
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3999641f1775..2047a9cccdb5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1332,6 +1332,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	bcache_device_link(&dc->disk, c, "bdev");
 	atomic_inc(&c->attached_dev_nr);
 
+	if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
+		pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
+		pr_err("Please update to the latest bcache-tools to create the cache device\n");
+		set_disk_ro(dc->disk.disk, 1);
+	}
+
 	/* Allow the writeback thread to proceed */
 	up_write(&dc->writeback_lock);
 
@@ -1554,6 +1560,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 
 	bcache_device_link(d, c, "volume");
 
+	if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
+		pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
+		pr_err("Please update to the latest bcache-tools to create the cache device\n");
+		set_disk_ro(d->disk, 1);
+	}
+
 	return 0;
 err:
 	kobject_put(&d->kobj);
@@ -2113,6 +2125,9 @@ static int run_cache_set(struct cache_set *c)
 	c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
 	bcache_write_super(c);
 
+	if (bch_has_feature_obso_large_bucket(&c->cache->sb))
+		pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");
+
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
 		bch_cached_dev_attach(dc, c, NULL);
 
-- 
cgit v1.2.3