From 7f6c411c9b50cfab41cc798e003eff27608c7016 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 25 Mar 2021 14:12:34 -0400
Subject: hostfs: fix memory handling in follow_link()

1) argument should not be freed in any case - the caller already has
it as ->s_fs_info (and uses it a lot afterwards)
2) allocate readlink buffer with kmalloc() - the caller has no way
to tell if it's got that (on absolute symlink) or a result of
kasprintf().  Sure, for SLAB and SLUB kfree() works on results of
kmem_cache_alloc(), but that's not documented anywhere, might change
in the future *and* is already not true for SLOB.

Fixes: 52b209f7b848 ("get rid of hostfs_read_inode()")
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hostfs/hostfs_kern.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 29e407762626..743a005a5c64 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -144,7 +144,7 @@ static char *follow_link(char *link)
 	char *name, *resolved, *end;
 	int n;
 
-	name = __getname();
+	name = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!name) {
 		n = -ENOMEM;
 		goto out_free;
@@ -173,12 +173,11 @@ static char *follow_link(char *link)
 		goto out_free;
 	}
 
-	__putname(name);
-	kfree(link);
+	kfree(name);
 	return resolved;
 
  out_free:
-	__putname(name);
+	kfree(name);
 	return ERR_PTR(n);
 }
 
-- 
cgit v1.2.3


From 9b5b872215fe6d1ca6a1ef411f130bd58e269012 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 2 Apr 2021 10:29:36 +0200
Subject: file: fix close_range() for unshare+cloexec

syzbot reported a bug when putting the last reference to a tasks file
descriptor table. Debugging this showed we didn't recalculate the
current maximum fd number for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
after we unshared the file descriptors table. So max_fd could exceed the
current fdtable maximum causing us to set excessive bits. As a concrete
example, let's say the user requested everything from fd 4 to ~0UL to be
closed and their current fdtable size is 256 with their highest open fd
being 4. With CLOSE_RANGE_UNSHARE the caller will end up with a new
fdtable which has room for 64 file descriptors since that is the lowest
fdtable size we accept. But now max_fd will still point to 255 and needs
to be adjusted. Fix this by retrieving the correct maximum fd value in
__range_cloexec().

Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com
Fixes: 582f1fb6b721 ("fs, close_range: add flag CLOSE_RANGE_CLOEXEC")
Fixes: fec8a6a69103 ("close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Giuseppe Scrivano <gscrivan@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/file.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index f3a4bac2cbe9..f633348029a5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -629,17 +629,30 @@ int close_fd(unsigned fd)
 }
 EXPORT_SYMBOL(close_fd); /* for ksys_close() */
 
+/**
+ * last_fd - return last valid index into fd table
+ * @cur_fds: files struct
+ *
+ * Context: Either rcu read lock or files_lock must be held.
+ *
+ * Returns: Last valid index into fdtable.
+ */
+static inline unsigned last_fd(struct fdtable *fdt)
+{
+	return fdt->max_fds - 1;
+}
+
 static inline void __range_cloexec(struct files_struct *cur_fds,
 				   unsigned int fd, unsigned int max_fd)
 {
 	struct fdtable *fdt;
 
-	if (fd > max_fd)
-		return;
-
+	/* make sure we're using the correct maximum value */
 	spin_lock(&cur_fds->file_lock);
 	fdt = files_fdtable(cur_fds);
-	bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
+	max_fd = min(last_fd(fdt), max_fd);
+	if (fd <= max_fd)
+		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
 	spin_unlock(&cur_fds->file_lock);
 }
 
-- 
cgit v1.2.3


From 7d01ef7585c07afaf487759a48486228cd065726 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 6 Apr 2021 12:33:07 -0400
Subject: Make sure nd->path.mnt and nd->path.dentry are always valid pointers

Initialize them in set_nameidata() and make sure that terminate_walk() clears them
once the pointers become potentially invalid (i.e. we leave RCU mode or drop them
in non-RCU one).  Currently we have "path_init() always initializes them and nobody
accesses them outside of path_init()/terminate_walk() segments", which is asking
for trouble.

With that change we would have nd->path.{mnt,dentry}
	1) always valid - NULL or pointing to currently allocated objects.
	2) non-NULL while we are successfully walking
	3) NULL when we are not walking at all
	4) contributing to refcounts whenever non-NULL outside of RCU mode.

Fixes: 6c6ec2b0a3e0 ("fs: add support for LOOKUP_CACHED")
Reported-by: syzbot+c88a7030da47945a3cc3@syzkaller.appspotmail.com
Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..fc8760d4314e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->stack = p->internal;
 	p->dfd = dfd;
 	p->name = name;
+	p->path.mnt = NULL;
+	p->path.dentry = NULL;
 	p->total_link_count = old ? old->total_link_count : 0;
 	p->saved = old;
 	current->nameidata = p;
@@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd)
 		rcu_read_unlock();
 	}
 	nd->depth = 0;
+	nd->path.mnt = NULL;
+	nd->path.dentry = NULL;
 }
 
 /* path_put is needed afterwards regardless of success or failure */
@@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	}
 
 	nd->root.mnt = NULL;
-	nd->path.mnt = NULL;
-	nd->path.dentry = NULL;
 
 	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
 	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
-- 
cgit v1.2.3


From 4f0ed93fb92d3528c73c80317509df3f800a222b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 6 Apr 2021 19:46:51 -0400
Subject: LOOKUP_MOUNTPOINT: we are cleaning "jumped" flag too late

That (and traversals in case of umount .) should be done before
complete_walk().  Either a braino or mismerge damage on queue
reorders - either way, I should've spotted that much earlier.

Fucked-up-by: Al Viro <viro@zeniv.linux.org.uk>
X-Paperbag: Brown
Fixes: 161aff1d93ab "LOOKUP_MOUNTPOINT: fold path_mountpointat() into path_lookupat()"
Cc: stable@vger.kernel.org # v5.7+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index fc8760d4314e..48a2f288e802 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2421,16 +2421,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 	while (!(err = link_path_walk(s, nd)) &&
 	       (s = lookup_last(nd)) != NULL)
 		;
+	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
+		err = handle_lookup_down(nd);
+		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+	}
 	if (!err)
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY)
 		if (!d_can_lookup(nd->path.dentry))
 			err = -ENOTDIR;
-	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
-		err = handle_lookup_down(nd);
-		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
-	}
 	if (!err) {
 		*path = nd->path;
 		nd->path.mnt = NULL;
-- 
cgit v1.2.3


From 4e456b30f78c429b183db420e23b26cde7e03a78 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Wed, 31 Mar 2021 14:35:24 +0000
Subject: cifs: On cifs_reconnect, resolve the hostname again.

On cifs_reconnect, make sure that DNS resolution happens again.
It could be the cause of connection to go dead in the first place.

This also contains the fix for a build issue identified by Intel bot.
Reported-by: kernel test robot <lkp@intel.com>

Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
CC: <stable@vger.kernel.org> # 5.11+
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/Kconfig   |  3 +--
 fs/cifs/Makefile  |  5 +++--
 fs/cifs/connect.c | 17 ++++++++++++++++-
 3 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index fe03cbdae959..bf52e9326ebe 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -18,6 +18,7 @@ config CIFS
 	select CRYPTO_AES
 	select CRYPTO_LIB_DES
 	select KEYS
+	select DNS_RESOLVER
 	help
 	  This is the client VFS module for the SMB3 family of NAS protocols,
 	  (including support for the most recent, most secure dialect SMB3.1.1)
@@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH
 config CIFS_UPCALL
 	bool "Kerberos/SPNEGO advanced session setup"
 	depends on CIFS
-	select DNS_RESOLVER
 	help
 	  Enables an upcall mechanism for CIFS which accesses userspace helper
 	  utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
@@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS
 config CIFS_DFS_UPCALL
 	bool "DFS feature support"
 	depends on CIFS
-	select DNS_RESOLVER
 	help
 	  Distributed File System (DFS) support is used to access shares
 	  transparently in an enterprise name space, even if the share
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 5213b20843b5..3ee3b7de4ded 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
 	  cifs_unicode.o nterr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
-	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o
+	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
+	  dns_resolve.o
 
 cifs-$(CONFIG_CIFS_XATTR) += xattr.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o
 
 cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index eec8a2052da2..24668eb006c6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,7 +87,6 @@ static void cifs_prune_tlinks(struct work_struct *work);
  *
  * This should be called with server->srv_mutex held.
  */
-#ifdef CONFIG_CIFS_DFS_UPCALL
 static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
 {
 	int rc;
@@ -124,6 +123,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
 	return !rc ? -1 : 0;
 }
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
 /* These functions must be called with server->srv_mutex held */
 static void reconn_set_next_dfs_target(struct TCP_Server_Info *server,
 				       struct cifs_sb_info *cifs_sb,
@@ -321,14 +321,29 @@ cifs_reconnect(struct TCP_Server_Info *server)
 #endif
 
 #ifdef CONFIG_CIFS_DFS_UPCALL
+		if (cifs_sb && cifs_sb->origin_fullpath)
 			/*
 			 * Set up next DFS target server (if any) for reconnect. If DFS
 			 * feature is disabled, then we will retry last server we
 			 * connected to before.
 			 */
 			reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it);
+		else {
+#endif
+			/*
+			 * Resolve the hostname again to make sure that IP address is up-to-date.
+			 */
+			rc = reconn_set_ipaddr_from_hostname(server);
+			if (rc) {
+				cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+						__func__, rc);
+			}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		}
 #endif
 
+
 #ifdef CONFIG_CIFS_SWN_UPCALL
 		}
 #endif
-- 
cgit v1.2.3


From d135be0a7fb83f4dd68721b3355fec6de686834c Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 1 Apr 2021 15:51:17 +0800
Subject: fs: cifs: Remove unnecessary struct declaration

struct cifs_readdata is declared twice. One is declared
at 208th line.
And struct cifs_readdata is defined blew.
The declaration here is not needed. Remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/cifsglob.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 67c056a9a519..ec824ab8c5ca 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1283,8 +1283,6 @@ struct cifs_aio_ctx {
 	bool			direct_io;
 };
 
-struct cifs_readdata;
-
 /* asynchronous read support */
 struct cifs_readdata {
 	struct kref			refcount;
-- 
cgit v1.2.3


From 0fc9322ab5e1fe6910c9673e1a7ff29f7dd72611 Mon Sep 17 00:00:00 2001
From: Maciek Borzecki <maciek.borzecki@gmail.com>
Date: Tue, 6 Apr 2021 17:02:29 +0200
Subject: cifs: escape spaces in share names

Commit 653a5efb849a ("cifs: update super_operations to show_devname")
introduced the display of devname for cifs mounts. However, when mounting
a share which has a whitespace in the name, that exact share name is also
displayed in mountinfo. Make sure that all whitespace is escaped.

Signed-off-by: Maciek Borzecki <maciek.borzecki@gmail.com>
CC: <stable@vger.kernel.org> # 5.11+
Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/cifsfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 099ad9f3660b..5ddd20b62484 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -476,7 +476,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root)
 		seq_puts(m, "none");
 	else {
 		convert_delimiter(devname, '/');
-		seq_puts(m, devname);
+		/* escape all spaces in share names */
+		seq_escape(m, devname, " \t");
 		kfree(devname);
 	}
 	return 0;
-- 
cgit v1.2.3


From 6ad7f2332e84c46f0c94e73e05b5b7c2bc1a6b74 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 8 Apr 2021 01:54:39 +0100
Subject: io_uring: clear F_REISSUE right after getting it

There are lots of ways r/w request may continue its path after getting
REQ_F_REISSUE, it's not necessarily io-wq and can be, e.g. apoll,
and submitted via  io_async_task_func() -> __io_req_task_submit()

Clear the flag right after getting it, so the next attempt is well
prepared regardless how the request will be executed.

Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/11dcead939343f4e27cab0074d34afcab771bfa4.1617842918.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 65a17d560a73..f1881ac0744b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3294,6 +3294,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	ret = io_iter_do_read(req, iter);
 
 	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
+		req->flags &= ~REQ_F_REISSUE;
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto done;
@@ -3417,8 +3418,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	else
 		ret2 = -EINVAL;
 
-	if (req->flags & REQ_F_REISSUE)
+	if (req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
 		ret2 = -EAGAIN;
+	}
 
 	/*
 	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
@@ -6173,7 +6176,6 @@ static void io_wq_submit_work(struct io_wq_work *work)
 		ret = -ECANCELED;
 
 	if (!ret) {
-		req->flags &= ~REQ_F_REISSUE;
 		do {
 			ret = io_issue_sqe(req, 0);
 			/*
-- 
cgit v1.2.3


From 9728463737db027557e8ba315cbbca6b81122c04 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 8 Apr 2021 19:28:03 +0100
Subject: io_uring: fix rw req completion

WARNING: at fs/io_uring.c:8578 io_ring_exit_work.cold+0x0/0x18

As reissuing is now passed back by REQ_F_REISSUE and kiocb_done()
internally uses __io_complete_rw(), it may stop after setting the flag
so leaving a dangling request.

There are tricky edge cases, e.g. reading beyound file, boundary, so
the easiest way is to hand code reissue in kiocb_done() as
__io_complete_rw() was doing for us before.

Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f602250d292f8a84cca9a01d747744d1e797be26.1617842918.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f1881ac0744b..bd14327c8e7e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2762,6 +2762,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	struct io_async_rw *io = req->async_data;
+	bool check_reissue = kiocb->ki_complete == io_complete_rw;
 
 	/* add previously done IO, if any */
 	if (io && io->bytes_done > 0) {
@@ -2777,6 +2778,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 		__io_complete_rw(req, ret, 0, issue_flags);
 	else
 		io_rw_done(kiocb, ret);
+
+	if (check_reissue && req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
+		if (!io_rw_reissue(req)) {
+			int cflags = 0;
+
+			req_set_fail_links(req);
+			if (req->flags & REQ_F_BUFFER_SELECTED)
+				cflags = io_put_rw_kbuf(req);
+			__io_req_complete(req, issue_flags, ret, cflags);
+		}
+	}
 }
 
 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
-- 
cgit v1.2.3


From c60eb049f4a19ddddcd3ee97a9c79ab8066a6a03 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 8 Apr 2021 01:54:42 +0100
Subject: io-wq: cancel unbounded works on io-wq destroy

WARNING: CPU: 5 PID: 227 at fs/io_uring.c:8578 io_ring_exit_work+0xe6/0x470
RIP: 0010:io_ring_exit_work+0xe6/0x470
Call Trace:
 process_one_work+0x206/0x400
 worker_thread+0x4a/0x3d0
 kthread+0x129/0x170
 ret_from_fork+0x22/0x30

INFO: task lfs-openat:2359 blocked for more than 245 seconds.
task:lfs-openat      state:D stack:    0 pid: 2359 ppid:     1 flags:0x00000004
Call Trace:
 ...
 wait_for_completion+0x8b/0xf0
 io_wq_destroy_manager+0x24/0x60
 io_wq_put_and_exit+0x18/0x30
 io_uring_clean_tctx+0x76/0xa0
 __io_uring_files_cancel+0x1b9/0x2e0
 do_exit+0xc0/0xb40
 ...

Even after io-wq destroy has been issued io-wq worker threads will
continue executing all left work items as usual, and may hang waiting
for I/O that won't ever complete (aka unbounded).

[<0>] pipe_read+0x306/0x450
[<0>] io_iter_do_read+0x1e/0x40
[<0>] io_read+0xd5/0x330
[<0>] io_issue_sqe+0xd21/0x18a0
[<0>] io_wq_submit_work+0x6c/0x140
[<0>] io_worker_handle_work+0x17d/0x400
[<0>] io_wqe_worker+0x2c0/0x330
[<0>] ret_from_fork+0x22/0x30

Cancel all unbounded I/O instead of executing them. This changes the
user visible behaviour, but that's inevitable as io-wq is not per task.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/cd4b543154154cba055cf86f351441c2174d7f71.1617842918.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 433c4d3c3c1c..4eba531bea5a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -415,6 +415,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
+	bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
 
 	do {
 		struct io_wq_work *work;
@@ -444,6 +445,9 @@ get_next:
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
+
+			if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
+				work->flags |= IO_WQ_WORK_CANCEL;
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
-- 
cgit v1.2.3


From 90bd070aae6c4fb5d302f9c4b9c88be60c8197ec Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Fri, 9 Apr 2021 13:27:29 -0700
Subject: ocfs2: fix deadlock between setattr and dio_end_io_write

The following deadlock is detected:

  truncate -> setattr path is waiting for pending direct IO to be done (inode->i_dio_count become zero) with inode->i_rwsem held (down_write).

  PID: 14827  TASK: ffff881686a9af80  CPU: 20  COMMAND: "ora_p005_hrltd9"
   #0  __schedule at ffffffff818667cc
   #1  schedule at ffffffff81866de6
   #2  inode_dio_wait at ffffffff812a2d04
   #3  ocfs2_setattr at ffffffffc05f322e [ocfs2]
   #4  notify_change at ffffffff812a5a09
   #5  do_truncate at ffffffff812808f5
   #6  do_sys_ftruncate.constprop.18 at ffffffff81280cf2
   #7  sys_ftruncate at ffffffff81280d8e
   #8  do_syscall_64 at ffffffff81003949
   #9  entry_SYSCALL_64_after_hwframe at ffffffff81a001ad

dio completion path is going to complete one direct IO (decrement
inode->i_dio_count), but before that it hung at locking inode->i_rwsem:

   #0  __schedule+700 at ffffffff818667cc
   #1  schedule+54 at ffffffff81866de6
   #2  rwsem_down_write_failed+536 at ffffffff8186aa28
   #3  call_rwsem_down_write_failed+23 at ffffffff8185a1b7
   #4  down_write+45 at ffffffff81869c9d
   #5  ocfs2_dio_end_io_write+180 at ffffffffc05d5444 [ocfs2]
   #6  ocfs2_dio_end_io+85 at ffffffffc05d5a85 [ocfs2]
   #7  dio_complete+140 at ffffffff812c873c
   #8  dio_aio_complete_work+25 at ffffffff812c89f9
   #9  process_one_work+361 at ffffffff810b1889
  #10  worker_thread+77 at ffffffff810b233d
  #11  kthread+261 at ffffffff810b7fd5
  #12  ret_from_fork+62 at ffffffff81a0035e

Thus above forms ABBA deadlock.  The same deadlock was mentioned in
upstream commit 28f5a8a7c033 ("ocfs2: should wait dio before inode lock
in ocfs2_setattr()").  It seems that that commit only removed the
cluster lock (the victim of above dead lock) from the ABBA deadlock
party.

End-user visible effects: Process hang in truncate -> ocfs2_setattr path
and other processes hang at ocfs2_dio_end_io_write path.

This is to fix the deadlock itself.  It removes inode_lock() call from
dio completion path to remove the deadlock and add ip_alloc_sem lock in
setattr path to synchronize the inode modifications.

[wen.gang.wang@oracle.com: remove the "had_alloc_lock" as suggested]
  Link: https://lkml.kernel.org/r/20210402171344.1605-1-wen.gang.wang@oracle.com

Link: https://lkml.kernel.org/r/20210331203654.3911-1-wen.gang.wang@oracle.com
Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 11 +----------
 fs/ocfs2/file.c |  8 ++++++--
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3bfb4147895a..ad20403b383f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
 	loff_t end = offset + bytes;
-	int ret = 0, credits = 0, locked = 0;
+	int ret = 0, credits = 0;
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
@@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
 	    !dwc->dw_orphaned)
 		goto out;
 
-	/* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
-	 * are in that context. */
-	if (dwc->dw_writer_pid != task_pid_nr(current)) {
-		inode_lock(inode);
-		locked = 1;
-	}
-
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2393,8 +2386,6 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 	ocfs2_run_deallocs(osb, &dealloc);
-	if (locked)
-		inode_unlock(inode);
 	ocfs2_dio_free_write_ctx(inode, dwc);
 
 	return ret;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6611c64ca0be..5edc1d0cf115 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 				goto bail_unlock;
 			}
 		}
+		down_write(&OCFS2_I(inode)->ip_alloc_sem);
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
 					   2 * ocfs2_quota_trans_credits(sb));
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
-			goto bail_unlock;
+			goto bail_unlock_alloc;
 		}
 		status = __dquot_transfer(inode, transfer_to);
 		if (status < 0)
 			goto bail_commit;
 	} else {
+		down_write(&OCFS2_I(inode)->ip_alloc_sem);
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
-			goto bail_unlock;
+			goto bail_unlock_alloc;
 		}
 	}
 
@@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
+bail_unlock_alloc:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 bail_unlock:
 	if (status && inode_locked) {
 		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
-- 
cgit v1.2.3


From df41872b68601059dd4a84858952dcae58acd331 Mon Sep 17 00:00:00 2001
From: Jack Qiu <jack.qiu@huawei.com>
Date: Fri, 9 Apr 2021 13:27:35 -0700
Subject: fs: direct-io: fix missing sdio->boundary

I encountered a hung task issue, but not a performance one.  I run DIO
on a device (need lba continuous, for example open channel ssd), maybe
hungtask in below case:

  DIO:						Checkpoint:
  get addr A(at boundary), merge into BIO,
  no submit because boundary missing
						flush dirty data(get addr A+1), wait IO(A+1)
						writeback timeout, because DIO(A) didn't submit
  get addr A+2 fail, because checkpoint is doing

dio_send_cur_page() may clear sdio->boundary, so prevent it from missing
a boundary.

Link: https://lkml.kernel.org/r/20210322042253.38312-1-jack.qiu@huawei.com
Fixes: b1058b981272 ("direct-io: submit bio after boundary buffer is added to it")
Signed-off-by: Jack Qiu <jack.qiu@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b61491bf3166..b2e86e739d7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		    struct buffer_head *map_bh)
 {
 	int ret = 0;
+	int boundary = sdio->boundary;	/* dio_send_cur_page may clear it */
 
 	if (dio->op == REQ_OP_WRITE) {
 		/*
@@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
 out:
 	/*
-	 * If sdio->boundary then we want to schedule the IO now to
+	 * If boundary then we want to schedule the IO now to
 	 * avoid metadata seeks.
 	 */
-	if (sdio->boundary) {
+	if (boundary) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
 		if (sdio->bio)
 			dio_bio_submit(dio, sdio);
-- 
cgit v1.2.3


From 53b74fa990bf76f290aa5930abfcf37424a1a865 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Thu, 8 Apr 2021 17:25:28 +0900
Subject: btrfs: zoned: move superblock logging zone location

Moves the location of the superblock logging zones. The new locations of
the logging zones are now determined based on fixed block addresses
instead of on fixed zone numbers.

The old placement method based on fixed zone numbers causes problems when
one needs to inspect a file system image without access to the drive zone
information. In such case, the super block locations cannot be reliably
determined as the zone size is unknown. By locating the superblock logging
zones using fixed addresses, we can scan a dumped file system image without
the zone information since a super block copy will always be present at or
after the fixed known locations.

Introduce the following three pairs of zones containing fixed offset
locations, regardless of the device zone size.

  - primary superblock: offset   0B (and the following zone)
  - first copy:         offset 512G (and the following zone)
  - Second copy:        offset   4T (4096G, and the following zone)

If a logging zone is outside of the disk capacity, we do not record the
superblock copy.

The first copy position is much larger than for a non-zoned filesystem,
which is at 64M.  This is to avoid overlapping with the log zones for
the primary superblock. This higher location is arbitrary but allows
supporting devices with very large zone sizes, plus some space around in
between.

Such large zone size is unrealistic and very unlikely to ever be seen in
real devices. Currently, SMR disks have a zone size of 256MB, and we are
expecting ZNS drives to be in the 1-4GB range, so this limit gives us
room to breathe. For now, we only allow zone sizes up to 8GB. The
maximum zone size that would still fit in the space is 256G.

The fixed location addresses are somewhat arbitrary, with the intent of
maintaining superblock reliability for smaller and larger devices, with
the preference for the latter. For this reason, there are two superblocks
under the first 1T. This should cover use cases for physical devices and
for emulated/device-mapper devices.

The superblock logging zones are reserved for superblock logging and
never used for data or metadata blocks. Note that we only reserve the
two zones per primary/copy actually used for superblock logging. We do
not reserve the ranges of zones possibly containing superblocks with the
largest supported zone size (0-16GB, 512G-528GB, 4096G-4112G).

The zones containing the fixed location offsets used to store
superblocks on a non-zoned volume are also reserved to avoid confusion.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 53 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 43948bd40e02..ba7a303300a3 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -21,9 +21,30 @@
 /* Pseudo write pointer value for conventional zone */
 #define WP_CONVENTIONAL ((u64)-2)
 
+/*
+ * Location of the first zone of superblock logging zone pairs.
+ *
+ * - primary superblock:    0B (zone 0)
+ * - first copy:          512G (zone starting at that offset)
+ * - second copy:           4T (zone starting at that offset)
+ */
+#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
+#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
+#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)
+
+#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+
 /* Number of superblock log zones */
 #define BTRFS_NR_SB_LOG_ZONES 2
 
+/*
+ * Maximum supported zone size. Currently, SMR disks have a zone size of
+ * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
+ * expect the zone size to become larger than 8GiB in the near future.
+ */
+#define BTRFS_MAX_ZONE_SIZE		SZ_8G
+
 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
 {
 	struct blk_zone *zones = data;
@@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 }
 
 /*
- * The following zones are reserved as the circular buffer on ZONED btrfs.
- *  - The primary superblock: zones 0 and 1
- *  - The first copy: zones 16 and 17
- *  - The second copy: zones 1024 or zone at 256GB which is minimum, and
- *                     the following one
+ * Get the first zone number of the superblock mirror
  */
 static inline u32 sb_zone_number(int shift, int mirror)
 {
-	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+	u64 zone;
 
+	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 	switch (mirror) {
-	case 0: return 0;
-	case 1: return 16;
-	case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+	case 0: zone = 0; break;
+	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
+	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
 	}
 
-	return 0;
+	ASSERT(zone <= U32_MAX);
+
+	return (u32)zone;
 }
 
 /*
@@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 		zone_sectors = bdev_zone_sectors(bdev);
 	}
 
-	nr_sectors = bdev_nr_sectors(bdev);
 	/* Check if it's power of 2 (see is_power_of_2) */
 	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+
+	/* We reject devices with a zone size larger than 8GB */
+	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
+		btrfs_err_in_rcu(fs_info,
+		"zoned: %s: zone size %llu larger than supported maximum %llu",
+				 rcu_str_deref(device->name),
+				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nr_sectors = bdev_nr_sectors(bdev);
 	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 	zone_info->max_zone_append_size =
 		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
-- 
cgit v1.2.3