From 7f6c411c9b50cfab41cc798e003eff27608c7016 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 25 Mar 2021 14:12:34 -0400 Subject: hostfs: fix memory handling in follow_link() 1) argument should not be freed in any case - the caller already has it as ->s_fs_info (and uses it a lot afterwards) 2) allocate readlink buffer with kmalloc() - the caller has no way to tell if it's got that (on absolute symlink) or a result of kasprintf(). Sure, for SLAB and SLUB kfree() works on results of kmem_cache_alloc(), but that's not documented anywhere, might change in the future *and* is already not true for SLOB. Fixes: 52b209f7b848 ("get rid of hostfs_read_inode()") Signed-off-by: Al Viro --- fs/hostfs/hostfs_kern.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 29e407762626..743a005a5c64 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -144,7 +144,7 @@ static char *follow_link(char *link) char *name, *resolved, *end; int n; - name = __getname(); + name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { n = -ENOMEM; goto out_free; @@ -173,12 +173,11 @@ static char *follow_link(char *link) goto out_free; } - __putname(name); - kfree(link); + kfree(name); return resolved; out_free: - __putname(name); + kfree(name); return ERR_PTR(n); } -- cgit v1.2.3 From 9b5b872215fe6d1ca6a1ef411f130bd58e269012 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 2 Apr 2021 10:29:36 +0200 Subject: file: fix close_range() for unshare+cloexec syzbot reported a bug when putting the last reference to a tasks file descriptor table. Debugging this showed we didn't recalculate the current maximum fd number for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC after we unshared the file descriptors table. So max_fd could exceed the current fdtable maximum causing us to set excessive bits. As a concrete example, let's say the user requested everything from fd 4 to ~0UL to be closed and their current fdtable size is 256 with their highest open fd being 4. With CLOSE_RANGE_UNSHARE the caller will end up with a new fdtable which has room for 64 file descriptors since that is the lowest fdtable size we accept. But now max_fd will still point to 255 and needs to be adjusted. Fix this by retrieving the correct maximum fd value in __range_cloexec(). Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com Fixes: 582f1fb6b721 ("fs, close_range: add flag CLOSE_RANGE_CLOEXEC") Fixes: fec8a6a69103 ("close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC") Cc: Christoph Hellwig Cc: Giuseppe Scrivano Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/file.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index f3a4bac2cbe9..f633348029a5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,17 +629,30 @@ int close_fd(unsigned fd) } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +/** + * last_fd - return last valid index into fd table + * @cur_fds: files struct + * + * Context: Either rcu read lock or files_lock must be held. + * + * Returns: Last valid index into fdtable. + */ +static inline unsigned last_fd(struct fdtable *fdt) +{ + return fdt->max_fds - 1; +} + static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; - if (fd > max_fd) - return; - + /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + max_fd = min(last_fd(fdt), max_fd); + if (fd <= max_fd) + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } -- cgit v1.2.3 From 7d01ef7585c07afaf487759a48486228cd065726 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Apr 2021 12:33:07 -0400 Subject: Make sure nd->path.mnt and nd->path.dentry are always valid pointers Initialize them in set_nameidata() and make sure that terminate_walk() clears them once the pointers become potentially invalid (i.e. we leave RCU mode or drop them in non-RCU one). Currently we have "path_init() always initializes them and nobody accesses them outside of path_init()/terminate_walk() segments", which is asking for trouble. With that change we would have nd->path.{mnt,dentry} 1) always valid - NULL or pointing to currently allocated objects. 2) non-NULL while we are successfully walking 3) NULL when we are not walking at all 4) contributing to refcounts whenever non-NULL outside of RCU mode. Fixes: 6c6ec2b0a3e0 ("fs: add support for LOOKUP_CACHED") Reported-by: syzbot+c88a7030da47945a3cc3@syzkaller.appspotmail.com Tested-by: Christian Brauner Signed-off-by: Al Viro --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 216f16e74351..fc8760d4314e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->stack = p->internal; p->dfd = dfd; p->name = name; + p->path.mnt = NULL; + p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; @@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd) rcu_read_unlock(); } nd->depth = 0; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ @@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } nd->root.mnt = NULL; - nd->path.mnt = NULL; - nd->path.dentry = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { -- cgit v1.2.3 From 4f0ed93fb92d3528c73c80317509df3f800a222b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Apr 2021 19:46:51 -0400 Subject: LOOKUP_MOUNTPOINT: we are cleaning "jumped" flag too late That (and traversals in case of umount .) should be done before complete_walk(). Either a braino or mismerge damage on queue reorders - either way, I should've spotted that much earlier. Fucked-up-by: Al Viro X-Paperbag: Brown Fixes: 161aff1d93ab "LOOKUP_MOUNTPOINT: fold path_mountpointat() into path_lookupat()" Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Al Viro --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fc8760d4314e..48a2f288e802 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2421,16 +2421,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { - err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... - } if (!err) { *path = nd->path; nd->path.mnt = NULL; -- cgit v1.2.3 From 4e456b30f78c429b183db420e23b26cde7e03a78 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 31 Mar 2021 14:35:24 +0000 Subject: cifs: On cifs_reconnect, resolve the hostname again. On cifs_reconnect, make sure that DNS resolution happens again. It could be the cause of connection to go dead in the first place. This also contains the fix for a build issue identified by Intel bot. Reported-by: kernel test robot Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Pavel Shilovsky CC: # 5.11+ Signed-off-by: Steve French --- fs/cifs/Kconfig | 3 +-- fs/cifs/Makefile | 5 +++-- fs/cifs/connect.c | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index fe03cbdae959..bf52e9326ebe 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,6 +18,7 @@ config CIFS select CRYPTO_AES select CRYPTO_LIB_DES select KEYS + select DNS_RESOLVER help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" depends on CIFS - select DNS_RESOLVER help Enables an upcall mechanism for CIFS which accesses userspace helper utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets @@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS - select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 5213b20843b5..3ee3b7de4ded 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ + dns_resolve.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eec8a2052da2..24668eb006c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -87,7 +87,6 @@ static void cifs_prune_tlinks(struct work_struct *work); * * This should be called with server->srv_mutex held. */ -#ifdef CONFIG_CIFS_DFS_UPCALL static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) { int rc; @@ -124,6 +123,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return !rc ? -1 : 0; } +#ifdef CONFIG_CIFS_DFS_UPCALL /* These functions must be called with server->srv_mutex held */ static void reconn_set_next_dfs_target(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, @@ -321,14 +321,29 @@ cifs_reconnect(struct TCP_Server_Info *server) #endif #ifdef CONFIG_CIFS_DFS_UPCALL + if (cifs_sb && cifs_sb->origin_fullpath) /* * Set up next DFS target server (if any) for reconnect. If DFS * feature is disabled, then we will retry last server we * connected to before. */ reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); + else { +#endif + /* + * Resolve the hostname again to make sure that IP address is up-to-date. + */ + rc = reconn_set_ipaddr_from_hostname(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } + +#ifdef CONFIG_CIFS_DFS_UPCALL + } #endif + #ifdef CONFIG_CIFS_SWN_UPCALL } #endif -- cgit v1.2.3 From d135be0a7fb83f4dd68721b3355fec6de686834c Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 15:51:17 +0800 Subject: fs: cifs: Remove unnecessary struct declaration struct cifs_readdata is declared twice. One is declared at 208th line. And struct cifs_readdata is defined blew. The declaration here is not needed. Remove the duplicate. Signed-off-by: Wan Jiabing Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 67c056a9a519..ec824ab8c5ca 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1283,8 +1283,6 @@ struct cifs_aio_ctx { bool direct_io; }; -struct cifs_readdata; - /* asynchronous read support */ struct cifs_readdata { struct kref refcount; -- cgit v1.2.3 From 0fc9322ab5e1fe6910c9673e1a7ff29f7dd72611 Mon Sep 17 00:00:00 2001 From: Maciek Borzecki Date: Tue, 6 Apr 2021 17:02:29 +0200 Subject: cifs: escape spaces in share names Commit 653a5efb849a ("cifs: update super_operations to show_devname") introduced the display of devname for cifs mounts. However, when mounting a share which has a whitespace in the name, that exact share name is also displayed in mountinfo. Make sure that all whitespace is escaped. Signed-off-by: Maciek Borzecki CC: # 5.11+ Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 099ad9f3660b..5ddd20b62484 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -476,7 +476,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); else { convert_delimiter(devname, '/'); - seq_puts(m, devname); + /* escape all spaces in share names */ + seq_escape(m, devname, " \t"); kfree(devname); } return 0; -- cgit v1.2.3 From 6ad7f2332e84c46f0c94e73e05b5b7c2bc1a6b74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 01:54:39 +0100 Subject: io_uring: clear F_REISSUE right after getting it There are lots of ways r/w request may continue its path after getting REQ_F_REISSUE, it's not necessarily io-wq and can be, e.g. apoll, and submitted via io_async_task_func() -> __io_req_task_submit() Clear the flag right after getting it, so the next attempt is well prepared regardless how the request will be executed. Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11dcead939343f4e27cab0074d34afcab771bfa4.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 65a17d560a73..f1881ac0744b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3294,6 +3294,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3417,8 +3418,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; ret2 = -EAGAIN; + } /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just @@ -6173,7 +6176,6 @@ static void io_wq_submit_work(struct io_wq_work *work) ret = -ECANCELED; if (!ret) { - req->flags &= ~REQ_F_REISSUE; do { ret = io_issue_sqe(req, 0); /* -- cgit v1.2.3 From 9728463737db027557e8ba315cbbca6b81122c04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 19:28:03 +0100 Subject: io_uring: fix rw req completion WARNING: at fs/io_uring.c:8578 io_ring_exit_work.cold+0x0/0x18 As reissuing is now passed back by REQ_F_REISSUE and kiocb_done() internally uses __io_complete_rw(), it may stop after setting the flag so leaving a dangling request. There are tricky edge cases, e.g. reading beyound file, boundary, so the easiest way is to hand code reissue in kiocb_done() as __io_complete_rw() was doing for us before. Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f602250d292f8a84cca9a01d747744d1e797be26.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f1881ac0744b..bd14327c8e7e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2762,6 +2762,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; + bool check_reissue = kiocb->ki_complete == io_complete_rw; /* add previously done IO, if any */ if (io && io->bytes_done > 0) { @@ -2777,6 +2778,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); + + if (check_reissue && req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (!io_rw_reissue(req)) { + int cflags = 0; + + req_set_fail_links(req); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, issue_flags, ret, cflags); + } + } } static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) -- cgit v1.2.3 From c60eb049f4a19ddddcd3ee97a9c79ab8066a6a03 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 01:54:42 +0100 Subject: io-wq: cancel unbounded works on io-wq destroy WARNING: CPU: 5 PID: 227 at fs/io_uring.c:8578 io_ring_exit_work+0xe6/0x470 RIP: 0010:io_ring_exit_work+0xe6/0x470 Call Trace: process_one_work+0x206/0x400 worker_thread+0x4a/0x3d0 kthread+0x129/0x170 ret_from_fork+0x22/0x30 INFO: task lfs-openat:2359 blocked for more than 245 seconds. task:lfs-openat state:D stack: 0 pid: 2359 ppid: 1 flags:0x00000004 Call Trace: ... wait_for_completion+0x8b/0xf0 io_wq_destroy_manager+0x24/0x60 io_wq_put_and_exit+0x18/0x30 io_uring_clean_tctx+0x76/0xa0 __io_uring_files_cancel+0x1b9/0x2e0 do_exit+0xc0/0xb40 ... Even after io-wq destroy has been issued io-wq worker threads will continue executing all left work items as usual, and may hang waiting for I/O that won't ever complete (aka unbounded). [<0>] pipe_read+0x306/0x450 [<0>] io_iter_do_read+0x1e/0x40 [<0>] io_read+0xd5/0x330 [<0>] io_issue_sqe+0xd21/0x18a0 [<0>] io_wq_submit_work+0x6c/0x140 [<0>] io_worker_handle_work+0x17d/0x400 [<0>] io_wqe_worker+0x2c0/0x330 [<0>] ret_from_fork+0x22/0x30 Cancel all unbounded I/O instead of executing them. This changes the user visible behaviour, but that's inevitable as io-wq is not per task. Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cd4b543154154cba055cf86f351441c2174d7f71.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 433c4d3c3c1c..4eba531bea5a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -415,6 +415,7 @@ static void io_worker_handle_work(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { struct io_wq_work *work; @@ -444,6 +445,9 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); + + if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) + work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); io_assign_current_work(worker, NULL); -- cgit v1.2.3 From 90bd070aae6c4fb5d302f9c4b9c88be60c8197ec Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 9 Apr 2021 13:27:29 -0700 Subject: ocfs2: fix deadlock between setattr and dio_end_io_write The following deadlock is detected: truncate -> setattr path is waiting for pending direct IO to be done (inode->i_dio_count become zero) with inode->i_rwsem held (down_write). PID: 14827 TASK: ffff881686a9af80 CPU: 20 COMMAND: "ora_p005_hrltd9" #0 __schedule at ffffffff818667cc #1 schedule at ffffffff81866de6 #2 inode_dio_wait at ffffffff812a2d04 #3 ocfs2_setattr at ffffffffc05f322e [ocfs2] #4 notify_change at ffffffff812a5a09 #5 do_truncate at ffffffff812808f5 #6 do_sys_ftruncate.constprop.18 at ffffffff81280cf2 #7 sys_ftruncate at ffffffff81280d8e #8 do_syscall_64 at ffffffff81003949 #9 entry_SYSCALL_64_after_hwframe at ffffffff81a001ad dio completion path is going to complete one direct IO (decrement inode->i_dio_count), but before that it hung at locking inode->i_rwsem: #0 __schedule+700 at ffffffff818667cc #1 schedule+54 at ffffffff81866de6 #2 rwsem_down_write_failed+536 at ffffffff8186aa28 #3 call_rwsem_down_write_failed+23 at ffffffff8185a1b7 #4 down_write+45 at ffffffff81869c9d #5 ocfs2_dio_end_io_write+180 at ffffffffc05d5444 [ocfs2] #6 ocfs2_dio_end_io+85 at ffffffffc05d5a85 [ocfs2] #7 dio_complete+140 at ffffffff812c873c #8 dio_aio_complete_work+25 at ffffffff812c89f9 #9 process_one_work+361 at ffffffff810b1889 #10 worker_thread+77 at ffffffff810b233d #11 kthread+261 at ffffffff810b7fd5 #12 ret_from_fork+62 at ffffffff81a0035e Thus above forms ABBA deadlock. The same deadlock was mentioned in upstream commit 28f5a8a7c033 ("ocfs2: should wait dio before inode lock in ocfs2_setattr()"). It seems that that commit only removed the cluster lock (the victim of above dead lock) from the ABBA deadlock party. End-user visible effects: Process hang in truncate -> ocfs2_setattr path and other processes hang at ocfs2_dio_end_io_write path. This is to fix the deadlock itself. It removes inode_lock() call from dio completion path to remove the deadlock and add ip_alloc_sem lock in setattr path to synchronize the inode modifications. [wen.gang.wang@oracle.com: remove the "had_alloc_lock" as suggested] Link: https://lkml.kernel.org/r/20210402171344.1605-1-wen.gang.wang@oracle.com Link: https://lkml.kernel.org/r/20210331203654.3911-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 11 +---------- fs/ocfs2/file.c | 8 ++++++-- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3bfb4147895a..ad20403b383f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; loff_t end = offset + bytes; - int ret = 0, credits = 0, locked = 0; + int ret = 0, credits = 0; ocfs2_init_dealloc_ctxt(&dealloc); @@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode, !dwc->dw_orphaned) goto out; - /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we - * are in that context. */ - if (dwc->dw_writer_pid != task_pid_nr(current)) { - inode_lock(inode); - locked = 1; - } - ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); @@ -2393,8 +2386,6 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); - if (locked) - inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); return ret; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6611c64ca0be..5edc1d0cf115 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto bail_unlock; } } + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } } @@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, bail_commit: ocfs2_commit_trans(osb, handle); +bail_unlock_alloc: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail_unlock: if (status && inode_locked) { ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); -- cgit v1.2.3 From df41872b68601059dd4a84858952dcae58acd331 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Fri, 9 Apr 2021 13:27:35 -0700 Subject: fs: direct-io: fix missing sdio->boundary I encountered a hung task issue, but not a performance one. I run DIO on a device (need lba continuous, for example open channel ssd), maybe hungtask in below case: DIO: Checkpoint: get addr A(at boundary), merge into BIO, no submit because boundary missing flush dirty data(get addr A+1), wait IO(A+1) writeback timeout, because DIO(A) didn't submit get addr A+2 fail, because checkpoint is doing dio_send_cur_page() may clear sdio->boundary, so prevent it from missing a boundary. Link: https://lkml.kernel.org/r/20210322042253.38312-1-jack.qiu@huawei.com Fixes: b1058b981272 ("direct-io: submit bio after boundary buffer is added to it") Signed-off-by: Jack Qiu Reviewed-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index b61491bf3166..b2e86e739d7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, struct buffer_head *map_bh) { int ret = 0; + int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ if (dio->op == REQ_OP_WRITE) { /* @@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: /* - * If sdio->boundary then we want to schedule the IO now to + * If boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (sdio->boundary) { + if (boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); -- cgit v1.2.3 From 53b74fa990bf76f290aa5930abfcf37424a1a865 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 8 Apr 2021 17:25:28 +0900 Subject: btrfs: zoned: move superblock logging zone location Moves the location of the superblock logging zones. The new locations of the logging zones are now determined based on fixed block addresses instead of on fixed zone numbers. The old placement method based on fixed zone numbers causes problems when one needs to inspect a file system image without access to the drive zone information. In such case, the super block locations cannot be reliably determined as the zone size is unknown. By locating the superblock logging zones using fixed addresses, we can scan a dumped file system image without the zone information since a super block copy will always be present at or after the fixed known locations. Introduce the following three pairs of zones containing fixed offset locations, regardless of the device zone size. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If a logging zone is outside of the disk capacity, we do not record the superblock copy. The first copy position is much larger than for a non-zoned filesystem, which is at 64M. This is to avoid overlapping with the log zones for the primary superblock. This higher location is arbitrary but allows supporting devices with very large zone sizes, plus some space around in between. Such large zone size is unrealistic and very unlikely to ever be seen in real devices. Currently, SMR disks have a zone size of 256MB, and we are expecting ZNS drives to be in the 1-4GB range, so this limit gives us room to breathe. For now, we only allow zone sizes up to 8GB. The maximum zone size that would still fit in the space is 256G. The fixed location addresses are somewhat arbitrary, with the intent of maintaining superblock reliability for smaller and larger devices, with the preference for the latter. For this reason, there are two superblocks under the first 1T. This should cover use cases for physical devices and for emulated/device-mapper devices. The superblock logging zones are reserved for superblock logging and never used for data or metadata blocks. Note that we only reserve the two zones per primary/copy actually used for superblock logging. We do not reserve the ranges of zones possibly containing superblocks with the largest supported zone size (0-16GB, 512G-528GB, 4096G-4112G). The zones containing the fixed location offsets used to store superblocks on a non-zoned volume are also reserved to avoid confusion. Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 43948bd40e02..ba7a303300a3 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,30 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. + * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* + * Maximum supported zone size. Currently, SMR disks have a zone size of + * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not + * expect the zone size to become larger than 8GiB in the near future. + */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, } /* - * The following zones are reserved as the circular buffer on ZONED btrfs. - * - The primary superblock: zones 0 and 1 - * - The first copy: zones 16 and 17 - * - The second copy: zones 1024 or zone at 256GB which is minimum, and - * the following one + * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + u64 zone; + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { - case 0: return 0; - case 1: return 16; - case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - return 0; + ASSERT(zone <= U32_MAX); + + return (u32)zone; } /* @@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) zone_sectors = bdev_zone_sectors(bdev); } - nr_sectors = bdev_nr_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->max_zone_append_size = (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; -- cgit v1.2.3