From 501e7a4689378f8b1690089bfdd4f1e12ec22903 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 2 Jun 2017 11:21:34 -0400 Subject: NFSv4.2: Don't send mode again in post-EXCLUSIVE4_1 SETATTR with umask Now that we have umask support, we shouldn't re-send the mode in a SETATTR following an exclusive CREATE, or we risk having the same problem fixed in commit 5334c5bdac92 ("NFS: Send attributes in OPEN request for NFS4_CREATE_EXCLUSIVE4_1"), which is that files with S_ISGID will have that bit stripped away. Signed-off-by: Benjamin Coddington Fixes: dff25ddb4808 ("nfs: add support for the umask attribute") Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c08c46a3b8cd..4e43fd1270c4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2589,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, /* Except MODE, it seems harmless of setting twice. */ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - attrset[1] & FATTR4_WORD1_MODE) + (attrset[1] & FATTR4_WORD1_MODE || + attrset[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) -- cgit v1.2.3 From 8a7b0d8e8d9962ec3b2ae64dd4e86d68a6fb9220 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 May 2017 08:30:40 +0300 Subject: CIFS: Set ->should_dirty in cifs_user_readv() The current code causes a static checker warning because ITER_IOVEC is zero so the condition is never true. Fixes: 6685c5e2d1ac ("CIFS: Add asynchronous read support through kernel AIO") Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0fd081bd2a2f..fcef70602b27 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3271,7 +3271,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type & ITER_IOVEC) + if (to->type == ITER_IOVEC) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); -- cgit v1.2.3 From ecf3411a121e7a653e309ff50a820ffa87c537f8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 May 2017 19:24:15 +0100 Subject: CIFS: check if pages is null rather than bv for a failed allocation pages is being allocated however a null check on bv is being used to see if the allocation failed. Fix this by checking if pages is null. Detected by CoverityScan, CID#1432974 ("Logically dead code") Fixes: ccf7f4088af2dd ("CIFS: Add asynchronous context to support kernel AIO") Signed-off-by: Colin Ian King Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b08531977daa..3b147dc6af63 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) if (!pages) { pages = vmalloc(max_pages * sizeof(struct page *)); - if (!bv) { + if (!pages) { kvfree(bv); return -ENOMEM; } -- cgit v1.2.3 From dcd87838c06f05ab7650b249ebf0d5b57ae63e1e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 6 Jun 2017 16:58:58 -0700 Subject: CIFS: Improve readdir verbosity Downgrade the loglevel for SMB2 to prevent filling the log with messages if e.g. readdir was interrupted. Also make SMB2 and SMB1 codepaths do the same logging during readdir. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb1ops.c | 9 +++++++-- fs/cifs/smb2ops.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 27bc360c7ffd..a723df3e0197 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c58691834eb2..59726013375b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; -- cgit v1.2.3 From e125f5284f81bbb765a504494622b45c02faf978 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Jun 2017 00:33:45 +0100 Subject: cifs: remove redundant return in cifs_creation_time_get There is a redundant return in function cifs_creation_time_get that appears to be old vestigial code than can be removed. So remove it. Detected by CoverityScan, CID#1361924 ("Structurally dead code") Signed-off-by: Colin Ian King Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3cb5c9e2d4e7..de50e749ff05 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, pcreatetime = (__u64 *)value; *pcreatetime = CIFS_I(inode)->createtime; return sizeof(__u64); - - return rc; } -- cgit v1.2.3 From 517a6e43c4872c89794af5b377fa085e47345952 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Sun, 11 Jun 2017 09:12:47 +0200 Subject: CIFS: Fix some return values in case of error in 'crypt_message' 'rc' is known to be 0 at this point. So if 'init_sg' or 'kzalloc' fails, we should return -ENOMEM instead. Also remove a useless 'rc' in a debug message as it is meaningless here. Fixes: 026e93dc0a3ee ("CIFS: Encrypt SMB3 requests before sending") Signed-off-by: Christophe JAILLET Reviewed-by: Pavel Shilovsky Reviewed-by: Aurelien Aptel Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 59726013375b..7e48561abd29 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1809,7 +1809,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) sg = init_sg(rqst, sign); if (!sg) { - cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc); + cifs_dbg(VFS, "%s: Failed to init sg", __func__); + rc = -ENOMEM; goto free_req; } @@ -1817,6 +1818,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) iv = kzalloc(iv_len, GFP_KERNEL); if (!iv) { cifs_dbg(VFS, "%s: Failed to alloc IV", __func__); + rc = -ENOMEM; goto free_sg; } iv[0] = 3; -- cgit v1.2.3 From eb5e248d502bec191bd99f04cae8b49992b3abde Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 21 Jun 2017 20:27:35 -0700 Subject: xfs: don't allow bmap on rt files bmap returns a dumb LBA address but not the block device that goes with that LBA. Swapfiles don't care about this and will blindly assume that the data volume is the correct blockdev, which is totally bogus for files on the rt subvolume. This results in the swap code doing IOs to arbitrary locations on the data device(!) if the passed in mapping is a realtime file, so just turn off bmap for rt files. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_aops.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 09af0f7cd55e..3b91faacc1ba 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1316,9 +1316,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasseѕ the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; filemap_write_and_wait(mapping); -- cgit v1.2.3 From 9fa4eb8e490a28de40964b1b0e583d8db4c7e57c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Jun 2017 15:08:43 -0700 Subject: autofs: sanity check status reported with AUTOFS_DEV_IOCTL_FAIL If a positive status is passed with the AUTOFS_DEV_IOCTL_FAIL ioctl, autofs4_d_automount() will return ERR_PTR(status) with that status to follow_automount(), which will then dereference an invalid pointer. So treat a positive status the same as zero, and map to ENOENT. See comment in systemd src/core/automount.c::automount_send_ready(). Link: http://lkml.kernel.org/r/871sqwczx5.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/dev-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 734cbf8d9676..dd9f1bebb5a3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } -- cgit v1.2.3 From 1eb643d02b21412e603b42cdd96010a2ac31c05f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Jun 2017 15:08:46 -0700 Subject: fs/dax.c: fix inefficiency in dax_writeback_mapping_range() dax_writeback_mapping_range() fails to update iteration index when searching radix tree for entries needing cache flushing. Thus each pagevec worth of entries is searched starting from the start which is inefficient and prone to livelocks. Update index properly. Link: http://lkml.kernel.org/r/20170619124531.21491-1-jack@suse.cz Fixes: 9973c98ecfda3 ("dax: add support for fsync/sync") Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 2a6889b3585f..9187f3b07f3e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -859,6 +859,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (ret < 0) goto out; } + start_index = indices[pvec.nr - 1] + 1; } out: put_dax(dax_dev); -- cgit v1.2.3 From 8818efaaacb78c60a9d90c5705b6c99b75d7d442 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Fri, 23 Jun 2017 15:08:55 -0700 Subject: ocfs2: fix deadlock caused by recursive locking in xattr Another deadlock path caused by recursive locking is reported. This kind of issue was introduced since commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). Two deadlock paths have been fixed by commit b891fa5024a9 ("ocfs2: fix deadlock issue when taking inode lock at vfs entry points"). Yes, we intend to fix this kind of case in incremental way, because it's hard to find out all possible paths at once. This one can be reproduced like this. On node1, cp a large file from home directory to ocfs2 mountpoint. While on node2, run setfacl/getfacl. Both nodes will hang up there. The backtraces: On node1: __ocfs2_cluster_lock.isra.39+0x357/0x740 [ocfs2] ocfs2_inode_lock_full_nested+0x17d/0x840 [ocfs2] ocfs2_write_begin+0x43/0x1a0 [ocfs2] generic_perform_write+0xa9/0x180 __generic_file_write_iter+0x1aa/0x1d0 ocfs2_file_write_iter+0x4f4/0xb40 [ocfs2] __vfs_write+0xc3/0x130 vfs_write+0xb1/0x1a0 SyS_write+0x46/0xa0 On node2: __ocfs2_cluster_lock.isra.39+0x357/0x740 [ocfs2] ocfs2_inode_lock_full_nested+0x17d/0x840 [ocfs2] ocfs2_xattr_set+0x12e/0xe80 [ocfs2] ocfs2_set_acl+0x22d/0x260 [ocfs2] ocfs2_iop_set_acl+0x65/0xb0 [ocfs2] set_posix_acl+0x75/0xb0 posix_acl_xattr_set+0x49/0xa0 __vfs_setxattr+0x69/0x80 __vfs_setxattr_noperm+0x72/0x1a0 vfs_setxattr+0xa7/0xb0 setxattr+0x12d/0x190 path_setxattr+0x9f/0xb0 SyS_setxattr+0x14/0x20 Fix this one by using ocfs2_inode_{lock|unlock}_tracker, which is exported by commit 439a36b8ef38 ("ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock"). Link: http://lkml.kernel.org/r/20170622014746.5815-1-zren@suse.com Fixes: 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") Signed-off-by: Eric Ren Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 4 ++++ fs/ocfs2/xattr.c | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3b7c937a36b5..4689940a953c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; + /* had_lock means that the currect process already takes the cluster + * lock previously. If had_lock is 1, we have nothing to do here, and + * it will get unlocked where we got the lock. + */ if (!had_lock) { ocfs2_remove_holder(lockres, oh); ocfs2_inode_unlock(inode, ex); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3c5384d9b3a5..f70c3778d600 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode, void *buffer, size_t buffer_size) { - int ret; + int ret, had_lock; struct buffer_head *di_bh = NULL; + struct ocfs2_lock_holder oh; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) { + mlog_errno(had_lock); + return had_lock; } down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); up_read(&OCFS2_I(inode)->ip_xattr_sem); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits, ref_meta = 0, ref_credits = 0; + int ret, credits, had_lock, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_lock_holder oh; struct ocfs2_xattr_info xi = { .xi_name_index = name_index, @@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct inode *inode, return -ENOMEM; } - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh); + if (had_lock < 0) { + ret = had_lock; mlog_errno(ret); goto cleanup_nolock; } @@ -3670,7 +3673,7 @@ cleanup: if (ret) mlog_errno(ret); } - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); -- cgit v1.2.3 From 98da7d08850fb8bdeb395d6368ed15753304aa0c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Jun 2017 15:08:57 -0700 Subject: fs/exec.c: account for argv/envp pointers When limiting the argv/envp strings during exec to 1/4 of the stack limit, the storage of the pointers to the strings was not included. This means that an exec with huge numbers of tiny strings could eat 1/4 of the stack limit in strings and then additional space would be later used by the pointers to the strings. For example, on 32-bit with a 8MB stack rlimit, an exec with 1677721 single-byte strings would consume less than 2MB of stack, the max (8MB / 4) amount allowed, but the pointers to the strings would consume the remaining additional stack space (1677721 * 4 == 6710884). The result (1677721 + 6710884 == 8388605) would exhaust stack space entirely. Controlling this stack exhaustion could result in pathological behavior in setuid binaries (CVE-2017-1000365). [akpm@linux-foundation.org: additional commenting from Kees] Fixes: b6a2fea39318 ("mm: variable length argument support") Link: http://lkml.kernel.org/r/20170622001720.GA32173@beast Signed-off-by: Kees Cook Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Alexander Viro Cc: Qualys Security Advisory Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 72934df68471..904199086490 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,8 +220,26 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + unsigned long ptr_size; struct rlimit *rlim; + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; + acct_arg_size(bprm, size / PAGE_SIZE); /* @@ -239,13 +257,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) -- cgit v1.2.3 From 898fc11bb2bd4fbcefb685872d9fffaba2c8edaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Jun 2017 10:16:56 -0400 Subject: NFS: Trunking detection should handle ERESTARTSYS/EINTR Currently, it will return EIO in those cases. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b34de036501b..cbf82b0d4467 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2134,6 +2134,8 @@ again: put_rpccred(cred); switch (status) { case 0: + case -EINTR: + case -ERESTARTSYS: break; case -ETIMEDOUT: if (clnt->cl_softrtry) -- cgit v1.2.3 From bd171930e6a3de4f5cffdafbb944e50093dfb59b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Jun 2017 17:33:38 -0400 Subject: NFSv4.1: Fix a race in nfs4_proc_layoutget If the task calling layoutget is signalled, then it is possible for the calls to nfs4_sequence_free_slot() and nfs4_layoutget_prepare() to race, in which case we leak a slot. The fix is to move the call to nfs4_sequence_free_slot() into the nfs4_layoutget_release() so that it gets called at task teardown time. Fixes: 2e80dbe7ac51 ("NFSv4.1: Close callback races for OPEN, LAYOUTGET...") Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e43fd1270c4..dbfa18900e25 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8417,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8491,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) -- cgit v1.2.3 From d9f2950006f110f54444a10442752372ee568289 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Jun 2017 11:12:59 -0400 Subject: Revert "NFS: nfs_rename() handle -ERESTARTSYS dentry left behind" This reverts commit 920b4530fb80430ff30ef83efe21ba1fa5623731 which could call d_move() without holding the directory's i_mutex, and reverts commit d4ea7e3c5c0e341c15b073016dbf3ab6c65f12f3 "NFS: Fix old dentry rehash after move", which was a follow-up fix. Signed-off-by: Benjamin Coddington Fixes: 920b4530fb80 ("NFS: nfs_rename() handle -ERESTARTSYS dentry left behind") Cc: stable@vger.kernel.org # v4.10+ Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32ccd7754f8a..2ac00bf4ecf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1946,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); -static void -nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) -{ - struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - - nfs_mark_for_revalidate(old_inode); - - switch (task->tk_status) { - case 0: - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(data->new_dir)); - break; - case -ENOENT: - nfs_dentry_handle_enoent(old_dentry); - } -} - /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -1999,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2022,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) + if (!d_unhashed(new_dentry)) { d_drop(new_dentry); + rehash = new_dentry; + } if (d_count(new_dentry) > 2) { int err; @@ -2040,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } @@ -2048,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, - nfs_complete_rename); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2059,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); out: + if (rehash) + d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + /* + * The d_move() should be here instead of in an async RPC completion + * handler because we need the proper locks to move the dentry. If + * we're interrupted by a signal, the async RPC completion handler + * should mark the directories for revalidation. + */ + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + /* new dentry created? */ if (dentry) dput(dentry); -- cgit v1.2.3 From 2e31b4cb895ae78db31dffb860cd255d86c6561c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Jun 2017 17:40:50 -0400 Subject: NFSv4.1: nfs4_callback_free_slot() cannot call nfs4_slot_tbl_drain_complete() The current code works only for the case where we have exactly one slot, which is no longer true. nfs4_free_slot() will automatically declare the callback channel to be drained when all slots have been returned. Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c14758e08d73..390ac9c39c59 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -753,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session, * A single slot, so highest used slotid is either 0 or -1 */ nfs4_free_slot(tbl, slot); - nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } -- cgit v1.2.3 From 9ae3b3f52c62ddd5eb12c57f195f4f38121faa01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Jun 2017 15:30:13 -0600 Subject: block: provide bio_uninit() free freeing integrity/task associations Wen reports significant memory leaks with DIF and O_DIRECT: "With nvme devive + T10 enabled, On a system it has 256GB and started logging /proc/meminfo & /proc/slabinfo for every minute and in an hour it increased by 15968128 kB or ~15+GB.. Approximately 256 MB / minute leaking. /proc/meminfo | grep SUnreclaim... SUnreclaim: 6752128 kB SUnreclaim: 6874880 kB SUnreclaim: 7238080 kB .... SUnreclaim: 22307264 kB SUnreclaim: 22485888 kB SUnreclaim: 22720256 kB When testcases with T10 enabled call into __blkdev_direct_IO_simple, code doesn't free memory allocated by bio_integrity_alloc. The patch fixes the issue. HTX has been run with +60 hours without failure." Since __blkdev_direct_IO_simple() allocates the bio on the stack, it doesn't go through the regular bio free. This means that any ancillary data allocated with the bio through the stack is not freed. Hence, we can leak the integrity data associated with the bio, if the device is using DIF/DIX. Fix this by providing a bio_uninit() and export it, so that we can use it to free this data. Note that this is a minimal fix for this issue. Any current user of bio's that are allocated outside of bio_alloc_bioset() suffers from this issue, most notably some drivers. We will fix those in a more comprehensive patch for 4.13. This also means that the commit marked as being fixed by this isn't the real culprit, it's just the most obvious one out there. Fixes: 542ff7bf18c6 ("block: new direct I/O implementation") Reported-by: Wen Xiong Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++++--- fs/block_dev.c | 5 ++++- include/linux/bio.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/block/bio.c b/block/bio.c index 888e7801c638..26b0810fb8ea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -240,20 +240,21 @@ fallback: return bvl; } -static void __bio_free(struct bio *bio) +void bio_uninit(struct bio *bio) { bio_disassociate_task(bio); if (bio_integrity(bio)) bio_integrity_free(bio); } +EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p; - __bio_free(bio); + bio_uninit(bio); if (bs) { bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); @@ -271,6 +272,11 @@ static void bio_free(struct bio *bio) } } +/* + * Users of this function have their own bio allocation. Subsequently, + * they must remember to pair any call to bio_init() with bio_uninit() + * when IO has completed, or when the bio is released. + */ void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs) { @@ -297,7 +303,7 @@ void bio_reset(struct bio *bio) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - __bio_free(bio); + bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags; diff --git a/fs/block_dev.c b/fs/block_dev.c index 519599dddd36..0a7404ef9335 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -263,7 +263,10 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, kfree(vecs); if (unlikely(bio.bi_error)) - return bio.bi_error; + ret = bio.bi_error; + + bio_uninit(&bio); + return ret; } diff --git a/include/linux/bio.h b/include/linux/bio.h index d1b04b0e99cf..a7e29fa0981f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -426,6 +426,7 @@ extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs); +extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); -- cgit v1.2.3