From 125892edfe69915a227d8d125ff0e1cd713178f4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 1 Jan 2019 18:54:26 +0900 Subject: inotify: Fix fd refcount leak in inotify_add_watch(). Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") forgot to call fdput() before bailing out. Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") CC: stable@vger.kernel.org Signed-off-by: Tetsuo Handa Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/inotify/inotify_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 105576daca4a..798f1253141a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -724,8 +724,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) - return -EINVAL; + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { + ret = -EINVAL; + goto fput_and_out; + } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { -- cgit v1.2.3 From c2b8bd49d35a768d3966c5e14e8f6971f2a63439 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Howells --- fs/afs/server_list.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 95d0761cdb34..155dc14caef9 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -42,9 +42,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, if (vldb->fs_mask[i] & type_mask) nr_servers++; - slist = kzalloc(sizeof(struct afs_server_list) + - sizeof(struct afs_server_entry) * nr_servers, - GFP_KERNEL); + slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; -- cgit v1.2.3 From 5edc22cc1d33d6a88d175d25adc38d2a5cee134d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Set correct lock type for the yfs CreateFile A lock type of 0 is "LockRead", which makes the fileserver record an unintentional read lock on the new file. This will cause problems later on if the file is the subject of locking operations. The correct default value should be -1 ("LockNone"). Fix the operation marshalling code to set the value and provide an enum to symbolise the values whilst we're at it. Fixes: 30062bd13e36 ("afs: Implement YFS support in the fs client") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/protocol_yfs.h | 11 +++++++++++ fs/afs/yfsclient.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index 07bc10f076aa..d443e2bfa094 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -161,3 +161,14 @@ struct yfs_xdr_YFSStoreVolumeStatus { struct yfs_xdr_u64 max_quota; struct yfs_xdr_u64 file_quota; } __packed; + +enum yfs_lock_type { + yfs_LockNone = -1, + yfs_LockRead = 0, + yfs_LockWrite = 1, + yfs_LockExtend = 2, + yfs_LockRelease = 3, + yfs_LockMandatoryRead = 0x100, + yfs_LockMandatoryWrite = 0x101, + yfs_LockMandatoryExtend = 0x102, +}; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 12658c1363ae..5aa57929e8c2 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -803,7 +803,7 @@ int yfs_fs_create_file(struct afs_fs_cursor *fc, bp = xdr_encode_YFSFid(bp, &vnode->fid); bp = xdr_encode_string(bp, name, namesz); bp = xdr_encode_YFSStoreStatus_mode(bp, mode); - bp = xdr_encode_u32(bp, 0); /* ViceLockType */ + bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); afs_use_fs_server(call, fc->cbi); -- cgit v1.2.3 From 04906b2f542c23626b0ef6219b808406f8dddbe9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Jan 2019 09:48:10 +0100 Subject: blockdev: Fix livelocks on loop device bd_set_size() updates also block device's block size. This is somewhat unexpected from its name and at this point, only blkdev_open() uses this functionality. Furthermore, this can result in changing block size under a filesystem mounted on a loop device which leads to livelocks inside __getblk_gfp() like: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 10863 Comm: syz-executor0 Not tainted 4.18.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__sanitizer_cov_trace_pc+0x3f/0x50 kernel/kcov.c:106 ... Call Trace: init_page_buffers+0x3e2/0x530 fs/buffer.c:904 grow_dev_page fs/buffer.c:947 [inline] grow_buffers fs/buffer.c:1009 [inline] __getblk_slow fs/buffer.c:1036 [inline] __getblk_gfp+0x906/0xb10 fs/buffer.c:1313 __bread_gfp+0x2d/0x310 fs/buffer.c:1347 sb_bread include/linux/buffer_head.h:307 [inline] fat12_ent_bread+0x14e/0x3d0 fs/fat/fatent.c:75 fat_ent_read_block fs/fat/fatent.c:441 [inline] fat_alloc_clusters+0x8ce/0x16e0 fs/fat/fatent.c:489 fat_add_cluster+0x7a/0x150 fs/fat/inode.c:101 __fat_get_block fs/fat/inode.c:148 [inline] ... Trivial reproducer for the problem looks like: truncate -s 1G /tmp/image losetup /dev/loop0 /tmp/image mkfs.ext4 -b 1024 /dev/loop0 mount -t ext4 /dev/loop0 /mnt losetup -c /dev/loop0 l /mnt Fix the problem by moving initialization of a block device block size into a separate function and call it when needed. Thanks to Tetsuo Handa for help with debugging the problem. Reported-by: syzbot+9933e4476f365f5d5a1b@syzkaller.appspotmail.com Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index c546cdce77e6..58a4c1217fa8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ @@ -1431,18 +1445,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_logical_block_size(bdev); - inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); } EXPORT_SYMBOL(bd_set_size); @@ -1519,8 +1524,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + set_init_blocksize(bdev); + } /* * If the device is invalidated, rescan partition @@ -1555,6 +1562,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) -- cgit v1.2.3 From 45ac486ecf2dc998e25cf32f0cabf2deaad875be Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 11 Jan 2019 19:04:44 -0500 Subject: NFSv4.2 fix unnecessary retry in nfs4_copy_file_range Currently nfs42_proc_copy_file_range() can not return EAGAIN. Fixes: e4648aa4f98a ("NFS recover from destination server reboot for copies") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 46d691ba04bc..45b2322e092d 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - ssize_t ret; - if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; -retry: - ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); - if (ret == -EAGAIN) - goto retry; - return ret; + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) -- cgit v1.2.3 From 97e1532ef81acb31c30f9e75bf00306c33a77812 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: handle zero sized retrieve correctly Dereferencing req->page_descs[0] will Oops if req->max_pages is zero. Reported-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Tested-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Fixes: b2430d7567a3 ("fuse: add per-page descriptor to fuse_req") Cc: # v3.9 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5e516a40e7a..fc29264011a6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1742,7 +1742,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; @@ -1757,6 +1756,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; -- cgit v1.2.3 From 8a3177db59cd644fde05ba9efee29392dfdec8aa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: cuse: fix ioctl cuse_process_init_reply() doesn't initialize fc->max_pages and thus all cuse bases ioctls fail with ENOMEM. Reported-by: Andreas Steinmetz Fixes: 5da784cce430 ("fuse: add max_pages to init_out") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 76baaa6be393..c2d4099429be 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -628,6 +628,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -1162,7 +1163,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); - fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.3 From 9509941e9c534920ccc4771ae70bd6cbbe79df1c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 12 Jan 2019 02:39:05 +0100 Subject: fuse: call pipe_buf_release() under pipe lock Some of the pipe_buf_release() handlers seem to assume that the pipe is locked - in particular, anon_pipe_buf_release() accesses pipe->tmp_page without taking any extra locks. From a glance through the callers of pipe_buf_release(), it looks like FUSE is the only one that calls pipe_buf_release() without having the pipe locked. This bug should only lead to a memory leak, nothing terrible. Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fc29264011a6..809c0f2f9942 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2077,8 +2077,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); + pipe_unlock(pipe); out: kvfree(bufs); -- cgit v1.2.3 From a2ebba824106dabe79937a9f29a875f837e1b6d4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: decrement NR_WRITEBACK_TEMP on the right page NR_WRITEBACK_TEMP is accounted on the temporary page in the request, not the page cache page. Fixes: 8b284dc47291 ("fuse: writepages: handle same page rewrites") Cc: # v3.13 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffaffe18352a..a59c16bd90ac 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1782,7 +1782,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); -- cgit v1.2.3 From 4882a27cec24319d10f95e978ecc80050e3e3e15 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Don't set vnode->cb_s_break in afs_validate() A cb_interest record is not necessarily attached to the vnode on entry to afs_validate(), which can cause an oops when we try to bring the vnode's cb_s_break up to date in the default case (ie. no current callback promise and the vnode has not been deleted). Fix this by simply removing the line, as vnode->cb_s_break will be set when needed by afs_register_server_cb_interest() when we next get a callback promise from RPC call. The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 ... RIP: 0010:afs_validate+0x66/0x250 [kafs] ... Call Trace: afs_d_revalidate+0x8d/0x340 [kafs] ? __d_lookup+0x61/0x150 lookup_dcache+0x44/0x70 ? lookup_dcache+0x44/0x70 __lookup_hash+0x24/0xa0 do_unlinkat+0x11d/0x2c0 __x64_sys_unlink+0x23/0x30 do_syscall_64+0x4d/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: ae3b7361dc0e ("afs: Fix validation/callback interaction") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b17d3620414..211343831c30 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -414,7 +414,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; } else { - vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } -- cgit v1.2.3 From 59d49076ae3e6912e6d7df2fd68e2337f3d02036 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Fix key refcounting in file locking code Fix the refcounting of the authentication keys in the file locking code. The vnode->lock_key member points to a key on which it expects to be holding a ref, but it isn't always given an extra ref, however. Fixes: 0fafdc9f888b ("afs: Fix file locking") Signed-off-by: David Howells --- fs/afs/flock.c | 4 ++-- fs/afs/inode.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 0568fd986821..e432bd27a2e7 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -208,7 +208,7 @@ again: /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; goto again; @@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 211343831c30..1a4ce07fb406 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -545,6 +545,8 @@ void afs_evict_inode(struct inode *inode) #endif afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->lock_key); + vnode->lock_key = NULL; _leave(""); } -- cgit v1.2.3 From 7a75b0079a1d54e342c502c3c8107ba97e05d3d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Jan 2019 15:14:29 +0000 Subject: afs: Provide a function to get a ref on a call Provide a function to get a reference on an afs_call struct. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a7b44863d502..4830e0a6bf1d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -203,20 +203,26 @@ void afs_put_call(struct afs_call *call) } } +static struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int u = atomic_inc_return(&call->usage); + + trace_afs_call(call, why, u, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - int u = atomic_inc_return(&call->usage); - - trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); - INIT_WORK(&call->work, call->type->work); + afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); } -- cgit v1.2.3 From 34fa47612bfe5d7de7fcaf658a6952b6aeec3b13 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Jan 2019 15:40:50 +0000 Subject: afs: Fix race in async call refcounting There's a race between afs_make_call() and afs_wake_up_async_call() in the case that an error is returned from rxrpc_kernel_send_data() after it has queued the final packet. afs_make_call() will try and clean up the mess, but the call state may have been moved on thereby causing afs_process_async_call() to also try and to delete the call. Fix this by: (1) Getting an extra ref for an asynchronous call for the call itself to hold. This makes sure the call doesn't evaporate on us accidentally and will allow the call to be retained by the caller in a future patch. The ref is released on leaving afs_make_call() or afs_wait_for_call_to_complete(). (2) In the event of an error from rxrpc_kernel_send_data(): (a) Don't set the call state to AFS_CALL_COMPLETE until *after* the call has been aborted and ended. This prevents afs_deliver_to_call() from doing anything with any notifications it gets. (b) Explicitly end the call immediately to prevent further callbacks. (c) Cancel any queued async_work and wait for the work if it's executing. This allows us to be sure the race won't recur when we change the state. We put the work queue's ref on the call if we managed to cancel it. (d) Put the call's ref that we got in (1). This belongs to us as long as the call is in state AFS_CALL_CL_REQUESTING. Fixes: 341f741f04be ("afs: Refcount the afs_call struct") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 35 ++++++++++++++++++++++++++++++----- include/trace/events/afs.h | 2 ++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 4830e0a6bf1d..2c588f9bbbda 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -23,6 +23,7 @@ struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_delete_async_call(struct work_struct *); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); @@ -404,6 +405,12 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, } } + /* If the call is going to be asynchronous, we need an extra ref for + * the call to hold itself so the caller need not hang on to its ref. + */ + if (call->async) + afs_get_call(call, afs_call_trace_get); + /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, @@ -444,15 +451,17 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, goto error_do_abort; } - /* at this point, an async call may no longer exist as it may have - * already completed */ - if (call->async) + /* Note that at this point, we may have received the reply or an abort + * - and an asynchronous call may already have completed. + */ + if (call->async) { + afs_put_call(call); return -EINPROGRESS; + } return afs_wait_for_call_to_complete(call, ac); error_do_abort: - call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); @@ -469,8 +478,24 @@ error_do_abort: error_kill_call: if (call->type->done) call->type->done(call); - afs_put_call(call); + + /* We need to dispose of the extra ref we grabbed for an async call. + * The call, however, might be queued on afs_async_calls and we need to + * make sure we don't get any more notifications that might requeue it. + */ + if (call->rxcall) { + rxrpc_kernel_end_call(call->net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->async) { + if (cancel_work_sync(&call->async_work)) + afs_put_call(call); + afs_put_call(call); + } + ac->error = ret; + call->state = AFS_CALL_COMPLETE; + afs_put_call(call); _leave(" = %d", ret); return ret; } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 33d291888ba9..e3f005eae1f7 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -25,6 +25,7 @@ enum afs_call_trace { afs_call_trace_alloc, afs_call_trace_free, + afs_call_trace_get, afs_call_trace_put, afs_call_trace_wake, afs_call_trace_work, @@ -159,6 +160,7 @@ enum afs_file_error { #define afs_call_traces \ EM(afs_call_trace_alloc, "ALLOC") \ EM(afs_call_trace_free, "FREE ") \ + EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_put, "PUT ") \ EM(afs_call_trace_wake, "WAKE ") \ E_(afs_call_trace_work, "WORK ") -- cgit v1.2.3 From 6a4c9ab13feeacd3072175d7d1f1fcfabbb9fc90 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Thu, 17 Jan 2019 09:09:29 -0800 Subject: pstore/ram: Fix console ramoops to show the previous boot logs commit b05c950698fe ("pstore/ram: Simplify ramoops_get_next_prz() arguments") changed update assignment in getting next persistent ram zone by adding a check for record type. But the check always returns true since the record type is assigned 0. And this breaks console ramoops by showing current console log instead of previous log on warm reset and hard reset (actually hard reset should not be showing any logs). Fix this by having persistent ram zone type check instead of record type check. Tested this on SDM845 MTP and dragonboard 410c. Reproducing this issue is simple as below: 1. Trigger hard reset and mount pstore. Will see console-ramoops record in the mounted location which is the current log. 2. Trigger warm reset and mount pstore. Will see the current console-ramoops record instead of previous record. Fixes: b05c950698fe ("pstore/ram: Simplify ramoops_get_next_prz() arguments") Signed-off-by: Sai Prakash Ranjan Acked-by: Joel Fernandes (Google) [kees: dropped local variable usage] Signed-off-by: Kees Cook --- fs/pstore/ram.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 96f7d32cd184..076e26fdc0c0 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -128,7 +128,6 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, struct pstore_record *record) { struct persistent_ram_zone *prz; - bool update = (record->type == PSTORE_TYPE_DMESG); /* Give up if we never existed or have hit the end. */ if (!przs) @@ -139,7 +138,7 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, return NULL; /* Update old/shadowed buffer. */ - if (update) + if (prz->type == PSTORE_TYPE_DMESG) persistent_ram_save_old(prz); if (!persistent_ram_old_size(prz)) -- cgit v1.2.3 From 77b7aad195099e7c6da11e94b7fa6ef5e6fb0025 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Jan 2019 15:02:23 +0100 Subject: Revert "btrfs: balance dirty metadata pages in btrfs_finish_ordered_io" This reverts commit e73e81b6d0114d4a303205a952ab2e87c44bd279. This patch causes a few problems: - adds latency to btrfs_finish_ordered_io - as btrfs_finish_ordered_io is used for free space cache, generating more work from btrfs_btree_balance_dirty_nodelay could end up in the same workque, effectively deadlocking 12260 kworker/u96:16+btrfs-freespace-write D [<0>] balance_dirty_pages+0x6e6/0x7ad [<0>] balance_dirty_pages_ratelimited+0x6bb/0xa90 [<0>] btrfs_finish_ordered_io+0x3da/0x770 [<0>] normal_work_helper+0x1c5/0x5a0 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x46/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Transaction commit will wait on the freespace cache: 838 btrfs-transacti D [<0>] btrfs_start_ordered_extent+0x154/0x1e0 [<0>] btrfs_wait_ordered_range+0xbd/0x110 [<0>] __btrfs_wait_cache_io+0x49/0x1a0 [<0>] btrfs_write_dirty_block_groups+0x10b/0x3b0 [<0>] commit_cowonly_roots+0x215/0x2b0 [<0>] btrfs_commit_transaction+0x37e/0x910 [<0>] transaction_kthread+0x14d/0x180 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff And then writepages ends up waiting on transaction commit: 9520 kworker/u96:13+flush-btrfs-1 D [<0>] wait_current_trans+0xac/0xe0 [<0>] start_transaction+0x21b/0x4b0 [<0>] cow_file_range_inline+0x10b/0x6b0 [<0>] cow_file_range.isra.69+0x329/0x4a0 [<0>] run_delalloc_range+0x105/0x3c0 [<0>] writepage_delalloc+0x119/0x180 [<0>] __extent_writepage+0x10c/0x390 [<0>] extent_write_cache_pages+0x26f/0x3d0 [<0>] extent_writepages+0x4f/0x80 [<0>] do_writepages+0x17/0x60 [<0>] __writeback_single_inode+0x59/0x690 [<0>] writeback_sb_inodes+0x291/0x4e0 [<0>] __writeback_inodes_wb+0x87/0xb0 [<0>] wb_writeback+0x3bb/0x500 [<0>] wb_workfn+0x40d/0x610 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x1e0/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Eventually, we have every process in the system waiting on balance_dirty_pages(), and nobody is able to make progress on page writeback. The original patch tried to fix an OOM condition, that happened on 4.4 but no success reproducing that on later kernels (4.19 and 4.20). This is more likely a problem in OOM itself. Link: https://lore.kernel.org/linux-btrfs/20180528054821.9092-1-ethanlien@synology.com/ Reported-by: Chris Mason CC: stable@vger.kernel.org # 4.18+ CC: ethanlien Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43eb4535319d..b6025b5d0b25 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,9 +3129,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } -- cgit v1.2.3 From 31890da0bfdd24b135a258404b93c58a65510c7a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Nov 2018 14:05:41 -0500 Subject: btrfs: handle delayed ref head accounting cleanup in abort We weren't doing any of the accounting cleanup when we aborted transactions. Fix this by making cleanup_ref_head_accounting global and calling it from the abort code, this fixes the issue where our accounting was all wrong after the fs aborts. The test generic/475 on a 2G VM can trigger the problems eg.: [ 8502.136957] WARNING: CPU: 0 PID: 11064 at fs/btrfs/extent-tree.c:5986 btrfs_free_block_grou +ps+0x3dc/0x410 [btrfs] [ 8502.148372] CPU: 0 PID: 11064 Comm: umount Not tainted 5.0.0-rc1-default+ #394 [ 8502.150807] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626 +cc-prebuilt.qemu-project.org 04/01/2014 [ 8502.154317] RIP: 0010:btrfs_free_block_groups+0x3dc/0x410 [btrfs] [ 8502.160623] RSP: 0018:ffffb1ab84b93de8 EFLAGS: 00010206 [ 8502.161906] RAX: 0000000001000000 RBX: ffff9f34b1756400 RCX: 0000000000000000 [ 8502.163448] RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff9f34b1755400 [ 8502.164906] RBP: ffff9f34b7e8c000 R08: 0000000000000001 R09: 0000000000000000 [ 8502.166716] R10: 0000000000000000 R11: 0000000000000001 R12: ffff9f34b7e8c108 [ 8502.168498] R13: ffff9f34b7e8c158 R14: 0000000000000000 R15: dead000000000100 [ 8502.170296] FS: 00007fb1cf15ffc0(0000) GS:ffff9f34bd400000(0000) knlGS:0000000000000000 [ 8502.172439] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8502.173669] CR2: 00007fb1ced507b0 CR3: 000000002f7a6000 CR4: 00000000000006f0 [ 8502.175094] Call Trace: [ 8502.175759] close_ctree+0x17f/0x350 [btrfs] [ 8502.176721] generic_shutdown_super+0x64/0x100 [ 8502.177702] kill_anon_super+0x14/0x30 [ 8502.178607] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8502.179602] deactivate_locked_super+0x29/0x60 [ 8502.180595] cleanup_mnt+0x3b/0x70 [ 8502.181406] task_work_run+0x98/0xc0 [ 8502.182255] exit_to_usermode_loop+0x83/0x90 [ 8502.183113] do_syscall_64+0x15b/0x180 [ 8502.183919] entry_SYSCALL_64_after_hwframe+0x49/0xbe Corresponding to release_global_block_rsv() { ... WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); CC: stable@vger.kernel.org Signed-off-by: Josef Bacik [ add log dump ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 12 +++++------- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f031a447a047..34019c8b6158 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -2664,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8da2f380d3c0..469a51b4b273 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4265,6 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b15afeae16df..6f6fae410fc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { @@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -7188,7 +7186,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; -- cgit v1.2.3 From 74d5d229b1bf60f93bff244b2dfc0eb21ec32a07 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Nov 2018 14:05:45 -0500 Subject: btrfs: wait on ordered extents on abort cleanup If we flip read-only before we initiate writeback on all dirty pages for ordered extents we've created then we'll have ordered extents left over on umount, which results in all sorts of bad things happening. Fix this by making sure we wait on ordered extents if we have to do the aborted transaction cleanup stuff. generic/475 can produce this warning: [ 8531.177332] WARNING: CPU: 2 PID: 11997 at fs/btrfs/disk-io.c:3856 btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.183282] CPU: 2 PID: 11997 Comm: umount Tainted: G W 5.0.0-rc1-default+ #394 [ 8531.185164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8531.187851] RIP: 0010:btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.193082] RSP: 0018:ffffb1ab86163d98 EFLAGS: 00010286 [ 8531.194198] RAX: ffff9f3449494d18 RBX: ffff9f34a2695000 RCX:0000000000000000 [ 8531.195629] RDX: 0000000000000002 RSI: 0000000000000001 RDI:0000000000000000 [ 8531.197315] RBP: ffff9f344e930000 R08: 0000000000000001 R09:0000000000000000 [ 8531.199095] R10: 0000000000000000 R11: ffff9f34494d4ff8 R12:ffffb1ab86163dc0 [ 8531.200870] R13: ffff9f344e9300b0 R14: ffffb1ab86163db8 R15:0000000000000000 [ 8531.202707] FS: 00007fc68e949fc0(0000) GS:ffff9f34bd800000(0000)knlGS:0000000000000000 [ 8531.204851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8531.205942] CR2: 00007ffde8114dd8 CR3: 000000002dfbd000 CR4:00000000000006e0 [ 8531.207516] Call Trace: [ 8531.208175] btrfs_free_fs_roots+0xdb/0x170 [btrfs] [ 8531.210209] ? wait_for_completion+0x5b/0x190 [ 8531.211303] close_ctree+0x157/0x350 [btrfs] [ 8531.212412] generic_shutdown_super+0x64/0x100 [ 8531.213485] kill_anon_super+0x14/0x30 [ 8531.214430] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8531.215539] deactivate_locked_super+0x29/0x60 [ 8531.216633] cleanup_mnt+0x3b/0x70 [ 8531.217497] task_work_run+0x98/0xc0 [ 8531.218397] exit_to_usermode_loop+0x83/0x90 [ 8531.219324] do_syscall_64+0x15b/0x180 [ 8531.220192] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 8531.221286] RIP: 0033:0x7fc68e5e4d07 [ 8531.225621] RSP: 002b:00007ffde8116608 EFLAGS: 00000246 ORIG_RAX:00000000000000a6 [ 8531.227512] RAX: 0000000000000000 RBX: 00005580c2175970 RCX:00007fc68e5e4d07 [ 8531.229098] RDX: 0000000000000001 RSI: 0000000000000000 RDI:00005580c2175b80 [ 8531.230730] RBP: 0000000000000000 R08: 00005580c2175ba0 R09:00007ffde8114e80 [ 8531.232269] R10: 0000000000000000 R11: 0000000000000246 R12:00005580c2175b80 [ 8531.233839] R13: 00007fc68eac61c4 R14: 00005580c2175a68 R15:0000000000000000 Leaving a tree in the rb-tree: 3853 void btrfs_free_fs_root(struct btrfs_root *root) 3854 { 3855 iput(root->ino_cache_inode); 3856 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); CC: stable@vger.kernel.org Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add stacktrace ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 469a51b4b273..18eefc5b2532 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4201,6 +4201,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, -- cgit v1.2.3 From 3ec9a4c81c8cc2a8d9673588dd84d9cc7c31019b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Jan 2019 10:21:01 -0500 Subject: btrfs: run delayed iputs before committing Delayed iputs means we can have final iputs of deleted inodes in the queue, which could potentially generate a lot of pinned space that could be free'd. So before we decide to commit the transaction for ENOPSC reasons, run the delayed iputs so that any potential space is free'd up. If there is and we freed enough we can then commit the transaction and potentially be able to make our reservation. Reviewed-by: Omar Sandoval Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6f6fae410fc1..d81035b7ea7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4952,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + mutex_lock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = may_commit_transaction(fs_info, space_info); break; default: -- cgit v1.2.3 From fd340d0f68cc87badfc9efcb226f23a5428826a0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Jan 2019 10:21:02 -0500 Subject: btrfs: wakeup cleaner thread when adding delayed iput The cleaner thread usually takes care of delayed iputs, with the exception of the btrfs_end_transaction_throttle path. Delaying iputs means we are potentially delaying the eviction of an inode and it's respective space. The cleaner thread only gets woken up every 30 seconds, or when we require space. If there are a lot of inodes that need to be deleted we could induce a serious amount of latency while we wait for these inodes to be evicted. So instead wakeup the cleaner if it's not already awake to process any new delayed iputs we add to the list. If we suddenly need space we will less likely be backed up behind a bunch of inodes that are waiting to be deleted, and we could possibly free space before we need to get into the flushing logic which will save us some latency. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 3 +++ fs/btrfs/inode.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 34019c8b6158..8b1d06fa222d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -787,6 +787,9 @@ enum { * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, }; struct btrfs_fs_info { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 18eefc5b2532..6a2a2a951705 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg) while (1) { again = 0; + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; @@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6025b5d0b25..5c349667c761 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3251,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode) ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 5631e8576a3caf606cdc375f97425a67983b420c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 20 Jan 2019 14:33:34 -0800 Subject: pstore/ram: Avoid allocation and leak of platform data Yue Hu noticed that when parsing device tree the allocated platform data was never freed. Since it's not used beyond the function scope, this switches to using a stack variable instead. Reported-by: Yue Hu Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/pstore/ram.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 076e26fdc0c0..898c8321b343 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -710,18 +710,15 @@ static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = dev->platform_data; + struct ramoops_platform_data pdata_local; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; if (dev_of_node(dev) && !pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - pr_err("cannot allocate platform data buffer\n"); - err = -ENOMEM; - goto fail_out; - } + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); err = ramoops_parse_dt(pdev, pdata); if (err < 0) -- cgit v1.2.3 From d95e674c01cfb5461e8b9fdeebf6d878c9b80b2f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 10 Jan 2019 15:41:09 +0800 Subject: ceph: clear inode pointer when snap realm gets dropped by its inode snap realm and corresponding inode have pointers to each other. The two pointer should get clear at the same time. Otherwise, snap realm's pointer may reference freed inode. Cc: stable@vger.kernel.org # 4.17+ Signed-off-by: "Yan, Zheng" Reviewed-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94c026bba2c2..bba28a5034ba 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1035,6 +1035,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci) list_del_init(&ci->i_snap_realm_item); ci->i_snap_realm_counter++; ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, realm); -- cgit v1.2.3 From 74827ee29565f86e2a64495a5e3e58d3371d74ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jan 2019 00:14:22 +0100 Subject: ceph: quota: cleanup license mess Precise and non-ambiguous license information is important. The recently added quota.c file has a SPDX license identifier, which is nice, but at the same time it has a contradictionary license boiler plate text. SPDX-License-Identifier: GPL-2.0 versus * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. Oh well. As the other ceph related files are licensed under the GPL v2 only, it's assumed that the SPDX id is correct and the boiler plate was randomly copied into that patch. Remove the boiler plate as it is wrong and even if correct it is redundant. Fixes: fb18a57568c2 ("ceph: quota: add initial infrastructure to support cephfs quotas") Signed-off-by: Thomas Gleixner Cc: Luis Henriques Cc: Jiri Kosina Cc: "Yan, Zheng" Cc: Sage Weil Cc: Ilya Dryomov Cc: ceph-devel@vger.kernel.org Acked-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 03f4d24db8fe..9455d3aef0c3 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -3,19 +3,6 @@ * quota.c - CephFS quota * * Copyright (C) 2017-2018 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include -- cgit v1.2.3 From 8b9433eb4de3c26a9226c981c283f9f4896ae030 Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Mon, 8 Oct 2018 20:58:23 -0300 Subject: direct-io: allow direct writes to empty inodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a DIO_SKIP_HOLES filesystem, the ->get_block() method is currently not allowed to create blocks for an empty inode. This confusion comes from trying to bit shift a negative number, so check the size of the inode first. The problem is most visible for hfsplus, because the fallback to buffered I/O doesn't happen and the write fails with EIO. This is in part the fault of the module, because it gives a wrong return value on ->get_block(); that will be fixed in a separate patch. Reviewed-by: Jeff Moyer Reviewed-by: Jan Kara Signed-off-by: Ernesto A. Fernández Signed-off-by: Jens Axboe --- fs/direct-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index dbc1a1f080ce..ec2fb6fe6d37 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> - i_blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } -- cgit v1.2.3 From 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Dec 2017 08:38:30 -0800 Subject: writeback: synchronize sync(2) against cgroup writeback membership switches sync_inodes_sb() can race against cgwb (cgroup writeback) membership switches and fail to writeback some inodes. For example, if an inode switches to another wb while sync_inodes_sb() is in progress, the new wb might not be visible to bdi_split_work_to_wbs() at all or the inode might jump from a wb which hasn't issued writebacks yet to one which already has. This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb switch path against sync_inodes_sb() so that sync_inodes_sb() is guaranteed to see all the target wbs and inodes can't jump wbs to escape syncing. v2: Fixed misplaced rwsem init. Spotted by Jiufei. Signed-off-by: Tejun Heo Reported-by: Jiufei Xue Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++++++++++-- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 1 + 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b40168fcc94a..36855c1f8daf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct page *page; bool switched = false; + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -428,6 +445,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c31157135598..07e02d6df5ad 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -190,6 +190,7 @@ struct backing_dev_info { struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ + struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a8bb8796c6c..72e6d0c55cfa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { -- cgit v1.2.3 From 73aaf920cc72024c4a4460cfa46d56e5014172f3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 16 Jan 2019 16:28:59 +0000 Subject: cifs: fix memory leak of an allocated cifs_ntsd structure The call to SMB2_queary_acl can allocate memory to pntsd and also return a failure via a call to SMB2_query_acl (and then query_info). This occurs when query_info allocates the structure and then in query_info the call to smb2_validate_and_copy_iov fails. Currently the failure just returns without kfree'ing pntsd hence causing a memory leak. Currently, *data is allocated if it's not already pointing to a buffer, so it needs to be kfree'd only if was allocated in query_info, so the fix adds an allocated flag to track this. Also set *dlen to zero on an error just to be safe since *data is kfree'd. Also set errno to -ENOMEM if the allocation of *data fails. Signed-off-by: Colin Ian King Signed-off-by: Steve French Reviewed-by: Dan Carpener --- fs/cifs/smb2pdu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 50811a7dc0e0..0af87bd0dc49 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2816,6 +2816,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; int flags = 0; + bool allocated = false; cifs_dbg(FYI, "Query Info\n"); @@ -2855,14 +2856,21 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, "Error %d allocating memory for acl\n", rc); *dlen = 0; + rc = -ENOMEM; goto qinf_exit; } + allocated = true; } } rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, min_len, *data); + if (rc && allocated) { + kfree(*data); + *data = NULL; + *dlen = 0; + } qinf_exit: SMB2_query_info_free(&rqst); -- cgit v1.2.3 From acc58d0bab55a50e02c25f00bd6a210ee121595f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 08:21:24 -0800 Subject: CIFS: Fix possible hang during async MTU reads and writes When doing MTU i/o we need to leave some credits for possible reopen requests and other operations happening in parallel. Currently we leave 1 credit which is not enough even for reopen only: we need at least 2 credits if durable handle reconnect fails. Also there may be other operations at the same time including compounding ones which require 3 credits at a time each. Fix this by leaving 8 credits which is big enough to cover most scenarios. Was able to reproduce this when server was configured to give out fewer credits than usual. The proper fix would be to reconnect a file handle first and then obtain credits for an MTU request but this leads to bigger code changes and should happen in other patches. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cf7eb891804f..1d3ce127b02e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -165,14 +165,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); -- cgit v1.2.3 From b0b2cac7e244629e1a84a26d7eabb885fed7ff68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jan 2019 00:14:23 +0100 Subject: smb3: Cleanup license mess Precise and non-ambiguous license information is important. The recently added aegis header file has a SPDX license identifier, which is nice, but at the same time it has a contradictionary license boiler plate text. SPDX-License-Identifier: GPL-2.0 versus * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. Oh well. Assuming that the SPDX identifier is correct and according to x86/hyper-v contributions from Microsoft GPL V2 only is the usual license. Remove the boiler plate as it is wrong and even if correct it is redundant. Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging") Signed-off-by: Thomas Gleixner Cc: Steve French Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/trace.c | 10 ---------- fs/cifs/trace.h | 10 ---------- 2 files changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c index bd4a546feec1..465483787193 100644 --- a/fs/cifs/trace.c +++ b/fs/cifs/trace.c @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index fb049809555f..59be48206932 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cifs -- cgit v1.2.3 From ef68e831840c40c7d01b328b3c0f5d8c4796c232 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 17:25:36 -0800 Subject: CIFS: Do not reconnect TCP session in add_credits() When executing add_credits() we currently call cifs_reconnect() if the number of credits is zero and there are no requests in flight. In this case we may call cifs_reconnect() recursively twice and cause memory corruption given the following sequence of functions: mid1.callback() -> add_credits() -> cifs_reconnect() -> -> mid2.callback() -> add_credits() -> cifs_reconnect(). Fix this by avoiding to call cifs_reconnect() in add_credits() and checking for zero credits in the demultiplex thread. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/connect.c | 21 +++++++++++++++++++++ fs/cifs/smb2ops.c | 32 +++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 683310f26171..8463c940e0e5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -720,6 +720,21 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +static inline bool +zero_credits(struct TCP_Server_Info *server) +{ + int val; + + spin_lock(&server->req_lock); + val = server->credits + server->echo_credits + server->oplock_credits; + if (server->in_flight == 0 && val == 0) { + spin_unlock(&server->req_lock); + return true; + } + spin_unlock(&server->req_lock); + return false; +} + static int cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { @@ -732,6 +747,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); + /* reconnect if no credits and no requests in flight */ + if (zero_credits(server)) { + cifs_reconnect(server); + return -ECONNABORTED; + } + if (server_unresponsive(server)) return -ECONNABORTED; if (cifs_rdma_enabled(server) && server->smbd_conn) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1d3ce127b02e..fa8d2e1076c8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -34,6 +34,7 @@ #include "cifs_ioctl.h" #include "smbdirect.h" +/* Change credits for different ops and return the total number of credits */ static int change_conf(struct TCP_Server_Info *server) { @@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server) server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: - return -1; + return 0; case 1: server->echoes = false; server->oplocks = false; - cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server) server->echo_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; - return 0; + return server->credits + server->echo_credits + server->oplock_credits; } static void smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, const int optype) { - int *val, rc = 0; + int *val, rc = -1; + spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); @@ -101,8 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, } spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (rc) - cifs_reconnect(server); + + if (server->tcpStatus == CifsNeedReconnect) + return; + + switch (rc) { + case -1: + /* change_conf hasn't been executed */ + break; + case 0: + cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + break; + case 1: + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); + } } static void -- cgit v1.2.3 From 8004c78c68e894e4fd5ac3c22cc22eb7dc24cabc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 15:29:26 -0800 Subject: CIFS: Fix credits calculations for reads with errors Currently we mark MID as malformed if we get an error from server in a read response. This leads to not properly processing credits in the readv callback. Fix this by marking such a response as normal received response and process it appropriately. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e18915415e13..bb54ccf8481c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1549,18 +1549,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) } static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) { int length; - struct cifs_readdata *rdata = mid->callback_data; length = cifs_discard_remaining_data(server); - dequeue_mid(mid, rdata->result); + dequeue_mid(mid, malformed); mid->resp_buf = server->smallbuf; server->smallbuf = NULL; return length; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1602,12 +1610,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; } + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - return cifs_readv_discard(server, mid); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); } /* Is there enough to get to the rest of the READ_RSP header? */ @@ -1651,14 +1670,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", - rdata->iov[0].iov_base, server->total_read); - /* how much data is in the response? */ #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; -- cgit v1.2.3 From ec678eae746dd25766a61c4095e2b649d3b20b09 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:38:11 -0800 Subject: CIFS: Fix credit calculation for encrypted reads with errors We do need to account for credits received in error responses to read requests on encrypted sessions. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index fa8d2e1076c8..73f9c6af4065 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3207,11 +3207,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, server->ops->is_status_pending(buf, server, 0)) return -1; - rdata->result = server->ops->map_error(buf, false); + /* set up first two iov to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = + min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + rdata->result = server->ops->map_error(buf, true); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - dequeue_mid(mid, rdata->result); + /* normal error on read response */ + dequeue_mid(mid, false); return 0; } @@ -3284,14 +3296,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, server->vals->read_rsp_size); - length = rdata->copy_into_pages(server, rdata, &iter); kfree(bvec); -- cgit v1.2.3 From 3d3003fce8e837acc4e3960fe3cbabebc356dcb5 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 22 Jan 2019 16:50:21 -0800 Subject: CIFS: Fix credit calculations in compound mid callback The current code doesn't do proper accounting for credits in SMB1 case: it adds one credit per response only if we get a complete response while it needs to return it unconditionally. Fix this and also include malformed responses for SMB2+ into accounting for credits because such responses have Credit Granted field, thus nothing prevents to get a proper credit value from them. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 6 +++++- fs/cifs/transport.c | 11 +---------- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 73f9c6af4065..153238fc4fa9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -154,7 +154,11 @@ smb2_get_credits(struct mid_q_entry *mid) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; - return le16_to_cpu(shdr->CreditRequest); + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) + return le16_to_cpu(shdr->CreditRequest); + + return 0; } static int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 202e0e84efdd..53532bd3f50d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -786,17 +786,8 @@ static void cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; - unsigned int optype = mid->optype; - unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) { - if (mid->resp_buf) - credits_received = server->ops->get_credits(mid); - else - cifs_dbg(FYI, "Bad state for cancelled MID\n"); - } - - add_credits(server, credits_received, optype); + add_credits(server, server->ops->get_credits(mid), mid->optype); } static void -- cgit v1.2.3 From 0fd1d37b0501efc6e295f56ab55cdaff784aa50c Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 15 Jan 2019 15:08:48 -0800 Subject: CIFS: Do not assume one credit for async responses If we don't receive a response we can't assume that the server granted one credit. Assume zero credits in such cases. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0af87bd0dc49..2ff209ec4fab 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2924,9 +2924,10 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); @@ -3183,7 +3184,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 1; + unsigned int credits_received = 0; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3222,6 +3223,9 @@ smb2_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(shdr->CreditRequest); + /* fall through */ default: if (rdata->result != -ENODATA) rdata->result = -EIO; @@ -3407,7 +3411,7 @@ smb2_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -3435,6 +3439,9 @@ smb2_writev_callback(struct mid_q_entry *mid) case MID_RETRY_NEEDED: wdata->result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + /* fall through */ default: wdata->result = -EIO; break; -- cgit v1.2.3 From 6a9cbdd1ceca1dc2359ddf082efe61b97c3e752b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 16 Jan 2019 11:48:42 -0800 Subject: CIFS: Fix mounts if the client is low on credits If the server doesn't grant us at least 3 credits during the mount we won't be able to complete it because query path info operation requires 3 credits. Use the cached file handle if possible to allow the mount to succeed. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index f14533da3a93..01a76bccdb8d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -293,6 +293,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; + struct cifs_fid fid; + bool no_cached_open = tcon->nohandlecache; *adjust_tz = false; *symlink = false; @@ -301,6 +303,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; + + /* If it is a root and its handle is cached then use it */ + if (!strlen(full_path) && !no_cached_open) { + rc = open_shroot(xid, tcon, &fid); + if (rc) + goto out; + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + close_shroot(&tcon->crfid); + if (rc) + goto out; + move_smb2_info_to_cifs(data, smb2_data); + goto out; + } + if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; -- cgit v1.2.3 From 2e5700bdde438ed708b36d8acd0398dc73cbf759 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 23 Jan 2019 16:20:38 +1000 Subject: smb3: add credits we receive from oplock/break PDUs Otherwise we gradually leak credits leading to potential hung session. Signed-off-by: Ronnie Sahlberg CC: Stable Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a9c47541c53..7b8b58fb4d3f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; + if (rsp->sync_hdr.CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) -- cgit v1.2.3 From a5f1a81f701c594194eb70c679785882ab15f138 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 24 Jan 2019 16:19:31 +1000 Subject: cifs: print CIFSMaxBufSize as part of /proc/fs/cifs/DebugData Was helpful in debug for some recent problems. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 593fb422d0f3..e92a2fee3c57 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -252,6 +252,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, ",ACL"); #endif seq_putc(m, '\n'); + seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); -- cgit v1.2.3 From 8e47a457321ca1a74ad194ab5dcbca764bc70731 Mon Sep 17 00:00:00 2001 From: Piotr Jaroszynski Date: Sun, 27 Jan 2019 08:46:45 -0800 Subject: iomap: get/put the page in iomap_page_create/release() migrate_page_move_mapping() expects pages with private data set to have a page_count elevated by 1. This is what used to happen for xfs through the buffer_heads code before the switch to iomap in commit 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads"). Not having the count elevated causes move_pages() to fail on memory mapped files coming from xfs. Make iomap compatible with the migrate_page_move_mapping() assumption by elevating the page count as part of iomap_page_create() and lowering it in iomap_page_release(). It causes the move_pages() syscall to misbehave on memory mapped files from xfs. It does not not move any pages, which I suppose is "just" a perf issue, but it also ends up returning a positive number which is out of spec for the syscall. Talking to Michal Hocko, it sounds like returning positive numbers might be a necessary update to move_pages() anyway though. Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads") Signed-off-by: Piotr Jaroszynski [hch: actually get/put the page iomap_migrate_page() to make it work properly] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index a3088fae567b..cb184ff68680 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -116,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -132,6 +138,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } @@ -569,8 +576,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (page_has_private(page)) { ClearPagePrivate(page); + get_page(newpage); set_page_private(newpage, page_private(page)); set_page_private(page, 0); + put_page(page); SetPagePrivate(newpage); } -- cgit v1.2.3 From 4ea899ead2786a30aaa8181fefa81a3df4ad28f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jan 2019 08:58:58 -0800 Subject: iomap: fix a use after free in iomap_dio_rw Introduce a local wait_for_completion variable to avoid an access to the potentially freed dio struture after dropping the last reference count. Also use the chance to document the completion behavior to make the refcounting clear to the reader of the code. Fixes: ff6a9292e6 ("iomap: implement direct I/O") Reported-by: Chandan Rajendra Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Tested-by: Chandan Rajendra Tested-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index cb184ff68680..897c60215dd1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1813,6 +1813,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -1832,7 +1833,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->end_io = end_io; dio->error = 0; dio->flags = 0; - dio->wait_for_completion = is_sync_kiocb(iocb); dio->submit.iter = iter; dio->submit.waiter = current; @@ -1887,7 +1887,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; - if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) @@ -1903,7 +1903,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret <= 0) { /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { - dio->wait_for_completion = true; + wait_for_completion = true; ret = 0; } break; @@ -1925,8 +1925,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!dio->wait_for_completion) + if (!wait_for_completion) return -EIOCBQUEUED; for (;;) { @@ -1943,9 +1959,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - ret = iomap_dio_complete(dio); - - return ret; + return iomap_dio_complete(dio); out_free_dio: kfree(dio); -- cgit v1.2.3 From a6279470762c19ba97e454f90798373dccdf6148 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 25 Jan 2019 11:48:51 +0000 Subject: Btrfs: fix deadlock when allocating tree block during leaf/node split When splitting a leaf or node from one of the trees that are modified when flushing pending block groups (extent, chunk, device and free space trees), we need to allocate a new tree block, which in turn can result in the need to allocate a new block group. After allocating the new block group we may need to flush new block groups that were previously allocated during the course of the current transaction, which is what may cause a deadlock due to attempts to write lock twice the same leaf or node, as when splitting a leaf or node we are holding a write lock on it and its parent node. The same type of deadlock can also happen when increasing the tree's height, since we are holding a lock on the existing root while allocating the tree block to use as the new root node. An example trace when the deadlock happens during the leaf split path is: [27175.293054] CPU: 0 PID: 3005 Comm: kworker/u17:6 Tainted: G W 4.19.16 #1 [27175.293942] Hardware name: Penguin Computing Relion 1900/MD90-FS0-ZB-XX, BIOS R15 06/25/2018 [27175.294846] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] (...) [27175.298384] RSP: 0018:ffffab2087107758 EFLAGS: 00010246 [27175.299269] RAX: 0000000000000bbd RBX: ffff9fadc7141c48 RCX: 0000000000000001 [27175.300155] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9fadc7141c48 [27175.301023] RBP: 0000000000000001 R08: ffff9faeb6ac1040 R09: ffff9fa9c0000000 [27175.301887] R10: 0000000000000000 R11: 0000000000000040 R12: ffff9fb21aac8000 [27175.302743] R13: ffff9fb1a64d6a20 R14: 0000000000000001 R15: ffff9fb1a64d6a18 [27175.303601] FS: 0000000000000000(0000) GS:ffff9fb21fa00000(0000) knlGS:0000000000000000 [27175.304468] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [27175.305339] CR2: 00007fdc8743ead8 CR3: 0000000763e0a006 CR4: 00000000003606f0 [27175.306220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [27175.307087] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [27175.307940] Call Trace: [27175.308802] btrfs_search_slot+0x779/0x9a0 [btrfs] [27175.309669] ? update_space_info+0xba/0xe0 [btrfs] [27175.310534] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.311397] btrfs_insert_item+0x60/0xd0 [btrfs] [27175.312253] btrfs_create_pending_block_groups+0xee/0x210 [btrfs] [27175.313116] do_chunk_alloc+0x25f/0x300 [btrfs] [27175.313984] find_free_extent+0x706/0x10d0 [btrfs] [27175.314855] btrfs_reserve_extent+0x9b/0x1d0 [btrfs] [27175.315707] btrfs_alloc_tree_block+0x100/0x5b0 [btrfs] [27175.316548] split_leaf+0x130/0x610 [btrfs] [27175.317390] btrfs_search_slot+0x94d/0x9a0 [btrfs] [27175.318235] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.319087] alloc_reserved_file_extent+0x84/0x2c0 [btrfs] [27175.319938] __btrfs_run_delayed_refs+0x596/0x1150 [btrfs] [27175.320792] btrfs_run_delayed_refs+0xed/0x1b0 [btrfs] [27175.321643] delayed_ref_async_start+0x81/0x90 [btrfs] [27175.322491] normal_work_helper+0xd0/0x320 [btrfs] [27175.323328] ? move_linked_works+0x6e/0xa0 [27175.324160] process_one_work+0x191/0x370 [27175.324976] worker_thread+0x4f/0x3b0 [27175.325763] kthread+0xf8/0x130 [27175.326531] ? rescuer_thread+0x320/0x320 [27175.327284] ? kthread_create_worker_on_cpu+0x50/0x50 [27175.328027] ret_from_fork+0x35/0x40 [27175.328741] ---[ end trace 300a1b9f0ac30e26 ]--- Fix this by preventing the flushing of new blocks groups when splitting a leaf/node and when inserting a new root node for one of the trees modified by the flushing operation, similar to what is done when COWing a node/leaf from on of these trees. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202383 Reported-by: Eli V CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 78 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f64aad613727..5a6c39b44c84 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -968,6 +968,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return 0; } +static struct extent_buffer *alloc_tree_block_no_bg_flush( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent_start, + const struct btrfs_disk_key *disk_key, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *ret; + + /* + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. + * For similar reasons, we also need to delay flushing pending block + * groups when splitting a leaf or node, from one of those trees, since + * we are holding a write lock on it and its parent or when inserting a + * new root node for one of those trees. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root || + root == fs_info->free_space_root) + trans->can_flush_pending_bgs = false; + + ret = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, disk_key, level, + hint, empty_size); + trans->can_flush_pending_bgs = true; + + return ret; +} + /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -1015,28 +1057,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; - /* - * If we are COWing a node/leaf from the extent, chunk, device or free - * space trees, make sure that we do not finish block group creation of - * pending block groups. We do this to avoid a deadlock. - * COWing can result in allocation of a new chunk, and flushing pending - * block groups (btrfs_create_pending_block_groups()) can be triggered - * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk, device and free space trees, - * therefore we could deadlock with ourselves since we are holding a - * lock on an extent buffer that btrfs_create_pending_block_groups() may - * try to COW later. - */ - if (root == fs_info->extent_root || - root == fs_info->chunk_root || - root == fs_info->dev_root || - root == fs_info->free_space_root) - trans->can_flush_pending_bgs = false; - - cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size); - trans->can_flush_pending_bgs = true; + cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3345,8 +3367,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &lower_key, level, root->node->start, 0); + c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, + root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3475,8 +3497,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, + c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4260,8 +4282,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, + l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); -- cgit v1.2.3 From 302167c50b32e7fccc98994a91d40ddbbab04e52 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2019 09:31:43 -0500 Subject: btrfs: don't end the transaction for delayed refs in throttle Previously callers to btrfs_end_transaction_throttle() would commit the transaction if there wasn't enough delayed refs space. This happens in relocation, and if the fs is relatively empty we'll run out of delayed refs space basically immediately, so we'll just be stuck in this loop of committing the transaction over and over again. This code existed because we didn't have a good feedback mechanism for running delayed refs, but with the delayed refs rsv we do now. Delete this throttling code and let the btrfs_start_transaction() in relocation deal with putting pressure on the delayed refs infrastructure. With this patch we no longer take 5 minutes to balance a metadata only fs. Qu has submitted a fstest to catch slow balance or excessive transaction commits. Steps to reproduce: * create subvolume * create many (eg. 16000) inlined files, of size 2KiB * iteratively snapshot and touch several files to trigger metadata updates * start balance -m Reported-by: Qu Wenruo Fixes: 64403612b73a ("btrfs: rework btrfs_check_space_for_delayed_refs") Signed-off-by: Josef Bacik [ add tags and steps to reproduce ] Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 127fa1535f58..f15cf46f1b9d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -850,14 +850,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && should_end_transaction(trans) && - READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { - spin_lock(&info->trans_lock); - if (cur_trans->state == TRANS_STATE_RUNNING) - cur_trans->state = TRANS_STATE_BLOCKED; - spin_unlock(&info->trans_lock); - } - if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) return btrfs_commit_transaction(trans); -- cgit v1.2.3 From 80ff00172407e0aad4b10b94ef0816fc3e7813cb Mon Sep 17 00:00:00 2001 From: Yao Liu Date: Mon, 28 Jan 2019 19:44:14 +0800 Subject: nfs: Fix NULL pointer dereference of dev_name There is a NULL pointer dereference of dev_name in nfs_parse_devname() The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ... RIP: 0010:nfs_fs_mount+0x3b6/0xc20 [nfs] ... Call Trace: ? ida_alloc_range+0x34b/0x3d0 ? nfs_clone_super+0x80/0x80 [nfs] ? nfs_free_parsed_mount_data+0x60/0x60 [nfs] mount_fs+0x52/0x170 ? __init_waitqueue_head+0x3b/0x50 vfs_kern_mount+0x6b/0x170 do_mount+0x216/0xdc0 ksys_mount+0x83/0xd0 __x64_sys_mount+0x25/0x30 do_syscall_64+0x65/0x220 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this by adding a NULL check on dev_name Signed-off-by: Yao Liu Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 22ce3c8a2f46..0570391eaa16 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1895,6 +1895,11 @@ static int nfs_parse_devname(const char *dev_name, size_t len; char *end; + if (unlikely(!dev_name || !*dev_name)) { + dfprintk(MOUNT, "NFS: device name not specified\n"); + return -EINVAL; + } + /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); -- cgit v1.2.3 From 8fc75bed96bb94e23ca51bd9be4daf65c57697bf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 29 Jan 2019 15:52:55 -0500 Subject: NFS: Fix up return value on fatal errors in nfs_page_async_flush() Ensure that we return the fatal error value that caused us to exit nfs_page_async_flush(). Fixes: c373fff7bd25 ("NFSv4: Don't special case "launder"") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.12+ Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5a0bbf917a32..f12cb31a41e5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; + ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - } + } else + ret = -EAGAIN; nfs_redirty_request(req); - ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); -- cgit v1.2.3 From 58d15ed1203f4d858c339ea4d7dafa94bd2a56d3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 29 Jan 2019 12:46:16 +1000 Subject: cifs: fix computation for MAX_SMB2_HDR_SIZE The size of the fixed part of the create response is 88 bytes not 56. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2pdu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7a2d0a2255e6..e9cc6775df21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ -#define MAX_SMB2_HDR_SIZE 0x00b0 +/* 52 transform hdr + 64 hdr + 88 create rsp */ +#define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -- cgit v1.2.3 From c4627e66f73a28c5515b908c90c2bf7120086497 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 29 Jan 2019 12:46:17 +1000 Subject: cifs: limit amount of data we request for xattrs to CIFSMaxBufSize minus the various headers and blobs that will be part of the reply. or else we might trigger a session reconnect. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 4 +++- fs/cifs/smb2pdu.h | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 153238fc4fa9..6f96e2292856 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -866,7 +866,9 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, - SMB2_MAX_EA_BUF, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE, &rsp_iov, &buftype, cifs_sb); if (rc) { /* diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index e9cc6775df21..538e2299805f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -85,6 +85,7 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* 52 transform hdr + 64 hdr + 88 create rsp */ +#define SMB2_TRANSFORM_HEADER_SIZE 52 #define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) @@ -648,6 +649,13 @@ struct smb2_create_req { __u8 Buffer[0]; } __packed; +/* + * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + + * 88 (fixed part of create response) + 520 (path) + 150 (contexts) + + * 2 bytes of padding. + */ +#define MAX_SMB2_CREATE_RESPONSE_SIZE 824 + struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ @@ -996,6 +1004,11 @@ struct smb2_close_req { __u64 VolatileFileId; /* opaque endianness */ } __packed; +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + struct smb2_close_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* 60 */ @@ -1398,8 +1411,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ -#define SMB2_MAX_EA_BUF 65536 - struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; __u8 flags; -- cgit v1.2.3 From 9bda8723da2d55b1de833b98cf802b88006e5b69 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 23 Jan 2019 17:12:09 -0800 Subject: CIFS: Fix possible oops and memory leaks in async IO Allocation of a page array for non-cached IO was separated from allocation of rdata and wdata structures and this introduced memory leaks and a possible null pointer dereference. This patch fixes these problems. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2c7689f3998d..659ce1b92c44 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2696,6 +2696,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -2707,6 +2708,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -3386,8 +3388,12 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, } rc = cifs_read_allocate_pages(rdata, npages); - if (rc) - goto error; + if (rc) { + kvfree(rdata->pages); + kfree(rdata); + add_credits_and_wake_if(server, credits, 0); + break; + } rdata->tailsz = PAGE_SIZE; } @@ -3407,7 +3413,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (!rdata->cfile->invalidHandle || !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); -error: if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, -- cgit v1.2.3 From 7d42e72fe8ee5ab70b1af843dd7d8615e6fb0abe Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 25 Jan 2019 11:38:53 -0800 Subject: CIFS: Fix trace command logging for SMB2 reads and writes Currently we log success once we send an async IO request to the server. Instead we need to analyse a response and then log success or failure for a particular command. Also fix argument list for read logging. Cc: # 4.18 Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2ff209ec4fab..4b5ab9c80cc3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3241,8 +3241,17 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) + if (rdata->result) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); + trace_smb3_read_err(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, rdata->offset, + rdata->bytes, rdata->result); + } else + trace_smb3_read_done(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); @@ -3317,13 +3326,11 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); - } else - trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); + trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length, rc); + } cifs_small_buf_release(buf); return rc; @@ -3367,10 +3374,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); + trace_smb3_read_err(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length, + rc); } - trace_smb3_read_err(rc, xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; } else @@ -3459,8 +3467,17 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->mr = NULL; } #endif - if (wdata->result) + if (wdata->result) { cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + trace_smb3_write_err(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, wdata->result); + } else + trace_smb3_write_done(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); @@ -3602,10 +3619,7 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } else - trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes); + } async_writev_out: cifs_small_buf_release(req); -- cgit v1.2.3 From 8e6e72aeceaaed5aeeb1cb43d3085de7ceb14f79 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 26 Jan 2019 12:21:32 -0800 Subject: CIFS: Do not count -ENODATA as failure for query directory Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4b5ab9c80cc3..d858dc04fdc3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3845,8 +3845,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } -- cgit v1.2.3 From 082aaa8700415f6471ec9c5ef0c8307ca214989a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:54:34 -0800 Subject: CIFS: Do not consider -ENODATA as stat failure for reads When doing reads beyound the end of a file the server returns error STATUS_END_OF_FILE error which is mapped to -ENODATA. Currently we report it as a failure which confuses read stats. Change it to not consider -ENODATA as failure for stat purposes. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d858dc04fdc3..ef52d6642431 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3241,7 +3241,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) { + if (rdata->result && rdata->result != -ENODATA) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); trace_smb3_read_err(0 /* xid */, rdata->cfile->fid.persistent_fid, -- cgit v1.2.3 From 92900e5160a5444d47dd376bc40066b709fbb5a6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Jan 2019 04:58:00 +0000 Subject: btrfs: fix potential oops in device_list_add alloc_fs_devices() can return ERR_PTR(-ENOMEM), so dereferencing its result before the check for IS_ERR() is a bad idea. Fixes: d1a63002829a4 ("btrfs: add members to fs_devices to track fsid changes") Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: Al Viro Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e4f8f88353e..15561926ab32 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -957,11 +957,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, else fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - fs_devices->fsid_change = fsid_change_in_progress; - if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); + fs_devices->fsid_change = fsid_change_in_progress; + mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); -- cgit v1.2.3 From c7cc64a98512ffc41df86d14a414eb3b09bf7481 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 23 Jan 2019 17:09:16 +0100 Subject: btrfs: clean up pending block groups when transaction commit aborts The fstests generic/475 stresses transaction aborts and can reveal space accounting or use-after-free bugs regarding block goups. In this case the pending block groups that remain linked to the structures after transaction commit aborts in the middle. The corrupted slabs lead to failures in following tests, eg. generic/476 [ 8172.752887] BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 [ 8172.755799] #PF error: [normal kernel read fault] [ 8172.757571] PGD 661ae067 P4D 661ae067 PUD 3db8e067 PMD 0 [ 8172.759000] Oops: 0000 [#1] PREEMPT SMP [ 8172.760209] CPU: 0 PID: 39 Comm: kswapd0 Tainted: G W 5.0.0-rc2-default #408 [ 8172.762495] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8172.765772] RIP: 0010:shrink_page_list+0x2f9/0xe90 [ 8172.770453] RSP: 0018:ffff967f00663b18 EFLAGS: 00010287 [ 8172.771184] RAX: 0000000000000000 RBX: ffff967f00663c20 RCX: 0000000000000000 [ 8172.772850] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8c0620ab20e0 [ 8172.774629] RBP: ffff967f00663dd8 R08: 0000000000000000 R09: 0000000000000000 [ 8172.776094] R10: ffff8c0620ab22f8 R11: ffff8c063f772688 R12: ffff967f00663b78 [ 8172.777533] R13: ffff8c063f625600 R14: ffff8c063f625608 R15: dead000000000200 [ 8172.778886] FS: 0000000000000000(0000) GS:ffff8c063d400000(0000) knlGS:0000000000000000 [ 8172.780545] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8172.781787] CR2: 0000000000000058 CR3: 000000004e962000 CR4: 00000000000006f0 [ 8172.783547] Call Trace: [ 8172.784112] shrink_inactive_list+0x194/0x410 [ 8172.784747] shrink_node_memcg.constprop.85+0x3a5/0x6a0 [ 8172.785472] shrink_node+0x62/0x1e0 [ 8172.786011] balance_pgdat+0x216/0x460 [ 8172.786577] kswapd+0xe3/0x4a0 [ 8172.787085] ? finish_wait+0x80/0x80 [ 8172.787795] ? balance_pgdat+0x460/0x460 [ 8172.788799] kthread+0x116/0x130 [ 8172.789640] ? kthread_create_on_node+0x60/0x60 [ 8172.790323] ret_from_fork+0x24/0x30 [ 8172.794253] CR2: 0000000000000058 or accounting errors at umount time: [ 8159.537251] WARNING: CPU: 2 PID: 19031 at fs/btrfs/extent-tree.c:5987 btrfs_free_block_groups+0x3d5/0x410 [btrfs] [ 8159.543325] CPU: 2 PID: 19031 Comm: umount Tainted: G W 5.0.0-rc2-default #408 [ 8159.545472] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8159.548155] RIP: 0010:btrfs_free_block_groups+0x3d5/0x410 [btrfs] [ 8159.554030] RSP: 0018:ffff967f079cbde8 EFLAGS: 00010206 [ 8159.555144] RAX: 0000000001000000 RBX: ffff8c06366cf800 RCX: 0000000000000000 [ 8159.556730] RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff8c06255ad800 [ 8159.558279] RBP: ffff8c0637ac0000 R08: 0000000000000001 R09: 0000000000000000 [ 8159.559797] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8c0637ac0108 [ 8159.561296] R13: ffff8c0637ac0158 R14: 0000000000000000 R15: dead000000000100 [ 8159.562852] FS: 00007f7f693b9fc0(0000) GS:ffff8c063d800000(0000) knlGS:0000000000000000 [ 8159.564839] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8159.566160] CR2: 00007f7f68fab7b0 CR3: 000000000aec7000 CR4: 00000000000006e0 [ 8159.567898] Call Trace: [ 8159.568597] close_ctree+0x17f/0x350 [btrfs] [ 8159.569628] generic_shutdown_super+0x64/0x100 [ 8159.570808] kill_anon_super+0x14/0x30 [ 8159.571857] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8159.573063] deactivate_locked_super+0x29/0x60 [ 8159.574234] cleanup_mnt+0x3b/0x70 [ 8159.575176] task_work_run+0x98/0xc0 [ 8159.576177] exit_to_usermode_loop+0x83/0x90 [ 8159.577315] do_syscall_64+0x15b/0x180 [ 8159.578339] entry_SYSCALL_64_after_hwframe+0x49/0xbe This fix is based on 2 Josef's patches that used sideefects of btrfs_create_pending_block_groups, this fix introduces the helper that does what we need. CC: stable@vger.kernel.org # 4.4+ CC: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f15cf46f1b9d..4ec2b660d014 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1871,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) kmem_cache_free(btrfs_trans_handle_cachep, trans); } +/* + * Release reserved delayed ref space of all pending block groups of the + * transaction and remove them from the list + */ +static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } +} + static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* @@ -2262,6 +2277,7 @@ scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); + btrfs_cleanup_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); -- cgit v1.2.3 From 532b618bdf237250d6d4566536d4b6ce3d0a31fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 30 Jan 2019 07:54:12 -0600 Subject: btrfs: On error always free subvol_name in btrfs_mount The subvol_name is allocated in btrfs_parse_subvol_options and is consumed and freed in mount_subvol. Add a free to the error paths that don't call mount_subvol so that it is guaranteed that subvol_name is freed when an error happens. Fixes: 312c89fbca06 ("btrfs: cleanup btrfs_mount() using btrfs_mount_root()") Cc: stable@vger.kernel.org # v4.19+ Reviewed-by: Nikolay Borisov Signed-off-by: "Eric W. Biederman" Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 368a5b9e6c13..74023786a735 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1677,6 +1677,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags | SB_RDONLY, device_name, data); if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } @@ -1686,12 +1687,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error < 0) { root = ERR_PTR(error); mntput(mnt_root); + kfree(subvol_name); goto out; } } } if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } -- cgit v1.2.3 From 1dbd449c9943e3145148cc893c2461b72ba6fef0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Jan 2019 13:52:36 -0500 Subject: fs/dcache: Fix incorrect nr_dentry_unused accounting in shrink_dcache_sb() The nr_dentry_unused per-cpu counter tracks dentries in both the LRU lists and the shrink lists where the DCACHE_LRU_LIST bit is set. The shrink_dcache_sb() function moves dentries from the LRU list to a shrink list and subtracts the dentry count from nr_dentry_unused. This is incorrect as the nr_dentry_unused count will also be decremented in shrink_dentry_list() via d_shrink_del(). To fix this double decrement, the decrement in the shrink_dcache_sb() function is taken out. Fixes: 4e717f5c1083 ("list_lru: remove special case function list_lru_dispose_all." Cc: stable@kernel.org Signed-off-by: Waiman Long Reviewed-by: Dave Chinner Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2593153471cf..44e5652b2664 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1188,15 +1188,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } -- cgit v1.2.3 From af0c9af1b3f66052c369d08be3f60fa9a9559e48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Jan 2019 13:52:38 -0500 Subject: fs/dcache: Track & report number of negative dentries The current dentry number tracking code doesn't distinguish between positive & negative dentries. It just reports the total number of dentries in the LRU lists. As excessive number of negative dentries can have an impact on system performance, it will be wise to track the number of positive and negative dentries separately. This patch adds tracking for the total number of negative dentries in the system LRU lists and reports it in the 5th field in the /proc/sys/fs/dentry-state file. The number, however, does not include negative dentries that are in flight but not in the LRU yet as well as those in the shrinker lists which are on the way out anyway. The number of positive dentries in the LRU lists can be roughly found by subtracting the number of negative dentries from the unused count. Matthew Wilcox had confirmed that since the introduction of the dentry_stat structure in 2.1.60, the dummy array was there, probably for future extension. They were not replacements of pre-existing fields. So no sane applications that read the value of /proc/sys/fs/dentry-state will do dummy thing if the last 2 fields of the sysctl parameter are not zero. IOW, it will be safe to use one of the dummy array entry for negative dentry count. Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- Documentation/sysctl/fs.txt | 26 ++++++++++++++++---------- fs/dcache.c | 32 ++++++++++++++++++++++++++++++++ include/linux/dcache.h | 7 ++++--- 3 files changed, 52 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 819caf8ca05f..58649bd4fcfc 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -56,26 +56,32 @@ of any kernel data structures. dentry-state: -From linux/fs/dentry.c: +From linux/include/linux/dcache.h: -------------------------------------------------------------- -struct { +struct dentry_stat_t dentry_stat { int nr_dentry; int nr_unused; int age_limit; /* age in seconds */ int want_pages; /* pages requested by system */ - int dummy[2]; -} dentry_stat = {0, 0, 45, 0,}; --------------------------------------------------------------- - -Dentries are dynamically allocated and deallocated, and -nr_dentry seems to be 0 all the time. Hence it's safe to -assume that only nr_unused, age_limit and want_pages are -used. Nr_unused seems to be exactly what its name says. + int nr_negative; /* # of unused negative dentries */ + int dummy; /* Reserved for future use */ +}; +-------------------------------------------------------------- + +Dentries are dynamically allocated and deallocated. + +nr_dentry shows the total number of dentries allocated (active ++ unused). nr_unused shows the number of dentries that are not +actively used, but are saved in the LRU list for future reuse. + Age_limit is the age in seconds after which dcache entries can be reclaimed when memory is short and want_pages is nonzero when shrink_dcache_pages() has been called and the dcache isn't pruned yet. +nr_negative shows the number of unused dentries that are also +negative dentries which do not mapped to actual files. + ============================================================== dquot-max & dquot-nr: diff --git a/fs/dcache.c b/fs/dcache.c index 44e5652b2664..aac41adf4743 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); +static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) @@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } +static long get_nr_dentry_negative(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_negative, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); + dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -317,6 +329,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) @@ -371,6 +385,11 @@ static void dentry_unlink_inode(struct dentry * dentry) * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * + * The per-cpu "nr_dentry_negative" counters are only updated + * when deleted from or added to the per-superblock LRU list, not + * from/to the shrink list. That is to avoid an unneeded dec/inc + * pair when moving from LRU to shrink list in select_collect(). + * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ @@ -380,6 +399,8 @@ static void d_lru_add(struct dentry *dentry) D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -388,6 +409,8 @@ static void d_lru_del(struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -418,6 +441,8 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } @@ -426,6 +451,8 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } @@ -1816,6 +1843,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); + /* + * Decrement negative dentry count if it was in the LRU list. + */ + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ef4b70f64f33..60996e64c579 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -62,9 +62,10 @@ extern const struct qstr slash_name; struct dentry_stat_t { long nr_dentry; long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long dummy[2]; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; extern struct dentry_stat_t dentry_stat; -- cgit v1.2.3 From d339adc12a4f885b572c5412e4869af8939db854 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 31 Jan 2019 13:46:07 +0100 Subject: CIFS: fix use-after-free of the lease keys The request buffers are freed right before copying the pointers. Use the func args instead which are identical and still valid. Simple reproducer (requires KASAN enabled) on a cifs mount: echo foo > foo ; tail -f foo & rm foo Cc: # 4.20 Fixes: 179e44d49c2f ("smb3: add tracepoint for sending lease break responses to server") Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Paulo Alcantara --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ef52d6642431..77b3aaa39b35 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -4441,8 +4441,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - please_key_low = (__u64 *)req->LeaseKey; - please_key_high = (__u64 *)(req->LeaseKey+8); + please_key_low = (__u64 *)lease_key; + please_key_high = (__u64 *)(lease_key+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, -- cgit v1.2.3 From b9b9378b49030d1aeca2387660fcd1ac1f306cad Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Jan 2019 17:27:33 -0600 Subject: cifs: update internal module version number To 2.17 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d1f9c2f3f575..7652551a1fc4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.16" +#define CIFS_VERSION "2.17" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From e74c98ca2d6ae4376cc15fa2a22483430909d96b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 30 Jan 2019 21:30:36 +0100 Subject: gfs2: Revert "Fix loop in gfs2_rbm_find" This reverts commit 2d29f6b96d8f80322ed2dd895bca590491c38d34. It turns out that the fix can lead to a ~20 percent performance regression in initial writes to the page cache according to iozone. Let's revert this for now to have more time for a proper fix. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 831d7cb5a49c..17a8d3b43990 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1780,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { - n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; -- cgit v1.2.3 From 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 1 Feb 2019 14:20:01 -0800 Subject: proc: fix /proc/net/* after setns(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /proc entries under /proc/net/* can't be cached into dcache because setns(2) can change current net namespace. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: avoid vim miscolorization] [adobriyan@gmail.com: write test, add dummy ->d_revalidate hook: necessary if /proc/net/* is pinned at setns time] Link: http://lkml.kernel.org/r/20190108192350.GA12034@avx2 Link: http://lkml.kernel.org/r/20190107162336.GA9239@avx2 Fixes: 1da4d377f943fe4194ffb9fb9c26cc58fad4dd24 ("proc: revalidate misc dentries") Signed-off-by: Alexey Dobriyan Reported-by: Mateusz Stępień Reported-by: Ahmad Fatoum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 4 +- fs/proc/internal.h | 1 + fs/proc/proc_net.c | 20 +++++ tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/setns-dcache.c | 129 ++++++++++++++++++++++++++++ 6 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/proc/setns-dcache.c (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ae109429a88..e39bac94dead 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51e..95b14196f284 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d5e0fcb3439e..a7b12435519e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 82121a81681f..29bac5ef9a93 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -10,4 +10,5 @@ /proc-uptime-002 /read /self +/setns-dcache /thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 1c12c34cf85d..434d033ee067 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -14,6 +14,7 @@ TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read TEST_GEN_PROGS += self +TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/setns-dcache.c b/tools/testing/selftests/proc/setns-dcache.c new file mode 100644 index 000000000000..60ab197a73fc --- /dev/null +++ b/tools/testing/selftests/proc/setns-dcache.c @@ -0,0 +1,129 @@ +/* + * Copyright © 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that setns(CLONE_NEWNET) points to new /proc/net content even + * if old one is in dcache. + * + * FIXME /proc/net/unix is under CONFIG_UNIX which can be disabled. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pid_t pid = -1; + +static void f(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +int main(void) +{ + int fd[2]; + char _ = 0; + int nsfd; + + atexit(f); + + /* Check for priviledges and syscall availability straight away. */ + if (unshare(CLONE_NEWNET) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + /* Distinguisher between two otherwise empty net namespaces. */ + if (socket(AF_UNIX, SOCK_STREAM, 0) == -1) { + return 1; + } + + if (pipe(fd) == -1) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWNET) == -1) { + return 1; + } + + if (write(fd[1], &_, 1) != 1) { + return 1; + } + + pause(); + + return 0; + } + + if (read(fd[0], &_, 1) != 1) { + return 1; + } + + { + char buf[64]; + snprintf(buf, sizeof(buf), "/proc/%u/ns/net", pid); + nsfd = open(buf, O_RDONLY); + if (nsfd == -1) { + return 1; + } + } + + /* Reliably pin dentry into dcache. */ + (void)open("/proc/net/unix", O_RDONLY); + + if (setns(nsfd, CLONE_NEWNET) == -1) { + return 1; + } + + kill(pid, SIGTERM); + pid = 0; + + { + char buf[4096]; + ssize_t rv; + int fd; + + fd = open("/proc/net/unix", O_RDONLY); + if (fd == -1) { + return 1; + } + +#define S "Num RefCount Protocol Flags Type St Inode Path\n" + rv = read(fd, buf, sizeof(buf)); + + assert(rv == strlen(S)); + assert(memcmp(buf, S, strlen(S)) == 0); + } + + return 0; +} -- cgit v1.2.3 From c27d82f52f75fc9d8d9d40d120d2a96fdeeada5e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Feb 2019 14:21:23 -0800 Subject: fs/drop_caches.c: avoid softlockups in drop_pagecache_sb() When superblock has lots of inodes without any pagecache (like is the case for /proc), drop_pagecache_sb() will iterate through all of them without dropping sb->s_inode_list_lock which can lead to softlockups (one of our customers hit this). Fix the problem by going to the slow path and doing cond_resched() in case the process needs rescheduling. Link: http://lkml.kernel.org/r/20190114085343.15011-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 82377017130f..d31b6c72b476 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; -- cgit v1.2.3 From 63ce5f552beb9bdb41546b3a26c4374758b21815 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 1 Feb 2019 14:21:26 -0800 Subject: autofs: drop dentry reference only when it is never used autofs_expire_run() calls dput(dentry) to drop the reference count of dentry. However, dentry is read via autofs_dentry_ino(dentry) after that. This may result in a use-free-bug. The patch drops the reference count of dentry only when it is never used. Link: http://lkml.kernel.org/r/154725122396.11260.16053424107144453867.stgit@pluto-themaw-net Signed-off-by: Pan Bian Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index d441244b79df..28d9c2b1b3bb 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } -- cgit v1.2.3 From f585b283e3f025754c45bbe7533fc6e5c4643700 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 1 Feb 2019 14:21:29 -0800 Subject: autofs: fix error return in autofs_fill_super() In autofs_fill_super() on error of get inode/make root dentry the return should be ENOMEM as this is the only failure case of the called functions. Link: http://lkml.kernel.org/r/154725123240.11260.796773942606871359.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 0e8ea2d9a2bb..078992eee299 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -266,8 +266,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; -- cgit v1.2.3 From e3fdc89ca47ef34dfb6fd5101fec084c3dba5486 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Jan 2019 15:58:38 -0500 Subject: nfsd: Fix error return values for nfsd4_clone_file_range() If the parameter 'count' is non-zero, nfsd4_clone_file_range() will currently clobber all errors returned by vfs_clone_file_range() and replace them with EINVAL. Fixes: 42ec3d4c0218 ("vfs: make remap_file_range functions take and...") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9824e32b2f23..7dc98e14655d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -557,9 +557,11 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, loff_t cloned; cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (cloned < 0) + return nfserrno(cloned); if (count && cloned != count) - cloned = -EINVAL; - return nfserrno(cloned < 0 ? cloned : 0); + return nfserrno(-EINVAL); + return 0; } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, -- cgit v1.2.3