From 125892edfe69915a227d8d125ff0e1cd713178f4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 1 Jan 2019 18:54:26 +0900 Subject: inotify: Fix fd refcount leak in inotify_add_watch(). Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") forgot to call fdput() before bailing out. Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") CC: stable@vger.kernel.org Signed-off-by: Tetsuo Handa Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/inotify/inotify_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 105576daca4a..798f1253141a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -724,8 +724,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) - return -EINVAL; + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { + ret = -EINVAL; + goto fput_and_out; + } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { -- cgit v1.2.3 From de96e9fea7ba56042f105b6fe163447b280eb800 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Jan 2019 10:23:47 +0100 Subject: sysfs: convert BUG_ON to WARN_ON It's rude to crash the system just because the developer did something wrong, as it prevents them from usually even seeing what went wrong. So convert the few BUG_ON() calls that have snuck into the sysfs code over the years to WARN_ON() to make it more "friendly". All of these are able to be recovered from, so it makes no sense to crash. Reported-by: Linus Torvalds Reviewed-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 ++- fs/sysfs/file.c | 6 ++++-- fs/sysfs/group.c | 3 ++- fs/sysfs/symlink.c | 3 ++- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index feeae8081c22..aa85f2874a9f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,7 +43,8 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) kuid_t uid; kgid_t gid; - BUG_ON(!kobj); + if (WARN_ON(!kobj)) + return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index bb71db63c99c..51398457fe00 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -325,7 +325,8 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, kuid_t uid; kgid_t gid; - BUG_ON(!kobj || !kobj->sd || !attr); + if (WARN_ON(!kobj || !kobj->sd || !attr)) + return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, @@ -537,7 +538,8 @@ int sysfs_create_bin_file(struct kobject *kobj, kuid_t uid; kgid_t gid; - BUG_ON(!kobj || !kobj->sd || !attr); + if (WARN_ON(!kobj || !kobj->sd || !attr)) + return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true, diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1eb2d6307663..57038604d4a8 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -112,7 +112,8 @@ static int internal_create_group(struct kobject *kobj, int update, kgid_t gid; int error; - BUG_ON(!kobj || (!update && !kobj->sd)); + if (WARN_ON(!kobj || (!update && !kobj->sd))) + return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 215c225b2ca1..c4deecc80f67 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -23,7 +23,8 @@ static int sysfs_do_create_link_sd(struct kernfs_node *parent, { struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent); + if (WARN_ON(!name || !parent)) + return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. -- cgit v1.2.3 From 02b2f549d502b46e68b97ea1452fb8853b3327dd Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 18 Dec 2018 04:31:48 -0500 Subject: libceph: allow setting abort_on_full for rbd Introduce a new option abort_on_full, default to false. Then we can get -ENOSPC when the pool is full, or reaches quota. [ Don't show abort_on_full in /proc/mounts. ] Signed-off-by: Dongsheng Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 4 ++-- include/linux/ceph/libceph.h | 6 ++++-- include/linux/ceph/osd_client.h | 1 - net/ceph/ceph_common.c | 11 ++++++++++- net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 4 ++-- 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9a7cc488da..da2cd8e89062 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -530,7 +530,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_putc(m, ','); pos = m->count; - ret = ceph_print_client_options(m, fsc->client); + ret = ceph_print_client_options(m, fsc->client, false); if (ret) return ret; @@ -640,7 +640,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, opt = NULL; /* fsc->client now owns this */ fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->osdc.abort_on_full = true; + ceph_set_opt(fsc->client, ABORT_ON_FULL); if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 68bb09c29ce8..a420c07904bc 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */ #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */ +#define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) @@ -53,7 +54,7 @@ struct ceph_options { unsigned long osd_request_timeout; /* jiffies */ /* - * any type that can't be simply compared or doesn't need need + * any type that can't be simply compared or doesn't need * to be compared should go beyond this point, * ceph_compare_options() should be updated accordingly */ @@ -281,7 +282,8 @@ extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); -int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, + bool show_all); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 7a2af5034278..2294f963dab7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -354,7 +354,6 @@ struct ceph_osd_client { struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; - bool abort_on_full; /* abort w/ ENOSPC when full */ int abort_err; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 87afb9ec4c68..9cab80207ced 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -255,6 +255,7 @@ enum { Opt_nocephx_sign_messages, Opt_tcp_nodelay, Opt_notcp_nodelay, + Opt_abort_on_full, }; static match_table_t opt_tokens = { @@ -280,6 +281,7 @@ static match_table_t opt_tokens = { {Opt_nocephx_sign_messages, "nocephx_sign_messages"}, {Opt_tcp_nodelay, "tcp_nodelay"}, {Opt_notcp_nodelay, "notcp_nodelay"}, + {Opt_abort_on_full, "abort_on_full"}, {-1, NULL} }; @@ -535,6 +537,10 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags &= ~CEPH_OPT_TCP_NODELAY; break; + case Opt_abort_on_full: + opt->flags |= CEPH_OPT_ABORT_ON_FULL; + break; + default: BUG_ON(token); } @@ -549,7 +555,8 @@ out: } EXPORT_SYMBOL(ceph_parse_options); -int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, + bool show_all) { struct ceph_options *opt = client->options; size_t pos = m->count; @@ -574,6 +581,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) seq_puts(m, "nocephx_sign_messages,"); if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) seq_puts(m, "notcp_nodelay,"); + if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) + seq_puts(m, "abort_on_full,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 02952605d121..46f65709a6ff 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -375,7 +375,7 @@ static int client_options_show(struct seq_file *s, void *p) struct ceph_client *client = s->private; int ret; - ret = ceph_print_client_options(s, client); + ret = ceph_print_client_options(s, client, true); if (ret) return ret; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d23a9f81f3d7..fa9530dd876e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2315,7 +2315,7 @@ again: (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); - if (osdc->abort_on_full) { + if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2545,7 +2545,7 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { bool victims = false; - if (osdc->abort_on_full && + if (ceph_test_opt(osdc->client, ABORT_ON_FULL) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) for_each_request(osdc, abort_on_full_fn, &victims); } -- cgit v1.2.3 From c64a2b0516a02361d8deb1f038647c29020d0852 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sat, 5 Jan 2019 01:00:29 +0530 Subject: ceph: use vmf_error() in ceph_filemap_fault() This code is converted to use vmf_error(). Signed-off-by: Souptick Joarder Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5d0c05e288cc..a47c541f8006 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1494,10 +1494,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) if (err < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - if (err == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + ret = vmf_error(err); goto out_inline; } if (err < PAGE_SIZE) -- cgit v1.2.3 From e7c58097793ef15d58fadf190ee58738fbf447cd Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Jan 2019 15:23:32 -0800 Subject: hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race" This reverts c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 The reverted commit caused ABBA deadlocks when file migration raced with file eviction for specific hugetlbfs files. This was discovered with a modified version of the LTP move_pages12 test. The purpose of the reverted patch was to close a long existing race between hugetlbfs file truncation and page faults. After more analysis of the patch and impacted code, it was determined that i_mmap_rwsem can not be used for all required synchronization. Therefore, revert this patch while working an another approach to the underlying issue. Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Jan Stancek Cc: Michal Hocko Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: "Aneesh Kumar K . V" Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Davidlohr Bueso Cc: Prakash Sangappa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 61 ++++++++++++++++++++++++++++------------------------ mm/hugetlb.c | 21 +++++++++--------- 2 files changed, 44 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a2fcea5f8225..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. + * maps and global counts. Page faults can not race with truncation + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. - * - * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent - * races with page faults. - * + * pages are not modified. Page faults can race with hole punch. + * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; index = page->index; + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, index, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + /* - * A mapped page is impossible as callers should unmap - * all references before calling. And, i_mmap_rwsem - * prevents the creation of additional mappings. + * If page is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. */ - VM_BUG_ON(page_mapped(page)); + if (unlikely(page_mapped(page))) { + BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); + } lock_page(page); /* @@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } unlock_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, static void hugetlbfs_evict_inode(struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct resv_map *resv_map; - /* - * The vfs layer guarantees that there are no other users of this - * inode. Therefore, it would be safe to call remove_inode_hugepages - * without holding i_mmap_rwsem. We acquire and hold here to be - * consistent with other callers. Since there will be no contention - * on the semaphore, overhead is negligible. - */ - i_mmap_lock_write(mapping); remove_inode_hugepages(inode, 0, LLONG_MAX); - i_mmap_unlock_write(mapping); - resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); - remove_inode_hugepages(inode, offset, LLONG_MAX); i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } @@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); - remove_inode_hugepages(inode, hole_start, hole_end); i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); } @@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 745088810965..aedc1b183cf9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * Check once here for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + /* * Check for page in userfault range */ @@ -3854,6 +3854,9 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with file truncation. + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already be assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless -- cgit v1.2.3 From f7fa1107f30e13255fb9a5359d357e07d3721b0b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Jan 2019 11:42:54 +0000 Subject: Btrfs: fix race between cloning range ending at eof and writeback The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between writeback and cloning a range that covers the eof extent of the source file into a destination offset that is greater then the same file's size. This happens because we now wait for writeback to complete before doing the truncation of the eof block, while previously we did the truncation and then waited for writeback to complete. This leads to a race between writeback of the truncated block and cloning the file extents in the source range, because we copy each file extent item we find in the fs root into a buffer, then release the path and then increment the reference count for the extent referred in that file extent item we copied, which can no longer exist if writeback of the truncated eof block completes after we copied the file extent item into the buffer and before we incremented the reference count. This is illustrated by the following diagram: CPU 1 CPU 2 btrfs_clone_files() btrfs_cont_expand() btrfs_truncate_block() --> zeroes part of the page containg eof, marking it for delalloc btrfs_clone() --> finds extent item covering eof, points to extent at bytenr X --> copies it into a local buffer --> releases path writeback starts btrfs_finish_ordered_io() insert_reserved_file_extent() __btrfs_drop_extents() --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by waiting for writeback to complete after truncating the eof block. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fab9443f6a42..d0da86ac53bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3905,9 +3905,24 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; } /* -- cgit v1.2.3 From d8b5524242108cb7d28c9b8b9aded7c1edd0e8a8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Jan 2019 11:43:07 +0000 Subject: Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d0da86ac53bf..9c8e1734429c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3221,6 +3221,26 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + } + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { @@ -3242,11 +3262,12 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, return -EINVAL; /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; } @@ -3926,11 +3947,12 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_lock(src, off, inode, destoff, len); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. -- cgit v1.2.3 From a6d8654d885d7d79a3fb82da64eaa489ca332a82 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Jan 2019 11:44:41 +0000 Subject: Btrfs: fix deadlock when using free space tree due to block group creation When modifying the free space tree we can end up COWing one of its extent buffers which in turn might result in allocating a new chunk, which in turn can result in flushing (finish creation) of pending block groups. If that happens we can deadlock because creating a pending block group needs to update the free space tree, and if any of the updates tries to modify the same extent buffer that we are COWing, we end up in a deadlock since we try to write lock twice the same extent buffer. So fix this by skipping pending block group creation if we are COWing an extent buffer from the free space tree. This is a case missed by commit 5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches"). Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202173 Fixes: 5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches") CC: stable@vger.kernel.org # 4.18+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d92462fe66c8..f64aad613727 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1016,19 +1016,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; /* - * If we are COWing a node/leaf from the extent, chunk or device trees, - * make sure that we do not finish block group creation of pending block - * groups. We do this to avoid a deadlock. + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. * COWing can result in allocation of a new chunk, and flushing pending * block groups (btrfs_create_pending_block_groups()) can be triggered * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk and device trees, therefore we could - * deadlock with ourselves since we are holding a lock on an extent - * buffer that btrfs_create_pending_block_groups() may try to COW later. + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. */ if (root == fs_info->extent_root || root == fs_info->chunk_root || - root == fs_info->dev_root) + root == fs_info->dev_root || + root == fs_info->free_space_root) trans->can_flush_pending_bgs = false; cow = btrfs_alloc_tree_block(trans, root, parent_start, -- cgit v1.2.3 From 1b3922a8bc74231f9a767d1be6d9a061a4d4eeab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Jan 2019 14:08:18 +0800 Subject: btrfs: Use real device structure to verify dev extent [BUG] Linux v5.0-rc1 will fail fstests/btrfs/163 with the following kernel message: BTRFS error (device dm-6): dev extent devid 1 physical offset 13631488 len 8388608 is beyond device boundary 0 BTRFS error (device dm-6): failed to verify dev extents against chunks: -117 BTRFS error (device dm-6): open_ctree failed [CAUSE] Commit cf90d884b347 ("btrfs: Introduce mount time chunk <-> dev extent mapping check") introduced strict check on dev extents. We use btrfs_find_device() with dev uuid and fs uuid set to NULL, and only dependent on @devid to find the real device. For seed devices, we call clone_fs_devices() in open_seed_devices() to allow us search seed devices directly. However clone_fs_devices() just populates devices with devid and dev uuid, without populating other essential members, like disk_total_bytes. This makes any device returned by btrfs_find_device(fs_info, devid, NULL, NULL) is just a dummy, with 0 disk_total_bytes, and any dev extents on the seed device will not pass the device boundary check. [FIX] This patch will try to verify the device returned by btrfs_find_device() and if it's a dummy then re-search in seed devices. Fixes: cf90d884b347 ("btrfs: Introduce mount time chunk <-> dev extent mapping check") CC: stable@vger.kernel.org # 4.19+ Reported-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2576b1a379c9..3e4f8f88353e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7825,6 +7825,18 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + /* It's possible this device is a dummy for seed device */ + if (dev->disk_total_bytes == 0) { + dev = find_device(fs_info->fs_devices->seed, devid, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find seed devid %llu", + devid); + ret = -EUCLEAN; + goto out; + } + } + if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", -- cgit v1.2.3 From c2b8bd49d35a768d3966c5e14e8f6971f2a63439 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Howells --- fs/afs/server_list.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 95d0761cdb34..155dc14caef9 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -42,9 +42,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, if (vldb->fs_mask[i] & type_mask) nr_servers++; - slist = kzalloc(sizeof(struct afs_server_list) + - sizeof(struct afs_server_entry) * nr_servers, - GFP_KERNEL); + slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; -- cgit v1.2.3 From 5edc22cc1d33d6a88d175d25adc38d2a5cee134d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Set correct lock type for the yfs CreateFile A lock type of 0 is "LockRead", which makes the fileserver record an unintentional read lock on the new file. This will cause problems later on if the file is the subject of locking operations. The correct default value should be -1 ("LockNone"). Fix the operation marshalling code to set the value and provide an enum to symbolise the values whilst we're at it. Fixes: 30062bd13e36 ("afs: Implement YFS support in the fs client") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/protocol_yfs.h | 11 +++++++++++ fs/afs/yfsclient.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index 07bc10f076aa..d443e2bfa094 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -161,3 +161,14 @@ struct yfs_xdr_YFSStoreVolumeStatus { struct yfs_xdr_u64 max_quota; struct yfs_xdr_u64 file_quota; } __packed; + +enum yfs_lock_type { + yfs_LockNone = -1, + yfs_LockRead = 0, + yfs_LockWrite = 1, + yfs_LockExtend = 2, + yfs_LockRelease = 3, + yfs_LockMandatoryRead = 0x100, + yfs_LockMandatoryWrite = 0x101, + yfs_LockMandatoryExtend = 0x102, +}; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 12658c1363ae..5aa57929e8c2 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -803,7 +803,7 @@ int yfs_fs_create_file(struct afs_fs_cursor *fc, bp = xdr_encode_YFSFid(bp, &vnode->fid); bp = xdr_encode_string(bp, name, namesz); bp = xdr_encode_YFSStoreStatus_mode(bp, mode); - bp = xdr_encode_u32(bp, 0); /* ViceLockType */ + bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); afs_use_fs_server(call, fc->cbi); -- cgit v1.2.3 From 8428817dc40006dca0a531cfa06e89cb3b41790d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Jan 2019 15:25:29 +0300 Subject: cifs: Fix a debug message This debug message was never shown because it was checking for NULL returns but extract_hostname() returns error pointers. Fixes: 93d5cb517db3 ("cifs: Add support for failover in cifs_reconnect()") Signed-off-by: Dan Carpenter Signed-off-by: Steve French Reviewed-by: Paulo Alcantara --- fs/cifs/connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f66529679ca2..683310f26171 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -433,9 +433,10 @@ static void reconn_inval_dfs_target(struct TCP_Server_Info *server, kfree(server->hostname); server->hostname = extract_hostname(name); - if (!server->hostname) { - cifs_dbg(FYI, "%s: failed to extract hostname from target: %d\n", - __func__, -ENOMEM); + if (IS_ERR(server->hostname)) { + cifs_dbg(FYI, + "%s: failed to extract hostname from target: %ld\n", + __func__, PTR_ERR(server->hostname)); } } -- cgit v1.2.3 From c715f89c4dab76317c773df2611af2dac4dea2b7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Jan 2019 21:18:03 +0300 Subject: cifs: Fix a tiny potential memory leak The most recent "it" allocation is leaked on this error path. I believe that small allocations always succeed in current kernels so this doesn't really affect run time. Fixes: 54be1f6c1c37 ("cifs: Add DFS cache routines") Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/dfs_cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index cd63c4a70875..09b7d0d4f6e4 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -776,6 +776,7 @@ static int get_tgt_list(const struct dfs_cache_entry *ce, it->it_name = kstrndup(t->t_name, strlen(t->t_name), GFP_KERNEL); if (!it->it_name) { + kfree(it); rc = -ENOMEM; goto err_free_it; } -- cgit v1.2.3 From b983f7e92348d7e7d091db1b78b7915e9dd3d63a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 19 Dec 2018 22:49:09 +0000 Subject: CIFS: Fix adjustment of credits for MTU requests Currently for MTU requests we allocate maximum possible credits in advance and then adjust them according to the request size. While we were adjusting the number of credits belonging to the server, we were skipping adjustment of credits belonging to the request. This patch fixes it by setting request credits to CreditCharge field value of SMB2 packet header. Also ask 1 credit more for async read and write operations to increase parallelism and match the behavior of other operations. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e57f6aa1d638..b9d7891edaa1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3278,12 +3278,14 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + rdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } @@ -3555,12 +3557,14 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + wdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } -- cgit v1.2.3 From 33fa5c8b8a7dbe6353a56eaa654b790348890d42 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 3 Jan 2019 16:45:13 -0800 Subject: CIFS: Do not set credits to 1 if the server didn't grant anything Currently we reset the number of total credits granted by the server to 1 if the server didn't grant us anything int the response. This violates the SMB3 protocol - we need to trust the server and use the credit values from the response. Fix this by removing the corresponding code. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/transport.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 5be7302853b6..8e75d689be46 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -894,8 +894,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) if (midQ[i]->resp_buf) credits += ses->server->ops->get_credits(midQ[i]); - if (!credits) - credits = 1; for (i = 0; i < num_rqst; i++) { if (rc < 0) -- cgit v1.2.3 From 8544f4aa9dd19a04d1244dae10feecc813ccf175 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Dec 2018 12:40:05 -0800 Subject: CIFS: Fix credit computation for compounded requests In SMB3 protocol every part of the compound chain consumes credits individually, so we need to call wait_for_free_credits() for each of the PDUs in the chain. If an operation is interrupted, we must ensure we return all credits taken from the server structure back. Without this patch server can sometimes disconnect the session due to credit mismatches, especially when first operation(s) are large writes. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/transport.c | 59 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8e75d689be46..e047f06c9b4f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -795,7 +795,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, int i, j, rc = 0; int timeout, optype; struct mid_q_entry *midQ[MAX_COMPOUND]; - unsigned int credits = 0; + bool cancelled_mid[MAX_COMPOUND] = {false}; + unsigned int credits[MAX_COMPOUND] = {0}; char *buf; timeout = flags & CIFS_TIMEOUT_MASK; @@ -813,13 +814,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -ENOENT; /* - * Ensure that we do not send more than 50 overlapping requests - * to the same server. We may make this configurable later or - * use ses->maxReq. + * Ensure we obtain 1 credit per request in the compound chain. + * It can be optimized further by waiting for all the credits + * at once but this can wait long enough if we don't have enough + * credits due to some heavy operations in progress or the server + * not granting us much, so a fallback to the current approach is + * needed anyway. */ - rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) - return rc; + for (i = 0; i < num_rqst; i++) { + rc = wait_for_free_request(ses->server, timeout, optype); + if (rc) { + /* + * We haven't sent an SMB packet to the server yet but + * we already obtained credits for i requests in the + * compound chain - need to return those credits back + * for future use. Note that we need to call add_credits + * multiple times to match the way we obtained credits + * in the first place and to account for in flight + * requests correctly. + */ + for (j = 0; j < i; j++) + add_credits(ses->server, 1, optype); + return rc; + } + credits[i] = 1; + } /* * Make sure that we sign in the same order that we send on this socket @@ -835,8 +854,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ - add_credits(ses->server, 1, optype); + for (j = 0; j < num_rqst; j++) + add_credits(ses->server, credits[j], optype); return PTR_ERR(midQ[i]); } @@ -883,17 +904,16 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = DeleteMidQEntry; - spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, 1, optype); - return rc; + cancelled_mid[i] = true; } spin_unlock(&GlobalMid_Lock); } } for (i = 0; i < num_rqst; i++) - if (midQ[i]->resp_buf) - credits += ses->server->ops->get_credits(midQ[i]); + if (!cancelled_mid[i] && midQ[i]->resp_buf + && (midQ[i]->mid_state == MID_RESPONSE_RECEIVED)) + credits[i] = ses->server->ops->get_credits(midQ[i]); for (i = 0; i < num_rqst; i++) { if (rc < 0) @@ -901,8 +921,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ[i], ses->server); if (rc != 0) { - add_credits(ses->server, credits, optype); - return rc; + /* mark this mid as cancelled to not free it below */ + cancelled_mid[i] = true; + goto out; } if (!midQ[i]->resp_buf || @@ -949,9 +970,11 @@ out: * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ - for (i = 0; i < num_rqst; i++) - cifs_delete_mid(midQ[i]); - add_credits(ses->server, credits, optype); + for (i = 0; i < num_rqst; i++) { + if (!cancelled_mid[i]) + cifs_delete_mid(midQ[i]); + add_credits(ses->server, credits[i], optype); + } return rc; } -- cgit v1.2.3 From ee13919c2e8d1f904e035ad4b4239029a8994131 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 10 Jan 2019 11:27:28 -0800 Subject: CIFS: Do not hide EINTR after sending network packets Currently we hide EINTR code returned from sock_sendmsg() and return 0 instead. This makes a caller think that we successfully completed the network operation which is not true. Fix this by properly returning EINTR to callers. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e047f06c9b4f..aaff9c5aab3d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -387,7 +387,7 @@ smbd_done: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else + else if (rc > 0) rc = 0; return rc; -- cgit v1.2.3 From 15bc77f94e9f0e3cfd6972f3b84bb7aaa4c36b1b Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Tue, 8 Jan 2019 13:41:00 +0100 Subject: cifs: move large array from stack to heap This addresses some compile warnings that you can see depending on configuration settings. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 23 ++++++++++++++++------- fs/cifs/smb2pdu.c | 23 ++++++++++++++++------- 2 files changed, 32 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b1f49c1c543a..332f88d753d1 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -128,24 +128,31 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, int rc; struct dfs_cache_tgt_list tl; struct dfs_cache_tgt_iterator *it = NULL; - char tree[MAX_TREE_SIZE + 1]; + char *tree; const char *tcp_host; size_t tcp_host_len; const char *dfs_host; size_t dfs_host_len; + tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); + if (!tree) + return -ENOMEM; + if (tcon->ipc) { - snprintf(tree, sizeof(tree), "\\\\%s\\IPC$", + snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", tcon->ses->server->hostname); - return CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + goto out; } - if (!tcon->dfs_path) - return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); + if (!tcon->dfs_path) { + rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); + goto out; + } rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl); if (rc) - return rc; + goto out; extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, &tcp_host_len); @@ -165,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, continue; } - snprintf(tree, sizeof(tree), "\\%s", tgt); + snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -182,6 +189,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, rc = -ENOENT; } dfs_cache_free_tgts(&tl); +out: + kfree(tree); return rc; } #else diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b9d7891edaa1..50811a7dc0e0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -162,24 +162,31 @@ static int __smb2_reconnect(const struct nls_table *nlsc, int rc; struct dfs_cache_tgt_list tl; struct dfs_cache_tgt_iterator *it = NULL; - char tree[MAX_TREE_SIZE + 1]; + char *tree; const char *tcp_host; size_t tcp_host_len; const char *dfs_host; size_t dfs_host_len; + tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); + if (!tree) + return -ENOMEM; + if (tcon->ipc) { - snprintf(tree, sizeof(tree), "\\\\%s\\IPC$", + snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", tcon->ses->server->hostname); - return SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + goto out; } - if (!tcon->dfs_path) - return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc); + if (!tcon->dfs_path) { + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc); + goto out; + } rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl); if (rc) - return rc; + goto out; extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, &tcp_host_len); @@ -199,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc, continue; } - snprintf(tree, sizeof(tree), "\\%s", tgt); + snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -216,6 +223,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc, rc = -ENOENT; } dfs_cache_free_tgts(&tl); +out: + kfree(tree); return rc; } #else -- cgit v1.2.3 From 92a8109e4d3a34fb6b115c9098b51767dc933444 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 8 Jan 2019 18:30:56 +0000 Subject: cifs: Limit memory used by lock request calls to a page The code tries to allocate a contiguous buffer with a size supplied by the server (maxBuf). This could fail if memory is fragmented since it results in high order allocations for commonly used server implementations. It is also wasteful since there are probably few locks in the usual case. Limit the buffer to be no larger than a page to avoid memory allocation failures due to fragmentation. Signed-off-by: Ross Lagerwall Signed-off-by: Steve French --- fs/cifs/file.c | 8 ++++++++ fs/cifs/smb2file.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e3e3a7550205..673f948e4760 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1140,6 +1140,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); @@ -1478,6 +1482,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!max_buf) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 4ed10dd086e6..eff01ed6db0a 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -128,6 +128,8 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!max_buf) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) @@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { -- cgit v1.2.3 From b9a74cde94957d82003fb9f7ab4777938ca851cd Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 8 Jan 2019 18:30:57 +0000 Subject: cifs: Fix potential OOB access of lock element array If maxBuf is small but non-zero, it could result in a zero sized lock element array which we would then try and access OOB. Signed-off-by: Ross Lagerwall Signed-off-by: Steve French CC: Stable --- fs/cifs/file.c | 8 ++++---- fs/cifs/smb2file.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 673f948e4760..5b6f8392d9db 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1132,10 +1132,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) { free_xid(xid); return -EINVAL; } @@ -1476,10 +1476,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) return -EINVAL; BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index eff01ed6db0a..b204e84b87fb 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -122,10 +122,10 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < sizeof(struct smb2_lock_element)) return -EINVAL; BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); -- cgit v1.2.3 From 8a26f0f781f56d3016b34a2217e346973d067e7b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 3 Jan 2019 16:45:27 -0800 Subject: CIFS: Fix credits calculation for cancelled requests If a request is cancelled, we can't assume that the server returns 1 credit back. Instead we need to wait for a response and process the number of credits granted by the server. Create a separate mid callback for cancelled request, parse the number of credits in a response buffer and add them to the client's credits. If the didn't get a response (no response buffer available) assume 0 credits granted. The latter most probably happens together with session reconnect, so the client's credits are adjusted anyway. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/transport.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 01ded7038b19..770926877b7c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1438,6 +1438,7 @@ struct mid_q_entry { int mid_state; /* wish this were enum but can not pass to wait_event */ unsigned int mid_flags; __le16 command; /* smb command code */ + unsigned int optype; /* operation type */ bool large_buf:1; /* if valid response, is pointer to large buf */ bool multiRsp:1; /* multiple trans2 responses for one request */ bool multiEnd:1; /* both received */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index aaff9c5aab3d..450c43494324 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -787,6 +787,24 @@ cifs_noop_callback(struct mid_q_entry *mid) { } +static void +cifs_cancelled_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->server; + unsigned int optype = mid->optype; + unsigned int credits_received = 0; + + if (mid->mid_state == MID_RESPONSE_RECEIVED) { + if (mid->resp_buf) + credits_received = server->ops->get_credits(mid); + else + cifs_dbg(FYI, "Bad state for cancelled MID\n"); + } + + DeleteMidQEntry(mid); + add_credits(server, credits_received, optype); +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -862,6 +880,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } midQ[i]->mid_state = MID_REQUEST_SUBMITTED; + midQ[i]->optype = optype; /* * We don't invoke the callback compounds unless it is the last * request. @@ -896,15 +915,20 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) { rc = wait_for_response(ses->server, midQ[i]); - if (rc != 0) { + if (rc != 0) + break; + } + if (rc != 0) { + for (; i < num_rqst; i++) { cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(ses->server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->mid_flags |= MID_WAIT_CANCELLED; - midQ[i]->callback = DeleteMidQEntry; + midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; + credits[i] = 0; } spin_unlock(&GlobalMid_Lock); } -- cgit v1.2.3 From ee258d79159afed52ca9372aeb9c1a51e89b32ee Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 3 Jan 2019 15:53:10 -0800 Subject: CIFS: Move credit processing to mid callbacks for SMB3 Currently we account for credits in the thread initiating a request and waiting for a response. The demultiplex thread receives the response, wakes up the thread and the latter collects credits from the response buffer and add them to the server structure on the client. This approach is not accurate, because it may race with reconnect events in the demultiplex thread which resets the number of credits. Fix this by moving credit processing to new mid callbacks that collect credits granted by the server from the response in the demultiplex thread. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/transport.c | 51 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 450c43494324..202e0e84efdd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -783,12 +783,7 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) } static void -cifs_noop_callback(struct mid_q_entry *mid) -{ -} - -static void -cifs_cancelled_callback(struct mid_q_entry *mid) +cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; unsigned int optype = mid->optype; @@ -801,10 +796,23 @@ cifs_cancelled_callback(struct mid_q_entry *mid) cifs_dbg(FYI, "Bad state for cancelled MID\n"); } - DeleteMidQEntry(mid); add_credits(server, credits_received, optype); } +static void +cifs_compound_last_callback(struct mid_q_entry *mid) +{ + cifs_compound_callback(mid); + cifs_wake_up_task(mid); +} + +static void +cifs_cancelled_callback(struct mid_q_entry *mid) +{ + cifs_compound_callback(mid); + DeleteMidQEntry(mid); +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -882,11 +890,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->mid_state = MID_REQUEST_SUBMITTED; midQ[i]->optype = optype; /* - * We don't invoke the callback compounds unless it is the last - * request. + * Invoke callback for every part of the compound chain + * to calculate credits properly. Wake up this thread only when + * the last element is received. */ if (i < num_rqst - 1) - midQ[i]->callback = cifs_noop_callback; + midQ[i]->callback = cifs_compound_callback; + else + midQ[i]->callback = cifs_compound_last_callback; } cifs_in_send_inc(ses->server); rc = smb_send_rqst(ses->server, num_rqst, rqst, flags); @@ -900,8 +911,20 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, mutex_unlock(&ses->server->srv_mutex); - if (rc < 0) + if (rc < 0) { + /* Sending failed for some reason - return credits back */ + for (i = 0; i < num_rqst; i++) + add_credits(ses->server, credits[i], optype); goto out; + } + + /* + * At this point the request is passed to the network stack - we assume + * that any credits taken from the server structure on the client have + * been spent and we can't return them back. Once we receive responses + * we will collect credits granted by the server in the mid callbacks + * and add those credits to the server structure. + */ /* * Compounding is never used during session establish. @@ -934,11 +957,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } } - for (i = 0; i < num_rqst; i++) - if (!cancelled_mid[i] && midQ[i]->resp_buf - && (midQ[i]->mid_state == MID_RESPONSE_RECEIVED)) - credits[i] = ses->server->ops->get_credits(midQ[i]); - for (i = 0; i < num_rqst; i++) { if (rc < 0) goto out; @@ -997,7 +1015,6 @@ out: for (i = 0; i < num_rqst; i++) { if (!cancelled_mid[i]) cifs_delete_mid(midQ[i]); - add_credits(ses->server, credits[i], optype); } return rc; -- cgit v1.2.3 From 9a66396f1857cc1de06f4f4771797315e1a4ea56 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 8 Jan 2019 11:15:28 -0800 Subject: CIFS: Fix error paths in writeback code This patch aims to address writeback code problems related to error paths. In particular it respects EINTR and related error codes and stores and returns the first error occurred during writeback. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 19 +++++++++++++++++++ fs/cifs/cifssmb.c | 7 ++++--- fs/cifs/file.c | 29 +++++++++++++++++++++++------ fs/cifs/inode.c | 10 ++++++++++ 4 files changed, 56 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 770926877b7c..94dbdbe5be34 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1575,6 +1575,25 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, kfree(param); } +static inline bool is_interrupt_error(int error) +{ + switch (error) { + case -EINTR: + case -ERESTARTSYS: + case -ERESTARTNOHAND: + case -ERESTARTNOINTR: + return true; + } + return false; +} + +static inline bool is_retryable_error(int error) +{ + if (is_interrupt_error(error) || error == -EAGAIN) + return true; + return false; +} + #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 #define MID_REQUEST_SUBMITTED 2 diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 332f88d753d1..e18915415e13 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2123,7 +2123,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) for (j = 0; j < nr_pages; j++) { unlock_page(wdata2->pages[j]); - if (rc != 0 && rc != -EAGAIN) { + if (rc != 0 && !is_retryable_error(rc)) { SetPageError(wdata2->pages[j]); end_page_writeback(wdata2->pages[j]); put_page(wdata2->pages[j]); @@ -2132,7 +2132,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) if (rc) { kref_put(&wdata2->refcount, cifs_writedata_release); - if (rc == -EAGAIN) + if (is_retryable_error(rc)) continue; break; } @@ -2141,7 +2141,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata) i += nr_pages; } while (i < wdata->nr_pages); - mapping_set_error(inode->i_mapping, rc); + if (rc != 0 && !is_retryable_error(rc)) + mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5b6f8392d9db..2c7689f3998d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -733,7 +733,8 @@ reopen_success: if (can_flush) { rc = filemap_write_and_wait(inode->i_mapping); - mapping_set_error(inode->i_mapping, rc); + if (!is_interrupt_error(rc)) + mapping_set_error(inode->i_mapping, rc); if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, @@ -2118,6 +2119,7 @@ static int cifs_writepages(struct address_space *mapping, pgoff_t end, index; struct cifs_writedata *wdata; int rc = 0; + int saved_rc = 0; unsigned int xid; /* @@ -2146,8 +2148,10 @@ retry: rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, &wsize, &credits); - if (rc) + if (rc != 0) { + done = true; break; + } tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1; @@ -2155,6 +2159,7 @@ retry: &found_pages); if (!wdata) { rc = -ENOMEM; + done = true; add_credits_and_wake_if(server, credits, 0); break; } @@ -2183,7 +2188,7 @@ retry: if (rc != 0) { add_credits_and_wake_if(server, wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { - if (rc == -EAGAIN) + if (is_retryable_error(rc)) redirty_page_for_writepage(wbc, wdata->pages[i]); else @@ -2191,7 +2196,7 @@ retry: end_page_writeback(wdata->pages[i]); put_page(wdata->pages[i]); } - if (rc != -EAGAIN) + if (!is_retryable_error(rc)) mapping_set_error(mapping, rc); } kref_put(&wdata->refcount, cifs_writedata_release); @@ -2201,6 +2206,15 @@ retry: continue; } + /* Return immediately if we received a signal during writing */ + if (is_interrupt_error(rc)) { + done = true; + break; + } + + if (rc != 0 && saved_rc == 0) + saved_rc = rc; + wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) done = true; @@ -2218,6 +2232,9 @@ retry: goto retry; } + if (saved_rc != 0) + rc = saved_rc; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; @@ -2250,8 +2267,8 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) set_page_writeback(page); retry_write: rc = cifs_partialpagewrite(page, 0, PAGE_SIZE); - if (rc == -EAGAIN) { - if (wbc->sync_mode == WB_SYNC_ALL) + if (is_retryable_error(rc)) { + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) goto retry_write; redirty_page_for_writepage(wbc, page); } else if (rc != 0) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 13fb59aadebc..478003644916 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2257,6 +2257,11 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? */ rc = filemap_write_and_wait(inode->i_mapping); + if (is_interrupt_error(rc)) { + rc = -ERESTARTSYS; + goto out; + } + mapping_set_error(inode->i_mapping, rc); rc = 0; @@ -2400,6 +2405,11 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? */ rc = filemap_write_and_wait(inode->i_mapping); + if (is_interrupt_error(rc)) { + rc = -ERESTARTSYS; + goto cifs_setattr_exit; + } + mapping_set_error(inode->i_mapping, rc); rc = 0; -- cgit v1.2.3 From 48d2ba6257013676e57ff69444d5212031aee763 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 5 Jan 2019 19:46:35 -0600 Subject: cifs: update internal module version number To 2.16 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 26776eddd85d..d1f9c2f3f575 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.15" +#define CIFS_VERSION "2.16" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 04906b2f542c23626b0ef6219b808406f8dddbe9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Jan 2019 09:48:10 +0100 Subject: blockdev: Fix livelocks on loop device bd_set_size() updates also block device's block size. This is somewhat unexpected from its name and at this point, only blkdev_open() uses this functionality. Furthermore, this can result in changing block size under a filesystem mounted on a loop device which leads to livelocks inside __getblk_gfp() like: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 10863 Comm: syz-executor0 Not tainted 4.18.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__sanitizer_cov_trace_pc+0x3f/0x50 kernel/kcov.c:106 ... Call Trace: init_page_buffers+0x3e2/0x530 fs/buffer.c:904 grow_dev_page fs/buffer.c:947 [inline] grow_buffers fs/buffer.c:1009 [inline] __getblk_slow fs/buffer.c:1036 [inline] __getblk_gfp+0x906/0xb10 fs/buffer.c:1313 __bread_gfp+0x2d/0x310 fs/buffer.c:1347 sb_bread include/linux/buffer_head.h:307 [inline] fat12_ent_bread+0x14e/0x3d0 fs/fat/fatent.c:75 fat_ent_read_block fs/fat/fatent.c:441 [inline] fat_alloc_clusters+0x8ce/0x16e0 fs/fat/fatent.c:489 fat_add_cluster+0x7a/0x150 fs/fat/inode.c:101 __fat_get_block fs/fat/inode.c:148 [inline] ... Trivial reproducer for the problem looks like: truncate -s 1G /tmp/image losetup /dev/loop0 /tmp/image mkfs.ext4 -b 1024 /dev/loop0 mount -t ext4 /dev/loop0 /mnt losetup -c /dev/loop0 l /mnt Fix the problem by moving initialization of a block device block size into a separate function and call it when needed. Thanks to Tetsuo Handa for help with debugging the problem. Reported-by: syzbot+9933e4476f365f5d5a1b@syzkaller.appspotmail.com Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index c546cdce77e6..58a4c1217fa8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ @@ -1431,18 +1445,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_logical_block_size(bdev); - inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); } EXPORT_SYMBOL(bd_set_size); @@ -1519,8 +1524,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + set_init_blocksize(bdev); + } /* * If the device is invalidated, rescan partition @@ -1555,6 +1562,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) -- cgit v1.2.3 From 45ac486ecf2dc998e25cf32f0cabf2deaad875be Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 11 Jan 2019 19:04:44 -0500 Subject: NFSv4.2 fix unnecessary retry in nfs4_copy_file_range Currently nfs42_proc_copy_file_range() can not return EAGAIN. Fixes: e4648aa4f98a ("NFS recover from destination server reboot for copies") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 46d691ba04bc..45b2322e092d 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - ssize_t ret; - if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; -retry: - ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); - if (ret == -EAGAIN) - goto retry; - return ret; + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) -- cgit v1.2.3 From 97e1532ef81acb31c30f9e75bf00306c33a77812 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: handle zero sized retrieve correctly Dereferencing req->page_descs[0] will Oops if req->max_pages is zero. Reported-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Tested-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Fixes: b2430d7567a3 ("fuse: add per-page descriptor to fuse_req") Cc: # v3.9 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5e516a40e7a..fc29264011a6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1742,7 +1742,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; @@ -1757,6 +1756,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; -- cgit v1.2.3 From 8a3177db59cd644fde05ba9efee29392dfdec8aa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: cuse: fix ioctl cuse_process_init_reply() doesn't initialize fc->max_pages and thus all cuse bases ioctls fail with ENOMEM. Reported-by: Andreas Steinmetz Fixes: 5da784cce430 ("fuse: add max_pages to init_out") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 76baaa6be393..c2d4099429be 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -628,6 +628,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -1162,7 +1163,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); - fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.3 From 9509941e9c534920ccc4771ae70bd6cbbe79df1c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 12 Jan 2019 02:39:05 +0100 Subject: fuse: call pipe_buf_release() under pipe lock Some of the pipe_buf_release() handlers seem to assume that the pipe is locked - in particular, anon_pipe_buf_release() accesses pipe->tmp_page without taking any extra locks. From a glance through the callers of pipe_buf_release(), it looks like FUSE is the only one that calls pipe_buf_release() without having the pipe locked. This bug should only lead to a memory leak, nothing terrible. Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fc29264011a6..809c0f2f9942 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2077,8 +2077,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); + pipe_unlock(pipe); out: kvfree(bufs); -- cgit v1.2.3 From a2ebba824106dabe79937a9f29a875f837e1b6d4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: decrement NR_WRITEBACK_TEMP on the right page NR_WRITEBACK_TEMP is accounted on the temporary page in the request, not the page cache page. Fixes: 8b284dc47291 ("fuse: writepages: handle same page rewrites") Cc: # v3.13 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffaffe18352a..a59c16bd90ac 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1782,7 +1782,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); -- cgit v1.2.3 From 4882a27cec24319d10f95e978ecc80050e3e3e15 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Don't set vnode->cb_s_break in afs_validate() A cb_interest record is not necessarily attached to the vnode on entry to afs_validate(), which can cause an oops when we try to bring the vnode's cb_s_break up to date in the default case (ie. no current callback promise and the vnode has not been deleted). Fix this by simply removing the line, as vnode->cb_s_break will be set when needed by afs_register_server_cb_interest() when we next get a callback promise from RPC call. The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 ... RIP: 0010:afs_validate+0x66/0x250 [kafs] ... Call Trace: afs_d_revalidate+0x8d/0x340 [kafs] ? __d_lookup+0x61/0x150 lookup_dcache+0x44/0x70 ? lookup_dcache+0x44/0x70 __lookup_hash+0x24/0xa0 do_unlinkat+0x11d/0x2c0 __x64_sys_unlink+0x23/0x30 do_syscall_64+0x4d/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: ae3b7361dc0e ("afs: Fix validation/callback interaction") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b17d3620414..211343831c30 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -414,7 +414,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; } else { - vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } -- cgit v1.2.3 From 59d49076ae3e6912e6d7df2fd68e2337f3d02036 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Fix key refcounting in file locking code Fix the refcounting of the authentication keys in the file locking code. The vnode->lock_key member points to a key on which it expects to be holding a ref, but it isn't always given an extra ref, however. Fixes: 0fafdc9f888b ("afs: Fix file locking") Signed-off-by: David Howells --- fs/afs/flock.c | 4 ++-- fs/afs/inode.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 0568fd986821..e432bd27a2e7 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -208,7 +208,7 @@ again: /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; goto again; @@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 211343831c30..1a4ce07fb406 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -545,6 +545,8 @@ void afs_evict_inode(struct inode *inode) #endif afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->lock_key); + vnode->lock_key = NULL; _leave(""); } -- cgit v1.2.3 From 7a75b0079a1d54e342c502c3c8107ba97e05d3d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Jan 2019 15:14:29 +0000 Subject: afs: Provide a function to get a ref on a call Provide a function to get a reference on an afs_call struct. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a7b44863d502..4830e0a6bf1d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -203,20 +203,26 @@ void afs_put_call(struct afs_call *call) } } +static struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int u = atomic_inc_return(&call->usage); + + trace_afs_call(call, why, u, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - int u = atomic_inc_return(&call->usage); - - trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); - INIT_WORK(&call->work, call->type->work); + afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); } -- cgit v1.2.3 From 34fa47612bfe5d7de7fcaf658a6952b6aeec3b13 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Jan 2019 15:40:50 +0000 Subject: afs: Fix race in async call refcounting There's a race between afs_make_call() and afs_wake_up_async_call() in the case that an error is returned from rxrpc_kernel_send_data() after it has queued the final packet. afs_make_call() will try and clean up the mess, but the call state may have been moved on thereby causing afs_process_async_call() to also try and to delete the call. Fix this by: (1) Getting an extra ref for an asynchronous call for the call itself to hold. This makes sure the call doesn't evaporate on us accidentally and will allow the call to be retained by the caller in a future patch. The ref is released on leaving afs_make_call() or afs_wait_for_call_to_complete(). (2) In the event of an error from rxrpc_kernel_send_data(): (a) Don't set the call state to AFS_CALL_COMPLETE until *after* the call has been aborted and ended. This prevents afs_deliver_to_call() from doing anything with any notifications it gets. (b) Explicitly end the call immediately to prevent further callbacks. (c) Cancel any queued async_work and wait for the work if it's executing. This allows us to be sure the race won't recur when we change the state. We put the work queue's ref on the call if we managed to cancel it. (d) Put the call's ref that we got in (1). This belongs to us as long as the call is in state AFS_CALL_CL_REQUESTING. Fixes: 341f741f04be ("afs: Refcount the afs_call struct") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 35 ++++++++++++++++++++++++++++++----- include/trace/events/afs.h | 2 ++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 4830e0a6bf1d..2c588f9bbbda 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -23,6 +23,7 @@ struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_delete_async_call(struct work_struct *); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); @@ -404,6 +405,12 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, } } + /* If the call is going to be asynchronous, we need an extra ref for + * the call to hold itself so the caller need not hang on to its ref. + */ + if (call->async) + afs_get_call(call, afs_call_trace_get); + /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, @@ -444,15 +451,17 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, goto error_do_abort; } - /* at this point, an async call may no longer exist as it may have - * already completed */ - if (call->async) + /* Note that at this point, we may have received the reply or an abort + * - and an asynchronous call may already have completed. + */ + if (call->async) { + afs_put_call(call); return -EINPROGRESS; + } return afs_wait_for_call_to_complete(call, ac); error_do_abort: - call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); @@ -469,8 +478,24 @@ error_do_abort: error_kill_call: if (call->type->done) call->type->done(call); - afs_put_call(call); + + /* We need to dispose of the extra ref we grabbed for an async call. + * The call, however, might be queued on afs_async_calls and we need to + * make sure we don't get any more notifications that might requeue it. + */ + if (call->rxcall) { + rxrpc_kernel_end_call(call->net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->async) { + if (cancel_work_sync(&call->async_work)) + afs_put_call(call); + afs_put_call(call); + } + ac->error = ret; + call->state = AFS_CALL_COMPLETE; + afs_put_call(call); _leave(" = %d", ret); return ret; } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 33d291888ba9..e3f005eae1f7 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -25,6 +25,7 @@ enum afs_call_trace { afs_call_trace_alloc, afs_call_trace_free, + afs_call_trace_get, afs_call_trace_put, afs_call_trace_wake, afs_call_trace_work, @@ -159,6 +160,7 @@ enum afs_file_error { #define afs_call_traces \ EM(afs_call_trace_alloc, "ALLOC") \ EM(afs_call_trace_free, "FREE ") \ + EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_put, "PUT ") \ EM(afs_call_trace_wake, "WAKE ") \ E_(afs_call_trace_work, "WORK ") -- cgit v1.2.3 From 6a4c9ab13feeacd3072175d7d1f1fcfabbb9fc90 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Thu, 17 Jan 2019 09:09:29 -0800 Subject: pstore/ram: Fix console ramoops to show the previous boot logs commit b05c950698fe ("pstore/ram: Simplify ramoops_get_next_prz() arguments") changed update assignment in getting next persistent ram zone by adding a check for record type. But the check always returns true since the record type is assigned 0. And this breaks console ramoops by showing current console log instead of previous log on warm reset and hard reset (actually hard reset should not be showing any logs). Fix this by having persistent ram zone type check instead of record type check. Tested this on SDM845 MTP and dragonboard 410c. Reproducing this issue is simple as below: 1. Trigger hard reset and mount pstore. Will see console-ramoops record in the mounted location which is the current log. 2. Trigger warm reset and mount pstore. Will see the current console-ramoops record instead of previous record. Fixes: b05c950698fe ("pstore/ram: Simplify ramoops_get_next_prz() arguments") Signed-off-by: Sai Prakash Ranjan Acked-by: Joel Fernandes (Google) [kees: dropped local variable usage] Signed-off-by: Kees Cook --- fs/pstore/ram.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 96f7d32cd184..076e26fdc0c0 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -128,7 +128,6 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, struct pstore_record *record) { struct persistent_ram_zone *prz; - bool update = (record->type == PSTORE_TYPE_DMESG); /* Give up if we never existed or have hit the end. */ if (!przs) @@ -139,7 +138,7 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, return NULL; /* Update old/shadowed buffer. */ - if (update) + if (prz->type == PSTORE_TYPE_DMESG) persistent_ram_save_old(prz); if (!persistent_ram_old_size(prz)) -- cgit v1.2.3 From 77b7aad195099e7c6da11e94b7fa6ef5e6fb0025 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Jan 2019 15:02:23 +0100 Subject: Revert "btrfs: balance dirty metadata pages in btrfs_finish_ordered_io" This reverts commit e73e81b6d0114d4a303205a952ab2e87c44bd279. This patch causes a few problems: - adds latency to btrfs_finish_ordered_io - as btrfs_finish_ordered_io is used for free space cache, generating more work from btrfs_btree_balance_dirty_nodelay could end up in the same workque, effectively deadlocking 12260 kworker/u96:16+btrfs-freespace-write D [<0>] balance_dirty_pages+0x6e6/0x7ad [<0>] balance_dirty_pages_ratelimited+0x6bb/0xa90 [<0>] btrfs_finish_ordered_io+0x3da/0x770 [<0>] normal_work_helper+0x1c5/0x5a0 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x46/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Transaction commit will wait on the freespace cache: 838 btrfs-transacti D [<0>] btrfs_start_ordered_extent+0x154/0x1e0 [<0>] btrfs_wait_ordered_range+0xbd/0x110 [<0>] __btrfs_wait_cache_io+0x49/0x1a0 [<0>] btrfs_write_dirty_block_groups+0x10b/0x3b0 [<0>] commit_cowonly_roots+0x215/0x2b0 [<0>] btrfs_commit_transaction+0x37e/0x910 [<0>] transaction_kthread+0x14d/0x180 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff And then writepages ends up waiting on transaction commit: 9520 kworker/u96:13+flush-btrfs-1 D [<0>] wait_current_trans+0xac/0xe0 [<0>] start_transaction+0x21b/0x4b0 [<0>] cow_file_range_inline+0x10b/0x6b0 [<0>] cow_file_range.isra.69+0x329/0x4a0 [<0>] run_delalloc_range+0x105/0x3c0 [<0>] writepage_delalloc+0x119/0x180 [<0>] __extent_writepage+0x10c/0x390 [<0>] extent_write_cache_pages+0x26f/0x3d0 [<0>] extent_writepages+0x4f/0x80 [<0>] do_writepages+0x17/0x60 [<0>] __writeback_single_inode+0x59/0x690 [<0>] writeback_sb_inodes+0x291/0x4e0 [<0>] __writeback_inodes_wb+0x87/0xb0 [<0>] wb_writeback+0x3bb/0x500 [<0>] wb_workfn+0x40d/0x610 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x1e0/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Eventually, we have every process in the system waiting on balance_dirty_pages(), and nobody is able to make progress on page writeback. The original patch tried to fix an OOM condition, that happened on 4.4 but no success reproducing that on later kernels (4.19 and 4.20). This is more likely a problem in OOM itself. Link: https://lore.kernel.org/linux-btrfs/20180528054821.9092-1-ethanlien@synology.com/ Reported-by: Chris Mason CC: stable@vger.kernel.org # 4.18+ CC: ethanlien Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43eb4535319d..b6025b5d0b25 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,9 +3129,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } -- cgit v1.2.3 From 31890da0bfdd24b135a258404b93c58a65510c7a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Nov 2018 14:05:41 -0500 Subject: btrfs: handle delayed ref head accounting cleanup in abort We weren't doing any of the accounting cleanup when we aborted transactions. Fix this by making cleanup_ref_head_accounting global and calling it from the abort code, this fixes the issue where our accounting was all wrong after the fs aborts. The test generic/475 on a 2G VM can trigger the problems eg.: [ 8502.136957] WARNING: CPU: 0 PID: 11064 at fs/btrfs/extent-tree.c:5986 btrfs_free_block_grou +ps+0x3dc/0x410 [btrfs] [ 8502.148372] CPU: 0 PID: 11064 Comm: umount Not tainted 5.0.0-rc1-default+ #394 [ 8502.150807] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626 +cc-prebuilt.qemu-project.org 04/01/2014 [ 8502.154317] RIP: 0010:btrfs_free_block_groups+0x3dc/0x410 [btrfs] [ 8502.160623] RSP: 0018:ffffb1ab84b93de8 EFLAGS: 00010206 [ 8502.161906] RAX: 0000000001000000 RBX: ffff9f34b1756400 RCX: 0000000000000000 [ 8502.163448] RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff9f34b1755400 [ 8502.164906] RBP: ffff9f34b7e8c000 R08: 0000000000000001 R09: 0000000000000000 [ 8502.166716] R10: 0000000000000000 R11: 0000000000000001 R12: ffff9f34b7e8c108 [ 8502.168498] R13: ffff9f34b7e8c158 R14: 0000000000000000 R15: dead000000000100 [ 8502.170296] FS: 00007fb1cf15ffc0(0000) GS:ffff9f34bd400000(0000) knlGS:0000000000000000 [ 8502.172439] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8502.173669] CR2: 00007fb1ced507b0 CR3: 000000002f7a6000 CR4: 00000000000006f0 [ 8502.175094] Call Trace: [ 8502.175759] close_ctree+0x17f/0x350 [btrfs] [ 8502.176721] generic_shutdown_super+0x64/0x100 [ 8502.177702] kill_anon_super+0x14/0x30 [ 8502.178607] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8502.179602] deactivate_locked_super+0x29/0x60 [ 8502.180595] cleanup_mnt+0x3b/0x70 [ 8502.181406] task_work_run+0x98/0xc0 [ 8502.182255] exit_to_usermode_loop+0x83/0x90 [ 8502.183113] do_syscall_64+0x15b/0x180 [ 8502.183919] entry_SYSCALL_64_after_hwframe+0x49/0xbe Corresponding to release_global_block_rsv() { ... WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); CC: stable@vger.kernel.org Signed-off-by: Josef Bacik [ add log dump ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 12 +++++------- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f031a447a047..34019c8b6158 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -2664,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8da2f380d3c0..469a51b4b273 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4265,6 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b15afeae16df..6f6fae410fc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { @@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -7188,7 +7186,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; -- cgit v1.2.3 From 74d5d229b1bf60f93bff244b2dfc0eb21ec32a07 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Nov 2018 14:05:45 -0500 Subject: btrfs: wait on ordered extents on abort cleanup If we flip read-only before we initiate writeback on all dirty pages for ordered extents we've created then we'll have ordered extents left over on umount, which results in all sorts of bad things happening. Fix this by making sure we wait on ordered extents if we have to do the aborted transaction cleanup stuff. generic/475 can produce this warning: [ 8531.177332] WARNING: CPU: 2 PID: 11997 at fs/btrfs/disk-io.c:3856 btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.183282] CPU: 2 PID: 11997 Comm: umount Tainted: G W 5.0.0-rc1-default+ #394 [ 8531.185164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8531.187851] RIP: 0010:btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.193082] RSP: 0018:ffffb1ab86163d98 EFLAGS: 00010286 [ 8531.194198] RAX: ffff9f3449494d18 RBX: ffff9f34a2695000 RCX:0000000000000000 [ 8531.195629] RDX: 0000000000000002 RSI: 0000000000000001 RDI:0000000000000000 [ 8531.197315] RBP: ffff9f344e930000 R08: 0000000000000001 R09:0000000000000000 [ 8531.199095] R10: 0000000000000000 R11: ffff9f34494d4ff8 R12:ffffb1ab86163dc0 [ 8531.200870] R13: ffff9f344e9300b0 R14: ffffb1ab86163db8 R15:0000000000000000 [ 8531.202707] FS: 00007fc68e949fc0(0000) GS:ffff9f34bd800000(0000)knlGS:0000000000000000 [ 8531.204851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8531.205942] CR2: 00007ffde8114dd8 CR3: 000000002dfbd000 CR4:00000000000006e0 [ 8531.207516] Call Trace: [ 8531.208175] btrfs_free_fs_roots+0xdb/0x170 [btrfs] [ 8531.210209] ? wait_for_completion+0x5b/0x190 [ 8531.211303] close_ctree+0x157/0x350 [btrfs] [ 8531.212412] generic_shutdown_super+0x64/0x100 [ 8531.213485] kill_anon_super+0x14/0x30 [ 8531.214430] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8531.215539] deactivate_locked_super+0x29/0x60 [ 8531.216633] cleanup_mnt+0x3b/0x70 [ 8531.217497] task_work_run+0x98/0xc0 [ 8531.218397] exit_to_usermode_loop+0x83/0x90 [ 8531.219324] do_syscall_64+0x15b/0x180 [ 8531.220192] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 8531.221286] RIP: 0033:0x7fc68e5e4d07 [ 8531.225621] RSP: 002b:00007ffde8116608 EFLAGS: 00000246 ORIG_RAX:00000000000000a6 [ 8531.227512] RAX: 0000000000000000 RBX: 00005580c2175970 RCX:00007fc68e5e4d07 [ 8531.229098] RDX: 0000000000000001 RSI: 0000000000000000 RDI:00005580c2175b80 [ 8531.230730] RBP: 0000000000000000 R08: 00005580c2175ba0 R09:00007ffde8114e80 [ 8531.232269] R10: 0000000000000000 R11: 0000000000000246 R12:00005580c2175b80 [ 8531.233839] R13: 00007fc68eac61c4 R14: 00005580c2175a68 R15:0000000000000000 Leaving a tree in the rb-tree: 3853 void btrfs_free_fs_root(struct btrfs_root *root) 3854 { 3855 iput(root->ino_cache_inode); 3856 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); CC: stable@vger.kernel.org Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add stacktrace ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 469a51b4b273..18eefc5b2532 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4201,6 +4201,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, -- cgit v1.2.3 From 3ec9a4c81c8cc2a8d9673588dd84d9cc7c31019b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Jan 2019 10:21:01 -0500 Subject: btrfs: run delayed iputs before committing Delayed iputs means we can have final iputs of deleted inodes in the queue, which could potentially generate a lot of pinned space that could be free'd. So before we decide to commit the transaction for ENOPSC reasons, run the delayed iputs so that any potential space is free'd up. If there is and we freed enough we can then commit the transaction and potentially be able to make our reservation. Reviewed-by: Omar Sandoval Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6f6fae410fc1..d81035b7ea7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4952,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + mutex_lock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = may_commit_transaction(fs_info, space_info); break; default: -- cgit v1.2.3 From fd340d0f68cc87badfc9efcb226f23a5428826a0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Jan 2019 10:21:02 -0500 Subject: btrfs: wakeup cleaner thread when adding delayed iput The cleaner thread usually takes care of delayed iputs, with the exception of the btrfs_end_transaction_throttle path. Delaying iputs means we are potentially delaying the eviction of an inode and it's respective space. The cleaner thread only gets woken up every 30 seconds, or when we require space. If there are a lot of inodes that need to be deleted we could induce a serious amount of latency while we wait for these inodes to be evicted. So instead wakeup the cleaner if it's not already awake to process any new delayed iputs we add to the list. If we suddenly need space we will less likely be backed up behind a bunch of inodes that are waiting to be deleted, and we could possibly free space before we need to get into the flushing logic which will save us some latency. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 3 +++ fs/btrfs/inode.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 34019c8b6158..8b1d06fa222d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -787,6 +787,9 @@ enum { * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, }; struct btrfs_fs_info { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 18eefc5b2532..6a2a2a951705 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg) while (1) { again = 0; + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; @@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6025b5d0b25..5c349667c761 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3251,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode) ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 5631e8576a3caf606cdc375f97425a67983b420c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 20 Jan 2019 14:33:34 -0800 Subject: pstore/ram: Avoid allocation and leak of platform data Yue Hu noticed that when parsing device tree the allocated platform data was never freed. Since it's not used beyond the function scope, this switches to using a stack variable instead. Reported-by: Yue Hu Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/pstore/ram.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 076e26fdc0c0..898c8321b343 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -710,18 +710,15 @@ static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = dev->platform_data; + struct ramoops_platform_data pdata_local; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; if (dev_of_node(dev) && !pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - pr_err("cannot allocate platform data buffer\n"); - err = -ENOMEM; - goto fail_out; - } + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); err = ramoops_parse_dt(pdev, pdata); if (err < 0) -- cgit v1.2.3 From d95e674c01cfb5461e8b9fdeebf6d878c9b80b2f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 10 Jan 2019 15:41:09 +0800 Subject: ceph: clear inode pointer when snap realm gets dropped by its inode snap realm and corresponding inode have pointers to each other. The two pointer should get clear at the same time. Otherwise, snap realm's pointer may reference freed inode. Cc: stable@vger.kernel.org # 4.17+ Signed-off-by: "Yan, Zheng" Reviewed-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94c026bba2c2..bba28a5034ba 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1035,6 +1035,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci) list_del_init(&ci->i_snap_realm_item); ci->i_snap_realm_counter++; ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, realm); -- cgit v1.2.3 From 74827ee29565f86e2a64495a5e3e58d3371d74ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jan 2019 00:14:22 +0100 Subject: ceph: quota: cleanup license mess Precise and non-ambiguous license information is important. The recently added quota.c file has a SPDX license identifier, which is nice, but at the same time it has a contradictionary license boiler plate text. SPDX-License-Identifier: GPL-2.0 versus * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. Oh well. As the other ceph related files are licensed under the GPL v2 only, it's assumed that the SPDX id is correct and the boiler plate was randomly copied into that patch. Remove the boiler plate as it is wrong and even if correct it is redundant. Fixes: fb18a57568c2 ("ceph: quota: add initial infrastructure to support cephfs quotas") Signed-off-by: Thomas Gleixner Cc: Luis Henriques Cc: Jiri Kosina Cc: "Yan, Zheng" Cc: Sage Weil Cc: Ilya Dryomov Cc: ceph-devel@vger.kernel.org Acked-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 03f4d24db8fe..9455d3aef0c3 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -3,19 +3,6 @@ * quota.c - CephFS quota * * Copyright (C) 2017-2018 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include -- cgit v1.2.3 From 8b9433eb4de3c26a9226c981c283f9f4896ae030 Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Mon, 8 Oct 2018 20:58:23 -0300 Subject: direct-io: allow direct writes to empty inodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a DIO_SKIP_HOLES filesystem, the ->get_block() method is currently not allowed to create blocks for an empty inode. This confusion comes from trying to bit shift a negative number, so check the size of the inode first. The problem is most visible for hfsplus, because the fallback to buffered I/O doesn't happen and the write fails with EIO. This is in part the fault of the module, because it gives a wrong return value on ->get_block(); that will be fixed in a separate patch. Reviewed-by: Jeff Moyer Reviewed-by: Jan Kara Signed-off-by: Ernesto A. Fernández Signed-off-by: Jens Axboe --- fs/direct-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index dbc1a1f080ce..ec2fb6fe6d37 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> - i_blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } -- cgit v1.2.3 From 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Dec 2017 08:38:30 -0800 Subject: writeback: synchronize sync(2) against cgroup writeback membership switches sync_inodes_sb() can race against cgwb (cgroup writeback) membership switches and fail to writeback some inodes. For example, if an inode switches to another wb while sync_inodes_sb() is in progress, the new wb might not be visible to bdi_split_work_to_wbs() at all or the inode might jump from a wb which hasn't issued writebacks yet to one which already has. This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb switch path against sync_inodes_sb() so that sync_inodes_sb() is guaranteed to see all the target wbs and inodes can't jump wbs to escape syncing. v2: Fixed misplaced rwsem init. Spotted by Jiufei. Signed-off-by: Tejun Heo Reported-by: Jiufei Xue Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++++++++++-- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 1 + 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b40168fcc94a..36855c1f8daf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct page *page; bool switched = false; + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -428,6 +445,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c31157135598..07e02d6df5ad 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -190,6 +190,7 @@ struct backing_dev_info { struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ + struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a8bb8796c6c..72e6d0c55cfa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { -- cgit v1.2.3 From 73aaf920cc72024c4a4460cfa46d56e5014172f3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 16 Jan 2019 16:28:59 +0000 Subject: cifs: fix memory leak of an allocated cifs_ntsd structure The call to SMB2_queary_acl can allocate memory to pntsd and also return a failure via a call to SMB2_query_acl (and then query_info). This occurs when query_info allocates the structure and then in query_info the call to smb2_validate_and_copy_iov fails. Currently the failure just returns without kfree'ing pntsd hence causing a memory leak. Currently, *data is allocated if it's not already pointing to a buffer, so it needs to be kfree'd only if was allocated in query_info, so the fix adds an allocated flag to track this. Also set *dlen to zero on an error just to be safe since *data is kfree'd. Also set errno to -ENOMEM if the allocation of *data fails. Signed-off-by: Colin Ian King Signed-off-by: Steve French Reviewed-by: Dan Carpener --- fs/cifs/smb2pdu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 50811a7dc0e0..0af87bd0dc49 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2816,6 +2816,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; int flags = 0; + bool allocated = false; cifs_dbg(FYI, "Query Info\n"); @@ -2855,14 +2856,21 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, "Error %d allocating memory for acl\n", rc); *dlen = 0; + rc = -ENOMEM; goto qinf_exit; } + allocated = true; } } rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, min_len, *data); + if (rc && allocated) { + kfree(*data); + *data = NULL; + *dlen = 0; + } qinf_exit: SMB2_query_info_free(&rqst); -- cgit v1.2.3 From acc58d0bab55a50e02c25f00bd6a210ee121595f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 08:21:24 -0800 Subject: CIFS: Fix possible hang during async MTU reads and writes When doing MTU i/o we need to leave some credits for possible reopen requests and other operations happening in parallel. Currently we leave 1 credit which is not enough even for reopen only: we need at least 2 credits if durable handle reconnect fails. Also there may be other operations at the same time including compounding ones which require 3 credits at a time each. Fix this by leaving 8 credits which is big enough to cover most scenarios. Was able to reproduce this when server was configured to give out fewer credits than usual. The proper fix would be to reconnect a file handle first and then obtain credits for an MTU request but this leads to bigger code changes and should happen in other patches. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cf7eb891804f..1d3ce127b02e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -165,14 +165,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); -- cgit v1.2.3 From b0b2cac7e244629e1a84a26d7eabb885fed7ff68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jan 2019 00:14:23 +0100 Subject: smb3: Cleanup license mess Precise and non-ambiguous license information is important. The recently added aegis header file has a SPDX license identifier, which is nice, but at the same time it has a contradictionary license boiler plate text. SPDX-License-Identifier: GPL-2.0 versus * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. Oh well. Assuming that the SPDX identifier is correct and according to x86/hyper-v contributions from Microsoft GPL V2 only is the usual license. Remove the boiler plate as it is wrong and even if correct it is redundant. Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging") Signed-off-by: Thomas Gleixner Cc: Steve French Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/trace.c | 10 ---------- fs/cifs/trace.h | 10 ---------- 2 files changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c index bd4a546feec1..465483787193 100644 --- a/fs/cifs/trace.c +++ b/fs/cifs/trace.c @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index fb049809555f..59be48206932 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cifs -- cgit v1.2.3 From ef68e831840c40c7d01b328b3c0f5d8c4796c232 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 17:25:36 -0800 Subject: CIFS: Do not reconnect TCP session in add_credits() When executing add_credits() we currently call cifs_reconnect() if the number of credits is zero and there are no requests in flight. In this case we may call cifs_reconnect() recursively twice and cause memory corruption given the following sequence of functions: mid1.callback() -> add_credits() -> cifs_reconnect() -> -> mid2.callback() -> add_credits() -> cifs_reconnect(). Fix this by avoiding to call cifs_reconnect() in add_credits() and checking for zero credits in the demultiplex thread. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/connect.c | 21 +++++++++++++++++++++ fs/cifs/smb2ops.c | 32 +++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 683310f26171..8463c940e0e5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -720,6 +720,21 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +static inline bool +zero_credits(struct TCP_Server_Info *server) +{ + int val; + + spin_lock(&server->req_lock); + val = server->credits + server->echo_credits + server->oplock_credits; + if (server->in_flight == 0 && val == 0) { + spin_unlock(&server->req_lock); + return true; + } + spin_unlock(&server->req_lock); + return false; +} + static int cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { @@ -732,6 +747,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); + /* reconnect if no credits and no requests in flight */ + if (zero_credits(server)) { + cifs_reconnect(server); + return -ECONNABORTED; + } + if (server_unresponsive(server)) return -ECONNABORTED; if (cifs_rdma_enabled(server) && server->smbd_conn) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1d3ce127b02e..fa8d2e1076c8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -34,6 +34,7 @@ #include "cifs_ioctl.h" #include "smbdirect.h" +/* Change credits for different ops and return the total number of credits */ static int change_conf(struct TCP_Server_Info *server) { @@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server) server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: - return -1; + return 0; case 1: server->echoes = false; server->oplocks = false; - cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server) server->echo_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; - return 0; + return server->credits + server->echo_credits + server->oplock_credits; } static void smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, const int optype) { - int *val, rc = 0; + int *val, rc = -1; + spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); @@ -101,8 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, } spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (rc) - cifs_reconnect(server); + + if (server->tcpStatus == CifsNeedReconnect) + return; + + switch (rc) { + case -1: + /* change_conf hasn't been executed */ + break; + case 0: + cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + break; + case 1: + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); + } } static void -- cgit v1.2.3 From 8004c78c68e894e4fd5ac3c22cc22eb7dc24cabc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 15:29:26 -0800 Subject: CIFS: Fix credits calculations for reads with errors Currently we mark MID as malformed if we get an error from server in a read response. This leads to not properly processing credits in the readv callback. Fix this by marking such a response as normal received response and process it appropriately. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e18915415e13..bb54ccf8481c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1549,18 +1549,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) } static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) { int length; - struct cifs_readdata *rdata = mid->callback_data; length = cifs_discard_remaining_data(server); - dequeue_mid(mid, rdata->result); + dequeue_mid(mid, malformed); mid->resp_buf = server->smallbuf; server->smallbuf = NULL; return length; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1602,12 +1610,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; } + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - return cifs_readv_discard(server, mid); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); } /* Is there enough to get to the rest of the READ_RSP header? */ @@ -1651,14 +1670,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", - rdata->iov[0].iov_base, server->total_read); - /* how much data is in the response? */ #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; -- cgit v1.2.3 From ec678eae746dd25766a61c4095e2b649d3b20b09 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:38:11 -0800 Subject: CIFS: Fix credit calculation for encrypted reads with errors We do need to account for credits received in error responses to read requests on encrypted sessions. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index fa8d2e1076c8..73f9c6af4065 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3207,11 +3207,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, server->ops->is_status_pending(buf, server, 0)) return -1; - rdata->result = server->ops->map_error(buf, false); + /* set up first two iov to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = + min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + rdata->result = server->ops->map_error(buf, true); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - dequeue_mid(mid, rdata->result); + /* normal error on read response */ + dequeue_mid(mid, false); return 0; } @@ -3284,14 +3296,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, server->vals->read_rsp_size); - length = rdata->copy_into_pages(server, rdata, &iter); kfree(bvec); -- cgit v1.2.3 From 3d3003fce8e837acc4e3960fe3cbabebc356dcb5 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 22 Jan 2019 16:50:21 -0800 Subject: CIFS: Fix credit calculations in compound mid callback The current code doesn't do proper accounting for credits in SMB1 case: it adds one credit per response only if we get a complete response while it needs to return it unconditionally. Fix this and also include malformed responses for SMB2+ into accounting for credits because such responses have Credit Granted field, thus nothing prevents to get a proper credit value from them. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 6 +++++- fs/cifs/transport.c | 11 +---------- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 73f9c6af4065..153238fc4fa9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -154,7 +154,11 @@ smb2_get_credits(struct mid_q_entry *mid) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; - return le16_to_cpu(shdr->CreditRequest); + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) + return le16_to_cpu(shdr->CreditRequest); + + return 0; } static int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 202e0e84efdd..53532bd3f50d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -786,17 +786,8 @@ static void cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; - unsigned int optype = mid->optype; - unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) { - if (mid->resp_buf) - credits_received = server->ops->get_credits(mid); - else - cifs_dbg(FYI, "Bad state for cancelled MID\n"); - } - - add_credits(server, credits_received, optype); + add_credits(server, server->ops->get_credits(mid), mid->optype); } static void -- cgit v1.2.3 From 0fd1d37b0501efc6e295f56ab55cdaff784aa50c Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 15 Jan 2019 15:08:48 -0800 Subject: CIFS: Do not assume one credit for async responses If we don't receive a response we can't assume that the server granted one credit. Assume zero credits in such cases. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0af87bd0dc49..2ff209ec4fab 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2924,9 +2924,10 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); @@ -3183,7 +3184,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 1; + unsigned int credits_received = 0; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3222,6 +3223,9 @@ smb2_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(shdr->CreditRequest); + /* fall through */ default: if (rdata->result != -ENODATA) rdata->result = -EIO; @@ -3407,7 +3411,7 @@ smb2_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -3435,6 +3439,9 @@ smb2_writev_callback(struct mid_q_entry *mid) case MID_RETRY_NEEDED: wdata->result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + /* fall through */ default: wdata->result = -EIO; break; -- cgit v1.2.3 From 6a9cbdd1ceca1dc2359ddf082efe61b97c3e752b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 16 Jan 2019 11:48:42 -0800 Subject: CIFS: Fix mounts if the client is low on credits If the server doesn't grant us at least 3 credits during the mount we won't be able to complete it because query path info operation requires 3 credits. Use the cached file handle if possible to allow the mount to succeed. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index f14533da3a93..01a76bccdb8d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -293,6 +293,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; + struct cifs_fid fid; + bool no_cached_open = tcon->nohandlecache; *adjust_tz = false; *symlink = false; @@ -301,6 +303,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; + + /* If it is a root and its handle is cached then use it */ + if (!strlen(full_path) && !no_cached_open) { + rc = open_shroot(xid, tcon, &fid); + if (rc) + goto out; + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + close_shroot(&tcon->crfid); + if (rc) + goto out; + move_smb2_info_to_cifs(data, smb2_data); + goto out; + } + if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; -- cgit v1.2.3 From 2e5700bdde438ed708b36d8acd0398dc73cbf759 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 23 Jan 2019 16:20:38 +1000 Subject: smb3: add credits we receive from oplock/break PDUs Otherwise we gradually leak credits leading to potential hung session. Signed-off-by: Ronnie Sahlberg CC: Stable Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a9c47541c53..7b8b58fb4d3f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; + if (rsp->sync_hdr.CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) -- cgit v1.2.3 From a5f1a81f701c594194eb70c679785882ab15f138 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 24 Jan 2019 16:19:31 +1000 Subject: cifs: print CIFSMaxBufSize as part of /proc/fs/cifs/DebugData Was helpful in debug for some recent problems. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 593fb422d0f3..e92a2fee3c57 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -252,6 +252,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, ",ACL"); #endif seq_putc(m, '\n'); + seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); -- cgit v1.2.3 From d88c93f090f708c18195553b352b9f205e65418f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 23 Jan 2019 11:27:02 +0100 Subject: debugfs: fix debugfs_rename parameter checking debugfs_rename() needs to check that the dentries passed into it really are valid, as sometimes they are not (i.e. if the return value of another debugfs call is passed into this one.) So fix this up by properly checking if the two parent directories are errors (they are allowed to be NULL), and if the dentry to rename is not NULL or an error. Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..41ef452c1fcf 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -787,6 +787,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; + if (IS_ERR(old_dir)) + return old_dir; + if (IS_ERR(new_dir)) + return new_dir; + if (IS_ERR_OR_NULL(old_dentry)) + return old_dentry; + trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) -- cgit v1.2.3 From 8e47a457321ca1a74ad194ab5dcbca764bc70731 Mon Sep 17 00:00:00 2001 From: Piotr Jaroszynski Date: Sun, 27 Jan 2019 08:46:45 -0800 Subject: iomap: get/put the page in iomap_page_create/release() migrate_page_move_mapping() expects pages with private data set to have a page_count elevated by 1. This is what used to happen for xfs through the buffer_heads code before the switch to iomap in commit 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads"). Not having the count elevated causes move_pages() to fail on memory mapped files coming from xfs. Make iomap compatible with the migrate_page_move_mapping() assumption by elevating the page count as part of iomap_page_create() and lowering it in iomap_page_release(). It causes the move_pages() syscall to misbehave on memory mapped files from xfs. It does not not move any pages, which I suppose is "just" a perf issue, but it also ends up returning a positive number which is out of spec for the syscall. Talking to Michal Hocko, it sounds like returning positive numbers might be a necessary update to move_pages() anyway though. Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads") Signed-off-by: Piotr Jaroszynski [hch: actually get/put the page iomap_migrate_page() to make it work properly] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index a3088fae567b..cb184ff68680 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -116,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -132,6 +138,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } @@ -569,8 +576,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (page_has_private(page)) { ClearPagePrivate(page); + get_page(newpage); set_page_private(newpage, page_private(page)); set_page_private(page, 0); + put_page(page); SetPagePrivate(newpage); } -- cgit v1.2.3 From 4ea899ead2786a30aaa8181fefa81a3df4ad28f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jan 2019 08:58:58 -0800 Subject: iomap: fix a use after free in iomap_dio_rw Introduce a local wait_for_completion variable to avoid an access to the potentially freed dio struture after dropping the last reference count. Also use the chance to document the completion behavior to make the refcounting clear to the reader of the code. Fixes: ff6a9292e6 ("iomap: implement direct I/O") Reported-by: Chandan Rajendra Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Tested-by: Chandan Rajendra Tested-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index cb184ff68680..897c60215dd1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1813,6 +1813,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -1832,7 +1833,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->end_io = end_io; dio->error = 0; dio->flags = 0; - dio->wait_for_completion = is_sync_kiocb(iocb); dio->submit.iter = iter; dio->submit.waiter = current; @@ -1887,7 +1887,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; - if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) @@ -1903,7 +1903,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret <= 0) { /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { - dio->wait_for_completion = true; + wait_for_completion = true; ret = 0; } break; @@ -1925,8 +1925,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!dio->wait_for_completion) + if (!wait_for_completion) return -EIOCBQUEUED; for (;;) { @@ -1943,9 +1959,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - ret = iomap_dio_complete(dio); - - return ret; + return iomap_dio_complete(dio); out_free_dio: kfree(dio); -- cgit v1.2.3 From a6279470762c19ba97e454f90798373dccdf6148 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 25 Jan 2019 11:48:51 +0000 Subject: Btrfs: fix deadlock when allocating tree block during leaf/node split When splitting a leaf or node from one of the trees that are modified when flushing pending block groups (extent, chunk, device and free space trees), we need to allocate a new tree block, which in turn can result in the need to allocate a new block group. After allocating the new block group we may need to flush new block groups that were previously allocated during the course of the current transaction, which is what may cause a deadlock due to attempts to write lock twice the same leaf or node, as when splitting a leaf or node we are holding a write lock on it and its parent node. The same type of deadlock can also happen when increasing the tree's height, since we are holding a lock on the existing root while allocating the tree block to use as the new root node. An example trace when the deadlock happens during the leaf split path is: [27175.293054] CPU: 0 PID: 3005 Comm: kworker/u17:6 Tainted: G W 4.19.16 #1 [27175.293942] Hardware name: Penguin Computing Relion 1900/MD90-FS0-ZB-XX, BIOS R15 06/25/2018 [27175.294846] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] (...) [27175.298384] RSP: 0018:ffffab2087107758 EFLAGS: 00010246 [27175.299269] RAX: 0000000000000bbd RBX: ffff9fadc7141c48 RCX: 0000000000000001 [27175.300155] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9fadc7141c48 [27175.301023] RBP: 0000000000000001 R08: ffff9faeb6ac1040 R09: ffff9fa9c0000000 [27175.301887] R10: 0000000000000000 R11: 0000000000000040 R12: ffff9fb21aac8000 [27175.302743] R13: ffff9fb1a64d6a20 R14: 0000000000000001 R15: ffff9fb1a64d6a18 [27175.303601] FS: 0000000000000000(0000) GS:ffff9fb21fa00000(0000) knlGS:0000000000000000 [27175.304468] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [27175.305339] CR2: 00007fdc8743ead8 CR3: 0000000763e0a006 CR4: 00000000003606f0 [27175.306220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [27175.307087] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [27175.307940] Call Trace: [27175.308802] btrfs_search_slot+0x779/0x9a0 [btrfs] [27175.309669] ? update_space_info+0xba/0xe0 [btrfs] [27175.310534] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.311397] btrfs_insert_item+0x60/0xd0 [btrfs] [27175.312253] btrfs_create_pending_block_groups+0xee/0x210 [btrfs] [27175.313116] do_chunk_alloc+0x25f/0x300 [btrfs] [27175.313984] find_free_extent+0x706/0x10d0 [btrfs] [27175.314855] btrfs_reserve_extent+0x9b/0x1d0 [btrfs] [27175.315707] btrfs_alloc_tree_block+0x100/0x5b0 [btrfs] [27175.316548] split_leaf+0x130/0x610 [btrfs] [27175.317390] btrfs_search_slot+0x94d/0x9a0 [btrfs] [27175.318235] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.319087] alloc_reserved_file_extent+0x84/0x2c0 [btrfs] [27175.319938] __btrfs_run_delayed_refs+0x596/0x1150 [btrfs] [27175.320792] btrfs_run_delayed_refs+0xed/0x1b0 [btrfs] [27175.321643] delayed_ref_async_start+0x81/0x90 [btrfs] [27175.322491] normal_work_helper+0xd0/0x320 [btrfs] [27175.323328] ? move_linked_works+0x6e/0xa0 [27175.324160] process_one_work+0x191/0x370 [27175.324976] worker_thread+0x4f/0x3b0 [27175.325763] kthread+0xf8/0x130 [27175.326531] ? rescuer_thread+0x320/0x320 [27175.327284] ? kthread_create_worker_on_cpu+0x50/0x50 [27175.328027] ret_from_fork+0x35/0x40 [27175.328741] ---[ end trace 300a1b9f0ac30e26 ]--- Fix this by preventing the flushing of new blocks groups when splitting a leaf/node and when inserting a new root node for one of the trees modified by the flushing operation, similar to what is done when COWing a node/leaf from on of these trees. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202383 Reported-by: Eli V CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 78 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f64aad613727..5a6c39b44c84 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -968,6 +968,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return 0; } +static struct extent_buffer *alloc_tree_block_no_bg_flush( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent_start, + const struct btrfs_disk_key *disk_key, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *ret; + + /* + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. + * For similar reasons, we also need to delay flushing pending block + * groups when splitting a leaf or node, from one of those trees, since + * we are holding a write lock on it and its parent or when inserting a + * new root node for one of those trees. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root || + root == fs_info->free_space_root) + trans->can_flush_pending_bgs = false; + + ret = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, disk_key, level, + hint, empty_size); + trans->can_flush_pending_bgs = true; + + return ret; +} + /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -1015,28 +1057,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; - /* - * If we are COWing a node/leaf from the extent, chunk, device or free - * space trees, make sure that we do not finish block group creation of - * pending block groups. We do this to avoid a deadlock. - * COWing can result in allocation of a new chunk, and flushing pending - * block groups (btrfs_create_pending_block_groups()) can be triggered - * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk, device and free space trees, - * therefore we could deadlock with ourselves since we are holding a - * lock on an extent buffer that btrfs_create_pending_block_groups() may - * try to COW later. - */ - if (root == fs_info->extent_root || - root == fs_info->chunk_root || - root == fs_info->dev_root || - root == fs_info->free_space_root) - trans->can_flush_pending_bgs = false; - - cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size); - trans->can_flush_pending_bgs = true; + cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3345,8 +3367,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &lower_key, level, root->node->start, 0); + c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, + root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3475,8 +3497,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, + c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4260,8 +4282,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, + l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); -- cgit v1.2.3 From 302167c50b32e7fccc98994a91d40ddbbab04e52 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2019 09:31:43 -0500 Subject: btrfs: don't end the transaction for delayed refs in throttle Previously callers to btrfs_end_transaction_throttle() would commit the transaction if there wasn't enough delayed refs space. This happens in relocation, and if the fs is relatively empty we'll run out of delayed refs space basically immediately, so we'll just be stuck in this loop of committing the transaction over and over again. This code existed because we didn't have a good feedback mechanism for running delayed refs, but with the delayed refs rsv we do now. Delete this throttling code and let the btrfs_start_transaction() in relocation deal with putting pressure on the delayed refs infrastructure. With this patch we no longer take 5 minutes to balance a metadata only fs. Qu has submitted a fstest to catch slow balance or excessive transaction commits. Steps to reproduce: * create subvolume * create many (eg. 16000) inlined files, of size 2KiB * iteratively snapshot and touch several files to trigger metadata updates * start balance -m Reported-by: Qu Wenruo Fixes: 64403612b73a ("btrfs: rework btrfs_check_space_for_delayed_refs") Signed-off-by: Josef Bacik [ add tags and steps to reproduce ] Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 127fa1535f58..f15cf46f1b9d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -850,14 +850,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && should_end_transaction(trans) && - READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { - spin_lock(&info->trans_lock); - if (cur_trans->state == TRANS_STATE_RUNNING) - cur_trans->state = TRANS_STATE_BLOCKED; - spin_unlock(&info->trans_lock); - } - if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) return btrfs_commit_transaction(trans); -- cgit v1.2.3 From 80ff00172407e0aad4b10b94ef0816fc3e7813cb Mon Sep 17 00:00:00 2001 From: Yao Liu Date: Mon, 28 Jan 2019 19:44:14 +0800 Subject: nfs: Fix NULL pointer dereference of dev_name There is a NULL pointer dereference of dev_name in nfs_parse_devname() The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ... RIP: 0010:nfs_fs_mount+0x3b6/0xc20 [nfs] ... Call Trace: ? ida_alloc_range+0x34b/0x3d0 ? nfs_clone_super+0x80/0x80 [nfs] ? nfs_free_parsed_mount_data+0x60/0x60 [nfs] mount_fs+0x52/0x170 ? __init_waitqueue_head+0x3b/0x50 vfs_kern_mount+0x6b/0x170 do_mount+0x216/0xdc0 ksys_mount+0x83/0xd0 __x64_sys_mount+0x25/0x30 do_syscall_64+0x65/0x220 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this by adding a NULL check on dev_name Signed-off-by: Yao Liu Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 22ce3c8a2f46..0570391eaa16 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1895,6 +1895,11 @@ static int nfs_parse_devname(const char *dev_name, size_t len; char *end; + if (unlikely(!dev_name || !*dev_name)) { + dfprintk(MOUNT, "NFS: device name not specified\n"); + return -EINVAL; + } + /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); -- cgit v1.2.3 From ff9fb72bc07705c00795ca48631f7fffe24d2c6b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 23 Jan 2019 11:28:14 +0100 Subject: debugfs: return error values, not NULL When an error happens, debugfs should return an error pointer value, not NULL. This will prevent the totally theoretical error where a debugfs call fails due to lack of memory, returning NULL, and that dentry value is then passed to another debugfs call, which would end up succeeding, creating a file at the root of the debugfs tree, but would then be impossible to remove (because you can not remove the directory NULL). So, to make everyone happy, always return errors, this makes the users of debugfs much simpler (they do not have to ever check the return value), and everyone can rest easy. Reported-by: Gary R Hook Reported-by: Heiko Carstens Reported-by: Masami Hiramatsu Reported-by: Michal Hocko Reported-by: Sebastian Andrzej Siewior Reported-by: Ulf Hansson Reviewed-by: Masami Hiramatsu Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 41ef452c1fcf..b16f8035b1af 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -254,8 +254,8 @@ MODULE_ALIAS_FS("debugfs"); * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file - * doesn't exist or an error occurs, %NULL will be returned. The returned - * dentry must be passed to dput() when it is no longer needed. + * doesn't exist or an error occurs, %ERR_PTR(-ERROR) will be returned. The + * returned dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -265,17 +265,17 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) struct dentry *dentry; if (IS_ERR(parent)) - return NULL; + return parent; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_one_len_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) - return NULL; + return dentry; if (!d_really_is_positive(dentry)) { dput(dentry); - return NULL; + return ERR_PTR(-EINVAL); } return dentry; } @@ -324,7 +324,7 @@ static struct dentry *failed_creating(struct dentry *dentry) inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); - return NULL; + return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) @@ -347,7 +347,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, dentry = start_creating(name, parent); if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -386,7 +386,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -464,7 +465,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -495,7 +497,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -506,7 +509,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) struct inode *inode; if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -545,7 +548,7 @@ struct dentry *debugfs_create_automount(const char *name, struct inode *inode; if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -581,8 +584,8 @@ EXPORT_SYMBOL(debugfs_create_automount); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is - * unloaded, you are responsible here.) If an error occurs, %NULL will be - * returned. + * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) + * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -594,12 +597,12 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) - return NULL; + return ERR_PTR(-ENOMEM); dentry = start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); - return NULL; + return dentry; } inode = debugfs_get_inode(dentry->d_sb); @@ -827,7 +830,9 @@ exit: if (dentry && !IS_ERR(dentry)) dput(dentry); unlock_rename(new_dir, old_dir); - return NULL; + if (IS_ERR(dentry)) + return dentry; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(debugfs_rename); -- cgit v1.2.3 From 8fc75bed96bb94e23ca51bd9be4daf65c57697bf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 29 Jan 2019 15:52:55 -0500 Subject: NFS: Fix up return value on fatal errors in nfs_page_async_flush() Ensure that we return the fatal error value that caused us to exit nfs_page_async_flush(). Fixes: c373fff7bd25 ("NFSv4: Don't special case "launder"") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.12+ Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5a0bbf917a32..f12cb31a41e5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; + ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - } + } else + ret = -EAGAIN; nfs_redirty_request(req); - ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); -- cgit v1.2.3 From 58d15ed1203f4d858c339ea4d7dafa94bd2a56d3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 29 Jan 2019 12:46:16 +1000 Subject: cifs: fix computation for MAX_SMB2_HDR_SIZE The size of the fixed part of the create response is 88 bytes not 56. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2pdu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7a2d0a2255e6..e9cc6775df21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ -#define MAX_SMB2_HDR_SIZE 0x00b0 +/* 52 transform hdr + 64 hdr + 88 create rsp */ +#define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -- cgit v1.2.3 From c4627e66f73a28c5515b908c90c2bf7120086497 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 29 Jan 2019 12:46:17 +1000 Subject: cifs: limit amount of data we request for xattrs to CIFSMaxBufSize minus the various headers and blobs that will be part of the reply. or else we might trigger a session reconnect. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 4 +++- fs/cifs/smb2pdu.h | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 153238fc4fa9..6f96e2292856 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -866,7 +866,9 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, - SMB2_MAX_EA_BUF, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE, &rsp_iov, &buftype, cifs_sb); if (rc) { /* diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index e9cc6775df21..538e2299805f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -85,6 +85,7 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* 52 transform hdr + 64 hdr + 88 create rsp */ +#define SMB2_TRANSFORM_HEADER_SIZE 52 #define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) @@ -648,6 +649,13 @@ struct smb2_create_req { __u8 Buffer[0]; } __packed; +/* + * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + + * 88 (fixed part of create response) + 520 (path) + 150 (contexts) + + * 2 bytes of padding. + */ +#define MAX_SMB2_CREATE_RESPONSE_SIZE 824 + struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ @@ -996,6 +1004,11 @@ struct smb2_close_req { __u64 VolatileFileId; /* opaque endianness */ } __packed; +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + struct smb2_close_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* 60 */ @@ -1398,8 +1411,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ -#define SMB2_MAX_EA_BUF 65536 - struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; __u8 flags; -- cgit v1.2.3 From 9bda8723da2d55b1de833b98cf802b88006e5b69 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 23 Jan 2019 17:12:09 -0800 Subject: CIFS: Fix possible oops and memory leaks in async IO Allocation of a page array for non-cached IO was separated from allocation of rdata and wdata structures and this introduced memory leaks and a possible null pointer dereference. This patch fixes these problems. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2c7689f3998d..659ce1b92c44 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2696,6 +2696,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -2707,6 +2708,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -3386,8 +3388,12 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, } rc = cifs_read_allocate_pages(rdata, npages); - if (rc) - goto error; + if (rc) { + kvfree(rdata->pages); + kfree(rdata); + add_credits_and_wake_if(server, credits, 0); + break; + } rdata->tailsz = PAGE_SIZE; } @@ -3407,7 +3413,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (!rdata->cfile->invalidHandle || !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); -error: if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, -- cgit v1.2.3 From 7d42e72fe8ee5ab70b1af843dd7d8615e6fb0abe Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 25 Jan 2019 11:38:53 -0800 Subject: CIFS: Fix trace command logging for SMB2 reads and writes Currently we log success once we send an async IO request to the server. Instead we need to analyse a response and then log success or failure for a particular command. Also fix argument list for read logging. Cc: # 4.18 Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2ff209ec4fab..4b5ab9c80cc3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3241,8 +3241,17 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) + if (rdata->result) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); + trace_smb3_read_err(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, rdata->offset, + rdata->bytes, rdata->result); + } else + trace_smb3_read_done(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); @@ -3317,13 +3326,11 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); - } else - trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); + trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length, rc); + } cifs_small_buf_release(buf); return rc; @@ -3367,10 +3374,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); + trace_smb3_read_err(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length, + rc); } - trace_smb3_read_err(rc, xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; } else @@ -3459,8 +3467,17 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->mr = NULL; } #endif - if (wdata->result) + if (wdata->result) { cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + trace_smb3_write_err(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, wdata->result); + } else + trace_smb3_write_done(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); @@ -3602,10 +3619,7 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } else - trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes); + } async_writev_out: cifs_small_buf_release(req); -- cgit v1.2.3 From 8e6e72aeceaaed5aeeb1cb43d3085de7ceb14f79 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 26 Jan 2019 12:21:32 -0800 Subject: CIFS: Do not count -ENODATA as failure for query directory Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4b5ab9c80cc3..d858dc04fdc3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3845,8 +3845,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } -- cgit v1.2.3 From 082aaa8700415f6471ec9c5ef0c8307ca214989a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:54:34 -0800 Subject: CIFS: Do not consider -ENODATA as stat failure for reads When doing reads beyound the end of a file the server returns error STATUS_END_OF_FILE error which is mapped to -ENODATA. Currently we report it as a failure which confuses read stats. Change it to not consider -ENODATA as failure for stat purposes. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d858dc04fdc3..ef52d6642431 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3241,7 +3241,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) { + if (rdata->result && rdata->result != -ENODATA) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); trace_smb3_read_err(0 /* xid */, rdata->cfile->fid.persistent_fid, -- cgit v1.2.3 From 37ea7b630ae5cdea4e8ff381d9d23abfef5939e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 30 Jan 2019 12:37:35 +0100 Subject: debugfs: debugfs_lookup() should return NULL if not found Lots of callers of debugfs_lookup() were just checking NULL to see if the file/directory was found or not. By changing this in ff9fb72bc077 ("debugfs: return error values, not NULL") we caused some subsystems to easily crash. Fixes: ff9fb72bc077 ("debugfs: return error values, not NULL") Reported-by: syzbot+b382ba6a802a3d242790@syzkaller.appspotmail.com Reported-by: Tetsuo Handa Cc: Omar Sandoval Cc: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b16f8035b1af..29c68c5d44d5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -254,8 +254,8 @@ MODULE_ALIAS_FS("debugfs"); * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file - * doesn't exist or an error occurs, %ERR_PTR(-ERROR) will be returned. The - * returned dentry must be passed to dput() when it is no longer needed. + * doesn't exist or an error occurs, %NULL will be returned. The returned + * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -265,17 +265,17 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) struct dentry *dentry; if (IS_ERR(parent)) - return parent; + return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_one_len_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) - return dentry; + return NULL; if (!d_really_is_positive(dentry)) { dput(dentry); - return ERR_PTR(-EINVAL); + return NULL; } return dentry; } -- cgit v1.2.3 From 92900e5160a5444d47dd376bc40066b709fbb5a6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Jan 2019 04:58:00 +0000 Subject: btrfs: fix potential oops in device_list_add alloc_fs_devices() can return ERR_PTR(-ENOMEM), so dereferencing its result before the check for IS_ERR() is a bad idea. Fixes: d1a63002829a4 ("btrfs: add members to fs_devices to track fsid changes") Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: Al Viro Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e4f8f88353e..15561926ab32 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -957,11 +957,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, else fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - fs_devices->fsid_change = fsid_change_in_progress; - if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); + fs_devices->fsid_change = fsid_change_in_progress; + mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); -- cgit v1.2.3 From c7cc64a98512ffc41df86d14a414eb3b09bf7481 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 23 Jan 2019 17:09:16 +0100 Subject: btrfs: clean up pending block groups when transaction commit aborts The fstests generic/475 stresses transaction aborts and can reveal space accounting or use-after-free bugs regarding block goups. In this case the pending block groups that remain linked to the structures after transaction commit aborts in the middle. The corrupted slabs lead to failures in following tests, eg. generic/476 [ 8172.752887] BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 [ 8172.755799] #PF error: [normal kernel read fault] [ 8172.757571] PGD 661ae067 P4D 661ae067 PUD 3db8e067 PMD 0 [ 8172.759000] Oops: 0000 [#1] PREEMPT SMP [ 8172.760209] CPU: 0 PID: 39 Comm: kswapd0 Tainted: G W 5.0.0-rc2-default #408 [ 8172.762495] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8172.765772] RIP: 0010:shrink_page_list+0x2f9/0xe90 [ 8172.770453] RSP: 0018:ffff967f00663b18 EFLAGS: 00010287 [ 8172.771184] RAX: 0000000000000000 RBX: ffff967f00663c20 RCX: 0000000000000000 [ 8172.772850] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8c0620ab20e0 [ 8172.774629] RBP: ffff967f00663dd8 R08: 0000000000000000 R09: 0000000000000000 [ 8172.776094] R10: ffff8c0620ab22f8 R11: ffff8c063f772688 R12: ffff967f00663b78 [ 8172.777533] R13: ffff8c063f625600 R14: ffff8c063f625608 R15: dead000000000200 [ 8172.778886] FS: 0000000000000000(0000) GS:ffff8c063d400000(0000) knlGS:0000000000000000 [ 8172.780545] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8172.781787] CR2: 0000000000000058 CR3: 000000004e962000 CR4: 00000000000006f0 [ 8172.783547] Call Trace: [ 8172.784112] shrink_inactive_list+0x194/0x410 [ 8172.784747] shrink_node_memcg.constprop.85+0x3a5/0x6a0 [ 8172.785472] shrink_node+0x62/0x1e0 [ 8172.786011] balance_pgdat+0x216/0x460 [ 8172.786577] kswapd+0xe3/0x4a0 [ 8172.787085] ? finish_wait+0x80/0x80 [ 8172.787795] ? balance_pgdat+0x460/0x460 [ 8172.788799] kthread+0x116/0x130 [ 8172.789640] ? kthread_create_on_node+0x60/0x60 [ 8172.790323] ret_from_fork+0x24/0x30 [ 8172.794253] CR2: 0000000000000058 or accounting errors at umount time: [ 8159.537251] WARNING: CPU: 2 PID: 19031 at fs/btrfs/extent-tree.c:5987 btrfs_free_block_groups+0x3d5/0x410 [btrfs] [ 8159.543325] CPU: 2 PID: 19031 Comm: umount Tainted: G W 5.0.0-rc2-default #408 [ 8159.545472] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8159.548155] RIP: 0010:btrfs_free_block_groups+0x3d5/0x410 [btrfs] [ 8159.554030] RSP: 0018:ffff967f079cbde8 EFLAGS: 00010206 [ 8159.555144] RAX: 0000000001000000 RBX: ffff8c06366cf800 RCX: 0000000000000000 [ 8159.556730] RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff8c06255ad800 [ 8159.558279] RBP: ffff8c0637ac0000 R08: 0000000000000001 R09: 0000000000000000 [ 8159.559797] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8c0637ac0108 [ 8159.561296] R13: ffff8c0637ac0158 R14: 0000000000000000 R15: dead000000000100 [ 8159.562852] FS: 00007f7f693b9fc0(0000) GS:ffff8c063d800000(0000) knlGS:0000000000000000 [ 8159.564839] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8159.566160] CR2: 00007f7f68fab7b0 CR3: 000000000aec7000 CR4: 00000000000006e0 [ 8159.567898] Call Trace: [ 8159.568597] close_ctree+0x17f/0x350 [btrfs] [ 8159.569628] generic_shutdown_super+0x64/0x100 [ 8159.570808] kill_anon_super+0x14/0x30 [ 8159.571857] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8159.573063] deactivate_locked_super+0x29/0x60 [ 8159.574234] cleanup_mnt+0x3b/0x70 [ 8159.575176] task_work_run+0x98/0xc0 [ 8159.576177] exit_to_usermode_loop+0x83/0x90 [ 8159.577315] do_syscall_64+0x15b/0x180 [ 8159.578339] entry_SYSCALL_64_after_hwframe+0x49/0xbe This fix is based on 2 Josef's patches that used sideefects of btrfs_create_pending_block_groups, this fix introduces the helper that does what we need. CC: stable@vger.kernel.org # 4.4+ CC: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f15cf46f1b9d..4ec2b660d014 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1871,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) kmem_cache_free(btrfs_trans_handle_cachep, trans); } +/* + * Release reserved delayed ref space of all pending block groups of the + * transaction and remove them from the list + */ +static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } +} + static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* @@ -2262,6 +2277,7 @@ scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); + btrfs_cleanup_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); -- cgit v1.2.3 From 532b618bdf237250d6d4566536d4b6ce3d0a31fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 30 Jan 2019 07:54:12 -0600 Subject: btrfs: On error always free subvol_name in btrfs_mount The subvol_name is allocated in btrfs_parse_subvol_options and is consumed and freed in mount_subvol. Add a free to the error paths that don't call mount_subvol so that it is guaranteed that subvol_name is freed when an error happens. Fixes: 312c89fbca06 ("btrfs: cleanup btrfs_mount() using btrfs_mount_root()") Cc: stable@vger.kernel.org # v4.19+ Reviewed-by: Nikolay Borisov Signed-off-by: "Eric W. Biederman" Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 368a5b9e6c13..74023786a735 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1677,6 +1677,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags | SB_RDONLY, device_name, data); if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } @@ -1686,12 +1687,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error < 0) { root = ERR_PTR(error); mntput(mnt_root); + kfree(subvol_name); goto out; } } } if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } -- cgit v1.2.3 From 1dbd449c9943e3145148cc893c2461b72ba6fef0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Jan 2019 13:52:36 -0500 Subject: fs/dcache: Fix incorrect nr_dentry_unused accounting in shrink_dcache_sb() The nr_dentry_unused per-cpu counter tracks dentries in both the LRU lists and the shrink lists where the DCACHE_LRU_LIST bit is set. The shrink_dcache_sb() function moves dentries from the LRU list to a shrink list and subtracts the dentry count from nr_dentry_unused. This is incorrect as the nr_dentry_unused count will also be decremented in shrink_dentry_list() via d_shrink_del(). To fix this double decrement, the decrement in the shrink_dcache_sb() function is taken out. Fixes: 4e717f5c1083 ("list_lru: remove special case function list_lru_dispose_all." Cc: stable@kernel.org Signed-off-by: Waiman Long Reviewed-by: Dave Chinner Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2593153471cf..44e5652b2664 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1188,15 +1188,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } -- cgit v1.2.3 From af0c9af1b3f66052c369d08be3f60fa9a9559e48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Jan 2019 13:52:38 -0500 Subject: fs/dcache: Track & report number of negative dentries The current dentry number tracking code doesn't distinguish between positive & negative dentries. It just reports the total number of dentries in the LRU lists. As excessive number of negative dentries can have an impact on system performance, it will be wise to track the number of positive and negative dentries separately. This patch adds tracking for the total number of negative dentries in the system LRU lists and reports it in the 5th field in the /proc/sys/fs/dentry-state file. The number, however, does not include negative dentries that are in flight but not in the LRU yet as well as those in the shrinker lists which are on the way out anyway. The number of positive dentries in the LRU lists can be roughly found by subtracting the number of negative dentries from the unused count. Matthew Wilcox had confirmed that since the introduction of the dentry_stat structure in 2.1.60, the dummy array was there, probably for future extension. They were not replacements of pre-existing fields. So no sane applications that read the value of /proc/sys/fs/dentry-state will do dummy thing if the last 2 fields of the sysctl parameter are not zero. IOW, it will be safe to use one of the dummy array entry for negative dentry count. Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- Documentation/sysctl/fs.txt | 26 ++++++++++++++++---------- fs/dcache.c | 32 ++++++++++++++++++++++++++++++++ include/linux/dcache.h | 7 ++++--- 3 files changed, 52 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 819caf8ca05f..58649bd4fcfc 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -56,26 +56,32 @@ of any kernel data structures. dentry-state: -From linux/fs/dentry.c: +From linux/include/linux/dcache.h: -------------------------------------------------------------- -struct { +struct dentry_stat_t dentry_stat { int nr_dentry; int nr_unused; int age_limit; /* age in seconds */ int want_pages; /* pages requested by system */ - int dummy[2]; -} dentry_stat = {0, 0, 45, 0,}; --------------------------------------------------------------- - -Dentries are dynamically allocated and deallocated, and -nr_dentry seems to be 0 all the time. Hence it's safe to -assume that only nr_unused, age_limit and want_pages are -used. Nr_unused seems to be exactly what its name says. + int nr_negative; /* # of unused negative dentries */ + int dummy; /* Reserved for future use */ +}; +-------------------------------------------------------------- + +Dentries are dynamically allocated and deallocated. + +nr_dentry shows the total number of dentries allocated (active ++ unused). nr_unused shows the number of dentries that are not +actively used, but are saved in the LRU list for future reuse. + Age_limit is the age in seconds after which dcache entries can be reclaimed when memory is short and want_pages is nonzero when shrink_dcache_pages() has been called and the dcache isn't pruned yet. +nr_negative shows the number of unused dentries that are also +negative dentries which do not mapped to actual files. + ============================================================== dquot-max & dquot-nr: diff --git a/fs/dcache.c b/fs/dcache.c index 44e5652b2664..aac41adf4743 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); +static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) @@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } +static long get_nr_dentry_negative(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_negative, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); + dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -317,6 +329,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) @@ -371,6 +385,11 @@ static void dentry_unlink_inode(struct dentry * dentry) * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * + * The per-cpu "nr_dentry_negative" counters are only updated + * when deleted from or added to the per-superblock LRU list, not + * from/to the shrink list. That is to avoid an unneeded dec/inc + * pair when moving from LRU to shrink list in select_collect(). + * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ @@ -380,6 +399,8 @@ static void d_lru_add(struct dentry *dentry) D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -388,6 +409,8 @@ static void d_lru_del(struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -418,6 +441,8 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } @@ -426,6 +451,8 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } @@ -1816,6 +1843,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); + /* + * Decrement negative dentry count if it was in the LRU list. + */ + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ef4b70f64f33..60996e64c579 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -62,9 +62,10 @@ extern const struct qstr slash_name; struct dentry_stat_t { long nr_dentry; long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long dummy[2]; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; extern struct dentry_stat_t dentry_stat; -- cgit v1.2.3 From d339adc12a4f885b572c5412e4869af8939db854 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 31 Jan 2019 13:46:07 +0100 Subject: CIFS: fix use-after-free of the lease keys The request buffers are freed right before copying the pointers. Use the func args instead which are identical and still valid. Simple reproducer (requires KASAN enabled) on a cifs mount: echo foo > foo ; tail -f foo & rm foo Cc: # 4.20 Fixes: 179e44d49c2f ("smb3: add tracepoint for sending lease break responses to server") Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Paulo Alcantara --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ef52d6642431..77b3aaa39b35 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -4441,8 +4441,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - please_key_low = (__u64 *)req->LeaseKey; - please_key_high = (__u64 *)(req->LeaseKey+8); + please_key_low = (__u64 *)lease_key; + please_key_high = (__u64 *)(lease_key+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, -- cgit v1.2.3 From b9b9378b49030d1aeca2387660fcd1ac1f306cad Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Jan 2019 17:27:33 -0600 Subject: cifs: update internal module version number To 2.17 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d1f9c2f3f575..7652551a1fc4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.16" +#define CIFS_VERSION "2.17" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From e74c98ca2d6ae4376cc15fa2a22483430909d96b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 30 Jan 2019 21:30:36 +0100 Subject: gfs2: Revert "Fix loop in gfs2_rbm_find" This reverts commit 2d29f6b96d8f80322ed2dd895bca590491c38d34. It turns out that the fix can lead to a ~20 percent performance regression in initial writes to the page cache according to iozone. Let's revert this for now to have more time for a proper fix. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 831d7cb5a49c..17a8d3b43990 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1780,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { - n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; -- cgit v1.2.3 From 8fdd60f2ae3682caf2a7258626abc21eb4711892 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 31 Jan 2019 23:41:11 -0500 Subject: Revert "ext4: use ext4_write_inode() when fsyncing w/o a journal" This reverts commit ad211f3e94b314a910d4af03178a0b52a7d1ee0a. As Jan Kara pointed out, this change was unsafe since it means we lose the call to sync_mapping_buffers() in the nojournal case. The original point of the commit was avoid taking the inode mutex (since it causes a lockdep warning in generic/113); but we need the mutex in order to call sync_mapping_buffers(). The real fix to this problem was discussed here: https://lore.kernel.org/lkml/20181025150540.259281-4-bvanassche@acm.org The proposed patch was to fix a syzbot complaint, but the problem can also demonstrated via "kvm-xfstests -c nojournal generic/113". Multiple solutions were discused in the e-mail thread, but none have landed in the kernel as of this writing. Anyway, commit ad211f3e94b314 is absolutely the wrong way to suppress the lockdep, so revert it. Fixes: ad211f3e94b314a910d4af03178a0b52a7d1ee0a ("ext4: use ext4_write_inode() when fsyncing w/o a journal") Signed-off-by: Theodore Ts'o Reported: Jan Kara --- fs/ext4/fsync.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 712f00995390..5508baa11bb6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; - if (!journal) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL - }; - - ret = ext4_write_inode(inode, &wbc); + ret = __generic_file_fsync(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) @@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } + ret = file_write_and_wait_range(file, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. -- cgit v1.2.3 From 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 1 Feb 2019 14:20:01 -0800 Subject: proc: fix /proc/net/* after setns(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /proc entries under /proc/net/* can't be cached into dcache because setns(2) can change current net namespace. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: avoid vim miscolorization] [adobriyan@gmail.com: write test, add dummy ->d_revalidate hook: necessary if /proc/net/* is pinned at setns time] Link: http://lkml.kernel.org/r/20190108192350.GA12034@avx2 Link: http://lkml.kernel.org/r/20190107162336.GA9239@avx2 Fixes: 1da4d377f943fe4194ffb9fb9c26cc58fad4dd24 ("proc: revalidate misc dentries") Signed-off-by: Alexey Dobriyan Reported-by: Mateusz Stępień Reported-by: Ahmad Fatoum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 4 +- fs/proc/internal.h | 1 + fs/proc/proc_net.c | 20 +++++ tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/setns-dcache.c | 129 ++++++++++++++++++++++++++++ 6 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/proc/setns-dcache.c (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ae109429a88..e39bac94dead 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51e..95b14196f284 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d5e0fcb3439e..a7b12435519e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 82121a81681f..29bac5ef9a93 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -10,4 +10,5 @@ /proc-uptime-002 /read /self +/setns-dcache /thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 1c12c34cf85d..434d033ee067 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -14,6 +14,7 @@ TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read TEST_GEN_PROGS += self +TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/setns-dcache.c b/tools/testing/selftests/proc/setns-dcache.c new file mode 100644 index 000000000000..60ab197a73fc --- /dev/null +++ b/tools/testing/selftests/proc/setns-dcache.c @@ -0,0 +1,129 @@ +/* + * Copyright © 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that setns(CLONE_NEWNET) points to new /proc/net content even + * if old one is in dcache. + * + * FIXME /proc/net/unix is under CONFIG_UNIX which can be disabled. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pid_t pid = -1; + +static void f(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +int main(void) +{ + int fd[2]; + char _ = 0; + int nsfd; + + atexit(f); + + /* Check for priviledges and syscall availability straight away. */ + if (unshare(CLONE_NEWNET) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + /* Distinguisher between two otherwise empty net namespaces. */ + if (socket(AF_UNIX, SOCK_STREAM, 0) == -1) { + return 1; + } + + if (pipe(fd) == -1) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWNET) == -1) { + return 1; + } + + if (write(fd[1], &_, 1) != 1) { + return 1; + } + + pause(); + + return 0; + } + + if (read(fd[0], &_, 1) != 1) { + return 1; + } + + { + char buf[64]; + snprintf(buf, sizeof(buf), "/proc/%u/ns/net", pid); + nsfd = open(buf, O_RDONLY); + if (nsfd == -1) { + return 1; + } + } + + /* Reliably pin dentry into dcache. */ + (void)open("/proc/net/unix", O_RDONLY); + + if (setns(nsfd, CLONE_NEWNET) == -1) { + return 1; + } + + kill(pid, SIGTERM); + pid = 0; + + { + char buf[4096]; + ssize_t rv; + int fd; + + fd = open("/proc/net/unix", O_RDONLY); + if (fd == -1) { + return 1; + } + +#define S "Num RefCount Protocol Flags Type St Inode Path\n" + rv = read(fd, buf, sizeof(buf)); + + assert(rv == strlen(S)); + assert(memcmp(buf, S, strlen(S)) == 0); + } + + return 0; +} -- cgit v1.2.3 From c27d82f52f75fc9d8d9d40d120d2a96fdeeada5e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Feb 2019 14:21:23 -0800 Subject: fs/drop_caches.c: avoid softlockups in drop_pagecache_sb() When superblock has lots of inodes without any pagecache (like is the case for /proc), drop_pagecache_sb() will iterate through all of them without dropping sb->s_inode_list_lock which can lead to softlockups (one of our customers hit this). Fix the problem by going to the slow path and doing cond_resched() in case the process needs rescheduling. Link: http://lkml.kernel.org/r/20190114085343.15011-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 82377017130f..d31b6c72b476 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; -- cgit v1.2.3 From 63ce5f552beb9bdb41546b3a26c4374758b21815 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 1 Feb 2019 14:21:26 -0800 Subject: autofs: drop dentry reference only when it is never used autofs_expire_run() calls dput(dentry) to drop the reference count of dentry. However, dentry is read via autofs_dentry_ino(dentry) after that. This may result in a use-free-bug. The patch drops the reference count of dentry only when it is never used. Link: http://lkml.kernel.org/r/154725122396.11260.16053424107144453867.stgit@pluto-themaw-net Signed-off-by: Pan Bian Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index d441244b79df..28d9c2b1b3bb 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } -- cgit v1.2.3 From f585b283e3f025754c45bbe7533fc6e5c4643700 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 1 Feb 2019 14:21:29 -0800 Subject: autofs: fix error return in autofs_fill_super() In autofs_fill_super() on error of get inode/make root dentry the return should be ENOMEM as this is the only failure case of the called functions. Link: http://lkml.kernel.org/r/154725123240.11260.796773942606871359.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 0e8ea2d9a2bb..078992eee299 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -266,8 +266,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; -- cgit v1.2.3 From aa6ee4ab69293969867ab09b57546d226ace3d7a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:36:36 -0800 Subject: xfs: eof trim writeback mapping as soon as it is cached The cached writeback mapping is EOF trimmed to try and avoid races between post-eof block management and writeback that result in sending cached data to a stale location. The cached mapping is currently trimmed on the validation check, which leaves a race window between the time the mapping is cached and when it is trimmed against the current inode size. For example, if a new mapping is cached by delalloc conversion on a blocksize == page size fs, we could cycle various locks, perform memory allocations, etc. in the writeback codepath before the associated mapping is eventually trimmed to i_size. This leaves enough time for a post-eof truncate and file append before the cached mapping is trimmed. The former event essentially invalidates a range of the cached mapping and the latter bumps the inode size such the trim on the next writepage event won't trim all of the invalid blocks. fstest generic/464 reproduces this scenario occasionally and causes a lost writeback and stale delalloc blocks warning on inode inactivation. To work around this problem, trim the cached writeback mapping as soon as it is cached in addition to on subsequent validation checks. This is a minor tweak to tighten the race window as much as possible until a proper invalidation mechanism is available. Fixes: 40214d128e07 ("xfs: trim writepage mapping to within eof") Cc: # v4.14+ Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 338b9d9984e0..d9048bcea49c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -449,6 +449,7 @@ xfs_map_blocks( } wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: @@ -459,6 +460,7 @@ allocate_blocks: ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; } -- cgit v1.2.3 From 465fa17f4a303d9fdff9eac4d45f91ece92e96ca Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 3 Feb 2019 14:03:06 -0800 Subject: xfs: end sync buffer I/O properly on shutdown error As of commit e339dd8d8b ("xfs: use sync buffer I/O for sync delwri queue submission"), the delwri submission code uses sync buffer I/O for sync delwri I/O. Instead of waiting on async I/O to unlock the buffer, it uses the underlying sync I/O completion mechanism. If delwri buffer submission fails due to a shutdown scenario, an error is set on the buffer and buffer completion never occurs. This can cause xfs_buf_delwri_submit() to deadlock waiting on a completion event. We could check the error state before waiting on such buffers, but that doesn't serialize against the case of an error set via a racing I/O completion. Instead, invoke I/O completion in the shutdown case regardless of buffer I/O type. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index eedc5e0156ff..1f9857e3630a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1536,8 +1536,7 @@ __xfs_buf_submit( xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioend(bp); + xfs_buf_ioend(bp); return -EIO; } -- cgit v1.2.3 From add46b3b021263c02d5a7080c58e5b459479fafd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 3 Feb 2019 14:03:59 -0800 Subject: xfs: set buffer ops when repair probes for btree type In xrep_findroot_block, we work out the btree type and correctness of a given block by calling different btree verifiers on root block candidates. However, we leave the NULL b_ops while ->verify_read validates the block, which means that if the verifier calls xfs_buf_verifier_error it'll crash on the null b_ops. Fix it to set b_ops before calling the verifier and unsetting it if the verifier fails. Furthermore, improve the documentation around xfs_buf_ensure_ops, which is the function that is responsible for cleaning up the b_ops state of buffers that go through xrep_findroot_block but don't match anything. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/repair.c | 11 ++++++++--- fs/xfs/xfs_buf.c | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1c8eecfe52b8..6acf1bfa0bfe 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -768,18 +768,23 @@ xrep_findroot_block( if (!uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) goto out; + /* + * Read verifiers can reference b_ops, so we set the pointer + * here. If the verifier fails we'll reset the buffer state + * to what it was before we touched the buffer. + */ + bp->b_ops = fab->buf_ops; fab->buf_ops->verify_read(bp); if (bp->b_error) { + bp->b_ops = NULL; bp->b_error = 0; goto out; } /* * Some read verifiers will (re)set b_ops, so we must be - * careful not to blow away any such assignment. + * careful not to change b_ops after running the verifier. */ - if (!bp->b_ops) - bp->b_ops = fab->buf_ops; } /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1f9857e3630a..4f5f2ff3f70f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,10 +776,26 @@ _xfs_buf_read( } /* + * Set buffer ops on an unchecked buffer and validate it, if possible. + * * If the caller passed in an ops structure and the buffer doesn't have ops * assigned, set the ops and use them to verify the contents. If the contents * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no * recorded errors and is already in XBF_DONE state. + * + * Under normal operations, every in-core buffer must have buffer ops assigned + * to them when the buffer is read in from disk so that we can validate the + * metadata. + * + * However, there are two scenarios where one can encounter in-core buffers + * that don't have buffer ops. The first is during log recovery of buffers on + * a V4 filesystem, though these buffers are purged at the end of recovery. + * + * The other is online repair, which tries to match arbitrary metadata blocks + * with btree types in order to find the root. If online repair doesn't match + * the buffer with /any/ btree type, the buffer remains in memory in DONE state + * with no ops, and a subsequent read_buf call from elsewhere will not set the + * ops. This function helps us fix this situation. */ int xfs_buf_ensure_ops( -- cgit v1.2.3 From ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Tue, 5 Feb 2019 14:13:35 -0500 Subject: aio: initialize kiocb private in case any filesystems expect it. A recent optimization had left private uninitialized. Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb aio_get_req()") Reviewed-by: Christoph Hellwig Signed-off-by: Mike Marshall Signed-off-by: Jens Axboe --- fs/aio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index b906ff70c90f..aaaaf4d12c73 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1436,6 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) if (unlikely(!req->ki_filp)) return -EBADF; req->ki_complete = aio_complete_rw; + req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) -- cgit v1.2.3 From 43636c804df0126da669c261fc820fb22f62bfc2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 21 Jan 2019 22:49:37 +0900 Subject: fs: ratelimit __find_get_block_slow() failure message. When something let __find_get_block_slow() hit all_mapped path, it calls printk() for 100+ times per a second. But there is no need to print same message with such high frequency; it is just asking for stall warning, or at least bloating log files. [ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.873324][T15342] b_state=0x00000029, b_size=512 [ 399.878403][T15342] device loop0 blocksize: 4096 [ 399.883296][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.890400][T15342] b_state=0x00000029, b_size=512 [ 399.895595][T15342] device loop0 blocksize: 4096 [ 399.900556][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.907471][T15342] b_state=0x00000029, b_size=512 [ 399.912506][T15342] device loop0 blocksize: 4096 This patch reduces frequency to up to once per a second, in addition to concatenating three lines into one. [ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8, b_state=0x00000029, b_size=512, device loop0 blocksize: 4096 Signed-off-by: Tetsuo Handa Reviewed-by: Jan Kara Cc: Dmitry Vyukov Signed-off-by: Jens Axboe --- fs/buffer.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 52d024bfdbc1..48318fb74938 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) struct buffer_head *head; struct page *page; int all_mapped = 1; + static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); @@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ - if (all_mapped) { - printk("__find_get_block_slow() failed. " - "block=%llu, b_blocknr=%llu\n", - (unsigned long long)block, - (unsigned long long)bh->b_blocknr); - printk("b_state=0x%08lx, b_size=%zu\n", - bh->b_state, bh->b_size); - printk("device %pg blocksize: %d\n", bdev, - 1 << bd_inode->i_blkbits); + ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); + if (all_mapped && __ratelimit(&last_warned)) { + printk("__find_get_block_slow() failed. block=%llu, " + "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " + "device %pg blocksize: %d\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr, + bh->b_state, bh->b_size, bdev, + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); -- cgit v1.2.3 From e3fdc89ca47ef34dfb6fd5101fec084c3dba5486 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Jan 2019 15:58:38 -0500 Subject: nfsd: Fix error return values for nfsd4_clone_file_range() If the parameter 'count' is non-zero, nfsd4_clone_file_range() will currently clobber all errors returned by vfs_clone_file_range() and replace them with EINVAL. Fixes: 42ec3d4c0218 ("vfs: make remap_file_range functions take and...") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9824e32b2f23..7dc98e14655d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -557,9 +557,11 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, loff_t cloned; cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (cloned < 0) + return nfserrno(cloned); if (count && cloned != count) - cloned = -EINVAL; - return nfserrno(cloned < 0 ? cloned : 0); + return nfserrno(-EINVAL); + return 0; } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, -- cgit v1.2.3 From d2ceb7e57086750ea6198a31fd942d98099a0786 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 6 Feb 2019 06:09:43 -0500 Subject: NFS: Don't use page_file_mapping after removing the page If nfs_page_async_flush() removes the page from the mapping, then we can't use page_file_mapping() on it as nfs_updatepate() is wont to do when receiving an error. Instead, push the mapping to the stack before the page is possibly truncated. Fixes: 8fc75bed96bb ("NFS: Fix up return value on fatal errors in nfs_page_async_flush()") Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f12cb31a41e5..d09c9f878141 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -238,9 +238,9 @@ out: } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -static void nfs_set_pageerror(struct page *page) +static void nfs_set_pageerror(struct address_space *mapping) { - nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); + nfs_zap_mapping(mapping->host, mapping); } /* @@ -994,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - nfs_set_pageerror(req->wb_page); + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_context_set_write_error(req->wb_context, hdr->error); goto remove_req; } @@ -1348,7 +1348,8 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = page_file_mapping(page); + struct inode *inode = mapping->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1366,7 +1367,7 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) - nfs_set_pageerror(page); + nfs_set_pageerror(mapping); else __set_page_dirty_nobuffers(page); out: -- cgit v1.2.3 From 69056ee6a8a3d576ed31e38b3b14c70d6c74edcc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Feb 2019 15:35:51 -0800 Subject: Revert "mm: don't reclaim inodes with many attached pages" This reverts commit a76cf1a474d7d ("mm: don't reclaim inodes with many attached pages"). This change causes serious changes to page cache and inode cache behaviour and balance, resulting in major performance regressions when combining worklaods such as large file copies and kernel compiles. https://bugzilla.kernel.org/show_bug.cgi?id=202441 This change is a hack to work around the problems introduced by changing how agressive shrinkers are on small caches in commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects"). It creates more problems than it solves, wasn't adequately reviewed or tested, so it needs to be reverted. Link: http://lkml.kernel.org/r/20190130041707.27750-2-david@fromorbit.com Fixes: a76cf1a474d7d ("mm: don't reclaim inodes with many attached pages") Signed-off-by: Dave Chinner Cc: Wolfgang Walter Cc: Roman Gushchin Cc: Spock Cc: Rik van Riel Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 0cd47fe0dbe5..73432e64f874 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* - * Recently referenced inodes and inodes with many attached pages - * get one more pass. - */ - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; -- cgit v1.2.3 From 27dd768ed8db48beefc4d9e006c58e7a00342bde Mon Sep 17 00:00:00 2001 From: Sandeep Patil Date: Tue, 12 Feb 2019 15:36:11 -0800 Subject: mm: proc: smaps_rollup: fix pss_locked calculation The 'pss_locked' field of smaps_rollup was being calculated incorrectly. It accumulated the current pss everytime a locked VMA was found. Fix that by adding to 'pss_locked' the same time as that of 'pss' if the vma being walked is locked. Link: http://lkml.kernel.org/r/20190203065425.14650-1-sspatil@android.com Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") Signed-off-by: Sandeep Patil Acked-by: Vlastimil Babka Reviewed-by: Joel Fernandes (Google) Cc: Alexey Dobriyan Cc: Daniel Colascione Cc: [4.14.x, 4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..85b0ef890b28 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -423,7 +423,7 @@ struct mem_size_stats { }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty) + bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + if (locked) + mss->pss_locked += (u64)size << PSS_SHIFT; return; } for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); + unsigned long pss = (PAGE_SIZE << PSS_SHIFT); if (mapcount >= 2) { if (dirty || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + mss->pss += pss / mapcount; + if (locked) + mss->pss_locked += pss / mapcount; } else { if (dirty || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; - mss->pss += PAGE_SIZE << PSS_SHIFT; + mss->pss += pss; + if (locked) + mss->pss_locked += pss; } } } @@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; if (pte_present(*pte)) { @@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page; /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else VM_BUG_ON_PAGE(1, page); - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif - /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); - if (vma->vm_flags & VM_LOCKED) - mss->pss_locked += mss->pss; } #define SEQ_PUT_DEC(str, val) \ -- cgit v1.2.3 From 3bf6b57ec2ec945e5a6edf5c202a754f1e852ecd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 14 Feb 2019 12:33:19 -0500 Subject: Revert "nfsd4: return default lease period" This reverts commit d6ebf5088f09472c1136cd506bdc27034a6763f8. I forgot that the kernel's default lease period should never be decreased! After a kernel upgrade, the kernel has no way of knowing on its own what the previous lease time was. Unless userspace tells it otherwise, it will assume the previous lease period was the same. So if we decrease this value in a kernel upgrade, we end up enforcing a grace period that's too short, and clients will fail to reclaim state in time. Symptoms may include EIO and log messages like "NFS: nfs4_reclaim_open_state: Lock reclaim failed!" There was no real justification for the lease period decrease anyway. Reported-by: Donald Buczek Fixes: d6ebf5088f09 "nfsd4: return default lease period" Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b33f9785b756..72a7681f4046 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 45; /* default lease time */ - nn->nfsd4_grace = 45; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); -- cgit v1.2.3 From 23e93c9b2cde73f9912d0d8534adbddd3dcc48f4 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 13 Feb 2019 15:12:17 -0500 Subject: Revert "gfs2: read journal in large chunks to locate the head" This reverts commit 2a5f14f279f59143139bcd1606903f2f80a34241. This patch causes xfstests generic/311 to fail. Reverting this for now until we have a proper fix. Signed-off-by: Abhi Das Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- fs/gfs2/glops.c | 1 - fs/gfs2/log.c | 4 +- fs/gfs2/lops.c | 190 ++------------------------------------------------- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/recovery.c | 123 +++++++++++++++++++++++++++++++++ fs/gfs2/recovery.h | 2 + fs/gfs2/super.c | 1 - 8 files changed, 134 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..78510ab91835 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,7 +28,6 @@ #include "util.h" #include "trans.h" #include "dir.h" -#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5bfaf381921a..b8830fda51e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..2295042bc625 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,9 +17,7 @@ #include #include #include -#include -#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -195,6 +193,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio + * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -231,19 +230,20 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @opf: REQ_OP | op_flags + * @op: REQ_OP + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio->bi_opf = opf; + bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op); + gfs2_log_submit_bio(biop, op, 0); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,184 +375,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } -/** - * gfs2_end_log_read - end I/O callback for reads from the log - * @bio: The bio - * - * Simply unlock the pages in the bio. The main thread will wait on them and - * process them in order as necessary. - */ - -static void gfs2_end_log_read(struct bio *bio) -{ - struct page *page; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); - } - - bio_put(bio); -} - -/** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. - * @jd: The journal descriptor - * @page: The page to look in - * - * Returns: 1 if found, 0 otherwise. - */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log_header_host uninitialized_var(lh); - void *kaddr = kmap_atomic(page); - unsigned int offset; - bool ret = false; - - for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { - if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { - if (lh.lh_sequence > head->lh_sequence) - *head = lh; - else { - ret = true; - break; - } - } - } - kunmap_atomic(kaddr); - return ret; -} - -/** - * gfs2_jhead_process_page - Search/cleanup a page - * @jd: The journal descriptor - * @index: Index of the page to look into - * @done: If set, perform only cleanup, else search and set if found. - * - * Find the page with 'index' in the journal's mapping. Search the page for - * the journal head if requested (cleanup == false). Release refs on the - * page so the page cache can reclaim it (put_page() twice). We grabbed a - * reference on this page two times, first when we did a find_or_create_page() - * to obtain the page to add it to the bio and second when we do a - * find_get_page() here to get the page to wait on while I/O on it is being - * completed. - * This function is also used to free up a page we might've grabbed but not - * used. Maybe we added it to a bio, but not submitted it for I/O. Or we - * submitted the I/O, but we already found the jhead so we only need to drop - * our references to the page. - */ - -static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, - struct gfs2_log_header_host *head, - bool *done) -{ - struct page *page; - - page = find_get_page(jd->jd_inode->i_mapping, index); - wait_on_page_locked(page); - - if (PageError(page)) - *done = true; - - if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, page); - - put_page(page); /* Once for find_get_page */ - put_page(page); /* Once more for find_or_create_page */ -} - -/** - * gfs2_find_jhead - find the head of a log - * @jd: The journal descriptor - * @head: The log descriptor for the head of the log is returned here - * - * Do a search of a journal by reading it in large chunks using bios and find - * the valid log entry with the highest sequence number. (i.e. the log head) - * - * Returns: 0 on success, errno otherwise - */ - -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct address_space *mapping = jd->jd_inode->i_mapping; - struct gfs2_journal_extent *je; - u32 block, read_idx = 0, submit_idx = 0, index = 0; - int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; - int blocks_per_page = 1 << shift, sz, ret = 0; - struct bio *bio = NULL; - struct page *page; - bool done = false; - errseq_t since; - - memset(head, 0, sizeof(*head)); - if (list_empty(&jd->extent_list)) - gfs2_map_journal_extents(sdp, jd); - - since = filemap_sample_wb_err(mapping); - list_for_each_entry(je, &jd->extent_list, list) { - for (block = 0; block < je->blocks; block += blocks_per_page) { - index = (je->lblock + block) >> shift; - - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; - done = true; - goto out; - } - - if (bio) { - sz = bio_add_page(bio, page, PAGE_SIZE, 0); - if (sz == PAGE_SIZE) - goto page_added; - submit_idx = index; - submit_bio(bio); - bio = NULL; - } - - bio = gfs2_log_alloc_bio(sdp, - je->dblock + (index << shift), - gfs2_end_log_read); - bio->bi_opf = REQ_OP_READ; - sz = bio_add_page(bio, page, PAGE_SIZE, 0); - gfs2_assert_warn(sdp, sz == PAGE_SIZE); - -page_added: - if (submit_idx <= read_idx + BIO_MAX_PAGES) { - /* Keep at least one bio in flight */ - continue; - } - - gfs2_jhead_process_page(jd, read_idx++, head, &done); - if (done) - goto out; /* found */ - } - } - -out: - if (bio) - submit_bio(bio); - while (read_idx <= index) - gfs2_jhead_process_page(jd, read_idx++, head, &done); - - if (!ret) - ret = filemap_check_wb_err(mapping, since); - - return ret; -} - static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, u32 ld_length, u32 ld_data1) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 331160fc568b..711c4d89c063 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -30,10 +30,8 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_submit_bio(struct bio **biop, int opf); +extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1179763f6370..b041cb8ae383 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -41,7 +41,6 @@ #include "dir.h" #include "meta_io.h" #include "trace_gfs2.h" -#include "lops.h" #define DO 0 #define UNDO 1 diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 7389e445a7a7..2dac43065382 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -181,6 +181,129 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, return error; } +/** + * find_good_lh - find a good log header + * @jd: the journal + * @blk: the segment to start searching from + * @lh: the log header to fill in + * @forward: if true search forward in the log, else search backward + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, either scan forward or backward until we find a good one. + * + * Returns: errno + */ + +static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, + struct gfs2_log_header_host *head) +{ + unsigned int orig_blk = *blk; + int error; + + for (;;) { + error = get_log_header(jd, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd->jd_blocks) + *blk = 0; + + if (*blk == orig_blk) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @jd: the journal + * @head: this is filled in with the log descriptor of the head + * + * At this point, seg and lh should be either the head of the log or just + * before. Scan forward until we find the head. + * + * Returns: errno + */ + +static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + unsigned int blk = head->lh_blkno; + struct gfs2_log_header_host lh; + int error; + + for (;;) { + if (++blk == jd->jd_blocks) + blk = 0; + + error = get_log_header(jd, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: the journal + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e. the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_log_header_host lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + /** * foreach_descriptor - go through the active part of the log * @jd: the journal diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 99575ab81202..11d81248be85 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -27,6 +27,8 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); extern int __get_log_header(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d4b11c903971..ca71163ff7cf 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -45,7 +45,6 @@ #include "util.h" #include "sys.h" #include "xattr.h" -#include "lops.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) -- cgit v1.2.3 From cb5b020a8d38f77209d0472a0fea755299a8ec78 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Feb 2019 15:02:18 -0800 Subject: Revert "exec: load_script: don't blindly truncate shebang string" This reverts commit 8099b047ecc431518b9bb6bdbba3549bbecdc343. It turns out that people do actually depend on the shebang string being truncated, and on the fact that an interpreter (like perl) will often just re-interpret it entirely to get the full argument list. Reported-by: Samuel Dionne-Riel Acked-by: Kees Cook Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d0078cbb718b..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -42,14 +42,10 @@ static int load_script(struct linux_binprm *bprm) fput(bprm->file); bprm->file = NULL; - for (cp = bprm->buf+2;; cp++) { - if (cp >= bprm->buf + BINPRM_BUF_SIZE) - return -ENOEXEC; - if (!*cp || (*cp == '\n')) - break; - } + bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; + if ((cp = strchr(bprm->buf, '\n')) == NULL) + cp = bprm->buf+BINPRM_BUF_SIZE-1; *cp = '\0'; - while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) -- cgit v1.2.3