Diffstat (limited to 'fs'): 444 files changed, 26697 insertions, 12937 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index ac2ec4543fe1..09fd4a185fd2 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -32,13 +32,13 @@ endif config 9P_FS_SECURITY - bool "9P Security Labels" - depends on 9P_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the 9P filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. + bool "9P Security Labels" + depends on 9P_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the 9P filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fe7f0bd2048e..92cd1d80218d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -388,7 +388,10 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", iov_iter_count(to), iocb->ki_pos); - ret = p9_client_read(fid, iocb->ki_pos, to, &err); + if (iocb->ki_filp->f_flags & O_NONBLOCK) + ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); + else + ret = p9_client_read(fid, iocb->ki_pos, to, &err); if (!ret) return err; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b82423a72f68..c9255d399917 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -143,7 +143,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, default: p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", type, stat->extension); - }; + } *rdev = MKDEV(major, minor); } else res |= S_IFREG; diff --git a/fs/Kconfig b/fs/Kconfig index 708ba336e689..f08fbbfafd9a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -140,9 +140,10 @@ endmenu endif # BLOCK if BLOCK -menu "DOS/FAT/NT Filesystems" +menu "DOS/FAT/EXFAT/NT Filesystems" source "fs/fat/Kconfig" +source "fs/exfat/Kconfig" source "fs/ntfs/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index 505e51166973..2ce5112b02c8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ obj-$(CONFIG_CODA_FS) += coda/ obj-$(CONFIG_MINIX_FS) += minix/ obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_EXFAT_FS) += exfat/ obj-$(CONFIG_BFS_FS) += bfs/ obj-$(CONFIG_ISO9660_FS) += isofs/ obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index a3cdb0036c5d..f3a0f412b43b 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -186,7 +186,7 @@ static int find_autofs_mount(const char *pathname, struct path path; int err; - err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; @@ -519,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path_mountpoint(AT_FDCWD, - name, &path, LOOKUP_FOLLOW); + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, + &path); else err = find_autofs_mount(name, &path, test_by_type, &type); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e82..13f25e241ac4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include <linux/highuid.h> #include <linux/compiler.h> #include 
<linux/highmem.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/security.h> @@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; - struct { - struct elfhdr interp_elf_ex; - } *loc; + struct elfhdr *interp_elf_ex = NULL; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; struct mm_struct *mm; struct pt_regs *regs; - loc = kmalloc(sizeof(*loc), GFP_KERNEL); - if (!loc) { - retval = -ENOMEM; - goto out_ret; - } - retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) @@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); + interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); + if (!interp_elf_ex) { + retval = -ENOMEM; + goto out_free_ph; + } + /* Get the exec headers */ - retval = elf_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), 0); + retval = elf_read(interpreter, interp_elf_ex, + sizeof(*interp_elf_ex), 0); if (retval < 0) goto out_free_dentry; @@ -806,25 +805,25 @@ out_free_interp: if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ - if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ - if (!elf_check_arch(&loc->interp_elf_ex) || - elf_check_fdpic(&loc->interp_elf_ex)) + if (!elf_check_arch(interp_elf_ex) || + elf_check_fdpic(interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ - interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_ppnt = interp_elf_phdata; - for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->interp_elf_ex, + retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) @@ -839,7 +838,7 @@ out_free_interp: * the exec syscall. */ retval = arch_check_elf(elf_ex, - !!interpreter, &loc->interp_elf_ex, + !!interpreter, interp_elf_ex, &arch_state); if (retval) goto out_free_dentry; @@ -1055,7 +1054,7 @@ out_free_interp: } if (interpreter) { - elf_entry = load_elf_interp(&loc->interp_elf_ex, + elf_entry = load_elf_interp(interp_elf_ex, interpreter, load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { @@ -1064,7 +1063,7 @@ out_free_interp: * adjustment */ interp_load_addr = elf_entry; - elf_entry += loc->interp_elf_ex.e_entry; + elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { retval = IS_ERR((void *)elf_entry) ? 
@@ -1075,6 +1074,9 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); + + kfree(interp_elf_ex); + kfree(interp_elf_phdata); } else { elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { @@ -1083,7 +1085,6 @@ out_free_interp: } } - kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); @@ -1153,12 +1154,11 @@ out_free_interp: start_thread(regs, elf_entry, bprm->p); retval = 0; out: - kfree(loc); -out_ret: return retval; /* error cleanup */ out_free_dentry: + kfree(interp_elf_ex); kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) @@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Hugetlb memory check */ - if (vma->vm_flags & VM_HUGETLB) { + if (is_vm_hugetlb_page(vma)) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 69bf2fb6f7cd..52b6f646cdbd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -34,6 +34,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include "internal.h" struct bdev_inode { @@ -1520,10 +1521,22 @@ rescan: if (ret) return ret; - if (invalidate) - set_capacity(disk, 0); - else if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independ of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices. Use the crude check + * below to get the sane behavior for most device while not breaking + * userspace for this particular setup. 
+ */ + if (invalidate) { + if (disk_part_scan_enabled(disk) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } else { + if (disk->fops->revalidate_disk) + disk->fops->revalidate_disk(disk); + } check_disk_size_change(disk, bdev, !invalidate); @@ -2001,7 +2014,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - if (IS_SWAPFILE(bd_inode)) + /* uswsusp needs write permission to the swap */ + if (IS_SWAPFILE(bd_inode) && !hibernation_available()) return -ETXTBSY; if (!iov_iter_count(from)) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9a0ff3384381..e738f6206ea5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1d32a07bb2d1..309516e6a968 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + if (wq->high) + flush_workqueue(wq->high->normal_wq); + + flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index a4434301d84d..3204daa51b95 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e5d85311d5d5..9c380e7edf62 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info, return -ENOMEM; ref->root_id = root_id; - if (key) { + if (key) ref->key_for_search = *key; - /* - * We can often find data backrefs with an offset that is too - * large (>= LLONG_MAX, maximum allowed file offset) due to - * underflows when subtracting a file's offset with the data - * offset of its corresponding extent data item. This can - * happen for example in the clone ioctl. - * So if we detect such case we set the search key's offset to - * zero to make sure we will find the matching file extent item - * at add_all_parents(), otherwise we will miss it because the - * offset taken form the backref is much larger then the offset - * of the file extent item. This can make us scan a very large - * number of file extent items, but at least it will not make - * us miss any. - * This is an ugly workaround for a behaviour that should have - * never existed, but it does and a fix for the clone ioctl - * would touch a lot of places, cause backwards incompatibility - * and would not fix the problem for extents cloned with older - * kernels. 
- */ - if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && - ref->key_for_search.offset >= LLONG_MAX) - ref->key_for_search.offset = 0; - } else { + else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); - } ref->inode_list = NULL; ref->level = level; @@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, wanted_disk_byte, count, sc, gfp_mask); } +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ + struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct prelim_ref *ref = NULL; + struct prelim_ref target = {0}; + int result; + + target.parent = bytenr; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, &target); + + if (result < 0) + p = &(*p)->rb_left; + else if (result > 0) + p = &(*p)->rb_right; + else + return 1; + } + return 0; +} + static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, struct prelim_ref *ref, + struct ulist *parents, + struct preftrees *preftrees, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs, bool ignore_offset) + bool ignore_offset) { int ret = 0; int slot; @@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, u64 disk_byte; u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; + u64 data_offset; if (level != 0) { eb = path->nodes[level]; @@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } /* - * We normally enter this function with the path already pointing to - * the first item to check. But sometimes, we may enter it with - * slot==nritems. In that case, go to the next leaf before we continue. + * 1. We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot == nritems. + * 2. We are searching for normal backref but bytenr of this leaf + * matches shared data backref + * 3. The leaf owner is not equal to the root we are searching + * + * For these cases, go to the next leaf before we continue. 
*/ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb) || + is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb)) { if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); } - while (!ret && count < total_refs) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, key.type != BTRFS_EXTENT_DATA_KEY) break; + /* + * We are searching for normal backref but bytenr of this leaf + * matches shared data backref, OR + * the leaf owner is not equal to the root we are searching for + */ + if (slot == 0 && + (is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb))) { + if (time_seq == SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + continue; + } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + data_offset = btrfs_file_extent_offset(eb, fi); if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; - count++; + if (ref->key_for_search.offset == key.offset - data_offset) + count++; + else + goto next; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -502,9 +532,9 @@ next: */ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, + struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs, - bool ignore_offset) + const u64 *extent_item_pos, bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -512,23 +542,25 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; - int index; + struct btrfs_key search_key = ref->key_for_search; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - root = btrfs_get_fs_root(fs_info, &root_key, false); if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); + goto out_free; + } + + if (!path->search_commit_root && + test_bit(BTRFS_ROOT_DELETING, &root->state)) { + ret = -ENOENT; goto out; } if (btrfs_is_testing(fs_info)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; } @@ -540,21 +572,36 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, else root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root_level + 1 == level) goto out; - } + /* + * We can often find data backrefs with an offset that is too large + * (>= LLONG_MAX, maximum allowed file offset) due to underflows when + * subtracting a file's offset with the data offset of its + * corresponding extent data item. This can happen for example in the + * clone ioctl. + * + * So if we detect such case we set the search key's offset to zero to + * make sure we will find the matching file extent item at + * add_all_parents(), otherwise we will miss it because the offset + * taken form the backref is much larger then the offset of the file + * extent item. This can make us scan a very large number of file + * extent items, but at least it will not make us miss any. 
+ * + * This is an ugly workaround for a behaviour that should have never + * existed, but it does and a fix for the clone ioctl would touch a lot + * of places, cause backwards incompatibility and would not fix the + * problem for extents cloned with older kernels. + */ + if (search_key.type == BTRFS_EXTENT_DATA_KEY && + search_key.offset >= LLONG_MAX) + search_key.offset = 0; path->lowest_level = level; if (time_seq == SEQ_LAST) - ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, - 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, - time_seq); - - /* root node has been locked, we can release @subvol_srcu safely here */ - srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = btrfs_search_old_slot(root, &search_key, path, time_seq); btrfs_debug(fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", @@ -574,9 +621,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs, ignore_offset); + ret = add_all_parents(root, path, parents, preftrees, ref, level, + time_seq, extent_item_pos, ignore_offset); out: + btrfs_put_root(root); +out_free: path->lowest_level = 0; btrfs_release_path(path); return ret; @@ -609,7 +658,7 @@ unode_aux_to_inode_list(struct ulist_node *node) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, u64 total_refs, + const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { int err; @@ -653,9 +702,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos, - total_refs, ignore_offset); + err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, + ref, parents, extent_item_pos, + ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
@@ -758,8 +807,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, */ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, u64 seq, - struct preftrees *preftrees, u64 *total_refs, - struct share_check *sc) + struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; @@ -793,7 +841,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, default: BUG(); } - *total_refs += count; switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ @@ -876,7 +923,7 @@ out: static int add_inline_refs(const struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, - u64 *total_refs, struct share_check *sc) + struct share_check *sc) { int ret = 0; int slot; @@ -900,7 +947,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); - *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -1125,8 +1171,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct prelim_ref *ref; struct rb_node *node; struct extent_inode_elem *eie = NULL; - /* total of both direct AND indirect refs! */ - u64 total_refs = 0; struct preftrees preftrees = { .direct = PREFTREE_INIT, .indirect = PREFTREE_INIT, @@ -1195,7 +1239,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = add_delayed_refs(fs_info, head, time_seq, - &preftrees, &total_refs, sc); + &preftrees, sc); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1216,8 +1260,7 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, - &total_refs, sc); + &info_level, &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(fs_info, path, bytenr, info_level, @@ -1236,7 +1279,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc, ignore_offset); + extent_item_pos, sc, ignore_offset); if (ret) goto out; @@ -1362,10 +1405,10 @@ static void free_leaf_list(struct ulist *blocks) * * returns 0 on success, <0 on error */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos, bool ignore_offset) { int ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 777f61dc081e..723d6da99114 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos, bool ignore_offset); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, bool ignore_offset); diff --git a/fs/btrfs/block-group.c 
b/fs/btrfs/block-group.c index 7f09147872dc..786849fcc319 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end int ret; while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, + ret = find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, EXTENT_DIRTY | EXTENT_UPTODATE, NULL); @@ -1248,6 +1248,55 @@ out: return ret; } +static bool clean_pinned_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_transaction *prev_trans = NULL; + const u64 start = bg->start; + const u64 end = start + bg->length - 1; + int ret; + + spin_lock(&fs_info->trans_lock); + if (trans->transaction->list.prev != &fs_info->trans_list) { + prev_trans = list_last_entry(&trans->transaction->list, + struct btrfs_transaction, list); + refcount_inc(&prev_trans->use_count); + } + spin_unlock(&fs_info->trans_lock); + + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, another + * task might be running finish_extent_commit() for the previous + * transaction N - 1, and have seen a range belonging to the block + * group in pinned_extents before we were able to clear the whole block + * group range from pinned_extents. This means that task can lookup for + * the block group after we unpinned it from pinned_extents and removed + * it, leading to a BUG_ON() at unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) { + ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto err; + } + + ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto err; + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + return true; + +err: + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(bg); + return false; +} + /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. @@ -1265,7 +1314,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1344,35 +1392,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ - start = block_group->start; - end = start + block_group->length - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). 
- */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); + if (!clean_pinned_extents(trans, block_group)) goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* * At this point, the block_group is read only and should fail @@ -1987,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } + rcu_read_lock(); list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | @@ -2007,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } + rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2345,7 +2368,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, return 0; } - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; again: inode = lookup_free_space_inode(block_group, path); @@ -2881,7 +2904,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, &cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); } diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index d07bd41a7c1e..27efec8f7c5b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,98 @@ #include "space-info.h" #include "transaction.h" +/* + * HOW DO BLOCK RESERVES WORK + * + * Think of block_rsv's as buckets for logically grouped metadata + * reservations. Each block_rsv has a ->size and a ->reserved. ->size is + * how large we want our block rsv to be, ->reserved is how much space is + * currently reserved for this block reserve. + * + * ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + * -> Reserve + * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + * We call into btrfs_reserve_metadata_bytes() with our bytes, which is + * accounted for in space_info->bytes_may_use, and then add the bytes to + * ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + * ->size is an over-estimation of how much we may use for a particular + * operation. + * + * -> Use + * Entrance: btrfs_use_block_rsv + * + * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + * to determine the appropriate block_rsv to use, and then verify that + * ->reserved has enough space for our tree block allocation. Once + * successful we subtract fs_info->nodesize from ->reserved. + * + * -> Finish + * Entrance: btrfs_block_rsv_release + * + * We are finished with our operation, subtract our individual reservation + * from ->size, and then subtract ->size from ->reserved and free up the + * excess if there is any. + * + * There is some logic here to refill the delayed refs rsv or the global rsv + * as needed, otherwise the excess is subtracted from + * space_info->bytes_may_use. 
+ * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + * These behave normally, as described above, just within the confines of the + * lifetime of their particular operation (transaction for the whole trans + * handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + * It is impossible to properly account for all the space that may be required + * to make our extent tree updates. This block reserve acts as an overflow + * buffer in case our delayed refs reserve does not reserve enough space to + * update the extent tree. + * + * We can steal from this in some cases as well, notably on evict() or + * truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + * The individual item sizes are determined by the per-inode size + * calculations, which are described with the delalloc code. This is pretty + * straightforward, it's just the calculation of ->size encodes a lot of + * different items, and thus it gets used when updating inodes, inserting file + * extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + * We keep a running tally of how many delayed refs we have on the system. + * We assume each one of these delayed refs are going to use a full + * reservation. We use the transaction items and pre-reserve space for every + * operation, and use this reservation to refill any gap between ->size and + * ->reserved that may exist. + * + * From there it's straightforward, removing a delayed ref means we remove its + * count from ->size and free up reservations as necessary. Since this is + * the most dynamic block reserve in the system, we will try to refill this + * block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + * This is the fallback block reserve to make us try to reserve space if we + * don't have a specific bucket for this allocation. It is mostly used for + * updating the device tree and such, since that is a separate pool we're + * content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + * This is used by things like truncate and iput. We will temporarily + * allocate a block reserve, set it to some size, and then truncate bytes + * until we have no space left. With ->failfast set we'll simply return + * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + * to make a new reservation. This is because these operations are + * unbounded, so we want to do as much work as we can, and then back off and + * re-reserve. 
+ */ + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes, @@ -111,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, { if (!rsv) return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); kfree(rsv); } @@ -178,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + u64 *qgroup_to_release) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; @@ -297,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - block_rsv->reserved += num_bytes; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, num_bytes); + block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, @@ -344,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) { - btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1, + NULL); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d1428bb73fc5..0b6ae5302837 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, u64 *qgroup_to_release); void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); @@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); - -static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); - btrfs_block_rsv_release(fs_info, block_rsv, 0); + btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); } #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e12a477d32e..27a1fefce508 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -60,6 +60,12 @@ struct btrfs_inode { */ struct extent_io_tree io_failure_tree; + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate + */ + struct extent_io_tree file_extent_tree; + /* 
held while logging the inode in tree-log.c */ struct mutex log_mutex; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a0ce69f2d27c..32e11a23b47f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,7 +77,6 @@ #include <linux/sched.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> @@ -152,11 +151,8 @@ struct btrfsic_block { struct list_head ref_to_list; /* list */ struct list_head ref_from_list; /* list */ struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; + void *orig_bio_private; + bio_end_io_t *orig_bio_end_io; int submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -325,14 +321,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); static void btrfsic_bio_end_io(struct bio *bp); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, int recursion_level); @@ -399,8 +393,8 @@ static void btrfsic_block_init(struct btrfsic_block *b) b->never_written = 0; b->mirror_num = 0; b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; + b->orig_bio_private = NULL; + b->orig_bio_end_io = NULL; INIT_LIST_HEAD(&b->collision_resolving_node); INIT_LIST_HEAD(&b->all_blocks_node); INIT_LIST_HEAD(&b->ref_to_list); @@ -767,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *super_tmp; u64 dev_bytenr; - struct buffer_head *bh; struct btrfsic_block *superblock_tmp; int pass; struct block_device *const superblock_bdev = device->bdev; + struct page *page; + struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; + int ret = 0; /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; - bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (NULL == bh) + + page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1))); + + super_tmp = page_address(page); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - brelse(bh); - return 0; + ret = 0; + goto out; } superblock_tmp = @@ -800,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp = btrfsic_block_alloc(); if (NULL == superblock_tmp) { pr_info("btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; + ret = -1; + goto out; } /* for superblock, only the dev_bytenr makes sense */ superblock_tmp->dev_bytenr = dev_bytenr; @@ -885,8 +881,8 @@ 
static int btrfsic_process_superblock_dev_mirror( mirror_num)) { pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", next_bytenr, mirror_num); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block = btrfsic_block_lookup_or_add( @@ -895,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num, NULL); if (NULL == next_block) { btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block->disk_key = tmp_disk_key; @@ -907,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror( BTRFSIC_GENERATION_UNKNOWN); btrfsic_release_block_ctx(&tmp_next_block_ctx); if (NULL == l) { - brelse(bh); - return -1; + ret = -1; + goto out; } } } if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) btrfsic_dump_tree_sub(state, superblock_tmp, 0); - brelse(bh); - return 0; +out: + put_page(page); + return ret; } static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) @@ -1743,7 +1740,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw) { int is_metadata; @@ -1902,9 +1898,9 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = + block->orig_bio_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; @@ -1916,25 +1912,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io. 
- bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } } @@ -2042,8 +2030,8 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; @@ -2054,24 +2042,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) @@ -2112,8 +2093,8 @@ static void btrfsic_bio_end_io(struct bio *bp) iodone_w_error = 1; BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; + bp->bi_private = block->orig_bio_private; + bp->bi_end_io = block->orig_bio_end_io; do { struct btrfsic_block *next_block; @@ -2146,38 +2127,6 @@ static void btrfsic_bio_end_io(struct bio *bp) bp->bi_end_io(bp); } -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, block->dev_state->name, - block->dev_bytenr, block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - static int 
btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const superblock, @@ -2730,63 +2679,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(op, op_flags, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); - - /* Only called to write the superblock (incl. FLUSH/FUA) */ - if (NULL != dev_state && - (op == REQ_OP_WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n", - op, op_flags, (unsigned long long)bh->b_blocknr, - dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - &bh->b_data, 1, NULL, - NULL, bh, op_flags); - } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", - op, op_flags, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = op_flags; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(op, op_flags, bh); -} - static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; @@ -2838,7 +2730,7 @@ static void __btrfsic_submit_bio(struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, bio, &bio_is_patched, - NULL, bio->bi_opf); + bio->bi_opf); bio_for_each_segment(bvec, bio, iter) kunmap(bvec.bv_page); kfree(mapped_datav); @@ -2862,8 +2754,8 @@ static void __btrfsic_submit_bio(struct bio *bio) block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 9bf4359cc44c..bcc730a06cb5 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); void btrfsic_submit_bio(struct bio *bio); int btrfsic_submit_bio_wait(struct bio *bio); #else -#define btrfsic_submit_bh submit_bh #define 
btrfsic_submit_bio submit_bio #define btrfsic_submit_bio_wait submit_bio_wait #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f2ec1a9bae28..bfedbbe2311f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, static const struct btrfs_csums { u16 size; - const char *name; - const char *driver; + const char name[10]; + const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, @@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type) const char *btrfs_super_csum_driver(u16 csum_type) { /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver ?: + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : btrfs_csums[csum_type].name; } @@ -143,44 +144,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_lock(eb); - if (eb == root->node) - break; - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); - if (eb == root->node) - break; - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. @@ -341,7 +304,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct rb_root *tm_root; struct rb_node *node; struct rb_node *next; - struct seq_list *cur_elem; struct tree_mod_elem *tm; u64 min_seq = (u64)-1; u64 seq_putting = elem->seq; @@ -353,18 +315,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, list_del(&elem->list); elem->seq = 0; - list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if (cur_elem->seq < min_seq) { - if (seq_putting > cur_elem->seq) { - /* - * blocker with lower sequence number exists, we - * cannot remove anything from the log - */ - write_unlock(&fs_info->tree_mod_log_lock); - return; - } - min_seq = cur_elem->seq; + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we + * cannot remove anything from the log. 
+ */ + write_unlock(&fs_info->tree_mod_log_lock); + return; } + min_seq = first->seq; } /* @@ -962,9 +926,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, - buf->start, - buf->len, + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags, level, 0); if (ret) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36df977b64d9..8aa7b9dac405 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +#include "locking.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -596,8 +597,8 @@ struct btrfs_fs_info { /* keep track of unallocated space */ atomic64_t free_chunk_space; - struct extent_io_tree freed_extents[2]; - struct extent_io_tree *pinned_extents; + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ struct extent_map_tree mapping_tree; @@ -696,7 +697,6 @@ struct btrfs_fs_info { struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; - struct srcu_struct subvol_srcu; spinlock_t trans_lock; /* @@ -947,6 +947,10 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct kobject *discard_debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; #endif }; @@ -955,11 +959,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) return sb->s_fs_info; } -struct btrfs_subvolume_writers { - struct percpu_counter counter; - wait_queue_head_t wait; -}; - /* * The state of btrfs root */ @@ -1131,8 +1130,9 @@ struct btrfs_root { * root_item_lock. 
*/ int dedupe_in_progress; - struct btrfs_subvolume_writers *subv_writers; - atomic_t will_be_snapshotted; + /* For exclusion of snapshot creation and nocow writes */ + struct btrfs_drew_lock snapshot_lock; + atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ @@ -1149,6 +1149,10 @@ struct btrfs_root { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; struct btrfs_clone_extent_info { @@ -1971,16 +1975,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline unsigned long btrfs_header_fsid(void) -{ - return offsetof(struct btrfs_header, fsid); -} - -static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) -{ - return offsetof(struct btrfs_header, chunk_tree_uuid); -} - static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; @@ -2458,9 +2452,9 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, @@ -2490,13 +2484,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); @@ -2665,9 +2659,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - int update_ref, int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2695,23 +2688,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } -static inline void free_fs_info(struct btrfs_fs_info *fs_info) -{ - kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - 
kfree(fs_info->csum_root); - kfree(fs_info->quota_root); - kfree(fs_info->uuid_root); - kfree(fs_info->free_space_root); - kfree(fs_info->super_copy); - kfree(fs_info->super_for_commit); - kvfree(fs_info); -} - /* tree mod log functions from ctree.c */ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); @@ -2750,9 +2726,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -2859,6 +2833,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, @@ -2996,9 +2976,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3008,6 +2985,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) @@ -3401,6 +3380,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4cdac4d834f5..1245739a3a6e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,108 @@ #include "qgroup.h" #include "block-group.h" +/* + * HOW DOES THIS WORK + * + * There are two stages to data reservations, one for data and one for metadata + * to handle the new extents and checksums generated by writing data. + * + * + * DATA RESERVATION + * The general flow of the data reservation is as follows + * + * -> Reserve + * We call into btrfs_reserve_data_bytes() for the user request bytes that + * they wish to write. We make this reservation and add it to + * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree + * for the range and carry on if this is buffered, or follow up trying to + * make a real allocation if we are pre-allocating or doing O_DIRECT. 
+ * + * -> Use + * At writepages()/prealloc/O_DIRECT time we will call into + * btrfs_reserve_extent() for some part or all of this range of bytes. We + * will make the allocation and subtract space_info->bytes_may_use by the + * original requested length and increase the space_info->bytes_reserved by + * the allocated length. This distinction is important because compression + * may allocate a smaller on disk extent than we previously reserved. + * + * -> Allocation + * finish_ordered_io() will insert the new file extent item for this range, + * and then add a delayed ref update for the extent tree. Once that delayed + * ref is written the extent size is subtracted from + * space_info->bytes_reserved and added to space_info->bytes_used. + * + * Error handling + * + * -> By the reservation maker + * This is the simplest case, we haven't completed our operation and we know + * how much we reserved, we can simply call + * btrfs_free_reserved_data_space*() and it will be removed from + * space_info->bytes_may_use. + * + * -> After the reservation has been made, but before cow_file_range() + * This is specifically for the delalloc case. You must clear + * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will + * be subtracted from space_info->bytes_may_use. + * + * METADATA RESERVATION + * The general metadata reservation lifetimes are discussed elsewhere, this + * will just focus on how it is used for delalloc space. + * + * We keep track of two things on a per inode basis + * + * ->outstanding_extents + * This is the number of file extent items we'll need to handle all of the + * outstanding DELALLOC space we have in this inode. We limit the maximum + * size of an extent, so a large contiguous dirty area may require more than + * one outstanding_extent, which is why count_max_extents() is used to + * determine how many outstanding_extents get added. + * + * ->csum_bytes + * This is essentially how many dirty bytes we have for this inode, so we + * can calculate the number of checksum items we would have to add in order + * to checksum our outstanding data. + * + * We keep a per-inode block_rsv in order to make it easier to keep track of + * our reservation. We use btrfs_calculate_inode_block_rsv_size() to + * calculate the current theoretical maximum reservation we would need for the + * metadata for this inode. We call this and then adjust our reservation as + * necessary, either by attempting to reserve more space, or freeing up excess + * space. + * + * OUTSTANDING_EXTENTS HANDLING + * + * ->outstanding_extents is used for keeping track of how many extents we will + * need to use for this inode, and it will fluctuate depending on where you are + * in the life cycle of the dirty data. Consider the following normal case for + * a completely clean inode, with a num_bytes < our maximum allowed extent size + * + * -> reserve + * ->outstanding_extents += 1 (current value is 1) + * + * -> set_delalloc + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_delalloc_release_extents() + * ->outstanding_extents -= 1 (current value is 1) + * + * We must call this once we are done, as we hold our reservation for the + * duration of our operation, and then assume set_delalloc will update the + * counter appropriately.
+ * + * -> add ordered extent + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_clear_delalloc_extent + * ->outstanding_extents -= 1 (current value is 1) + * + * -> finish_ordered_io/btrfs_remove_ordered_extent + * ->outstanding_extents -= 1 (current value is 0) + * + * Each stage is responsible for their own accounting of the extent, thus + * making error handling and cleanup easier. + */ + int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -228,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); + released = btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index d3e15e1d4a91..bf1595a42a98 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" @@ -595,8 +596,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, rsv, - item->bytes_reserved); + btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } static int btrfs_delayed_inode_reserve_metadata( @@ -677,8 +677,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, rsv = &fs_info->delayed_block_rsv; trace_btrfs_space_reservation(fs_info, "delayed_inode", node->inode_id, node->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, rsv, - node->bytes_reserved); + btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL); if (qgroup_free) btrfs_qgroup_free_meta_prealloc(node->root, node->bytes_reserved); @@ -805,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; + unsigned int nofs_flag; char *ptr; int ret; + nofs_flag = memalloc_nofs_save(); ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, delayed_item->data_len); + memalloc_nofs_restore(nofs_flag); if (ret < 0 && ret != -EEXIST) return ret; @@ -937,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_delayed_item *curr, *prev; + unsigned int nofs_flag; int ret = 0; do_again: @@ -945,7 +948,9 @@ do_again: if (!curr) goto delete_fail; + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto delete_fail; else if (ret > 0) { @@ -1012,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + unsigned int nofs_flag; int mod; int ret; @@ -1024,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, else mod = 1; + nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); + memalloc_nofs_restore(nofs_flag); if (ret > 0) { btrfs_release_path(path); return -ENOENT; 
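(Editor-added illustration, not part of the patch.) The ->outstanding_extents lifecycle documented in the delalloc-space.c comment above can be replayed as a small self-contained C sketch; the helper names are hypothetical stand-ins for the btrfs functions the comment mentions:

/* Toy model of the ->outstanding_extents transitions for one buffered write. */
struct toy_inode { int outstanding_extents; };

static void toy_buffered_write(struct toy_inode *inode)
{
	inode->outstanding_extents += 1;	/* reserve */
	inode->outstanding_extents += 1;	/* set_delalloc */
	inode->outstanding_extents -= 1;	/* btrfs_delalloc_release_extents() */
	/* value stays at 1 while the dirty range waits for writeback */
	inode->outstanding_extents += 1;	/* add ordered extent */
	inode->outstanding_extents -= 1;	/* btrfs_clear_delalloc_extent */
	inode->outstanding_extents -= 1;	/* finish_ordered_io */
	/* back to 0: the steady state for a clean inode */
}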
@@ -1075,7 +1083,10 @@ search: key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; + + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto err_out; ASSERT(ret); @@ -1139,7 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) int ret = 0; bool count = (nr > 0); - if (trans->aborted) + if (TRANS_ABORTED(trans)) return -EIO; path = btrfs_alloc_path(); @@ -1760,6 +1771,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct inode *inode, u32 *rdev) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; @@ -1779,6 +1791,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 74ae226ffaf0..ca96ef007d8f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -70,7 +70,7 @@ struct btrfs_delayed_item { refcount_t refs; int ins_or_del; u32 data_len; - char data[0]; + char data[]; }; static inline void btrfs_init_delayed_root( diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index dfdb7d4f8406..353cc2994d10 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; - released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, - NULL); + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2ca2a09d0e23..db93909b25e0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,46 @@ #include "dev-replace.h" #include "sysfs.h" +/* + * Device replace overview + * + * [Objective] + * To copy all extents (both new and on-disk) from source device to target + * device, while still keeping the filesystem read-write. + * + * [Method] + * There are two main methods involved: + * + * - Write duplication + * + * All new writes will be written to both target and source devices, so even + * if replace gets canceled, the source device still contains up-to-date data. + * + * Location: handle_ops_on_dev_replace() from __btrfs_map_block() + * Start: btrfs_dev_replace_start() + * End: btrfs_dev_replace_finishing() + * Content: Latest data/metadata + * + * - Copy existing extents + * + * This happens by re-using scrub facility, as scrub also iterates through + * existing extents from commit root. + * + * Location: scrub_write_block_to_dev_replace() from + * scrub_block_complete() + * Content: Data/meta from commit root. + * + * Due to the content difference, we need to avoid nocow write when dev-replace + * is happening. This is done by marking the block group read-only and waiting + * for NOCOW writes. 
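 *
 * (Editor-added sketch, illustrative only: conceptually the write duplication
 *  performed by handle_ops_on_dev_replace() amounts to
 *
 *	if (dev_replace is ongoing && a stripe maps to the source device)
 *		add one more stripe that points at the target device;
 *
 *  so every write issued while the replace runs lands on both copies. The
 *  condition above is pseudocode, not the real mapping logic.)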
+ * + * After replace is done, the finishing part is done by swapping the target and + * source devices. + * + * Location: btrfs_dev_replace_update_device_in_mapping_tree() from + * btrfs_dev_replace_finishing() + */ + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); static void btrfs_dev_replace_update_device_in_mapping_tree( @@ -472,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -703,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); btrfs_sysfs_update_devid(tgt_device); btrfs_rm_dev_replace_free_srcdev(src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6c9a6a8e6c8..a6cb5cbbdb9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -7,7 +7,6 @@ #include <linux/blkdev.h> #include <linux/radix-tree.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/slab.h> @@ -42,6 +41,7 @@ #include "ref-verify.h" #include "block-group.h" #include "discard.h" +#include "space-info.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void) kmem_cache_destroy(btrfs_end_io_wq_cache); } +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + if (fs_info->csum_shash) + crypto_free_shash(fs_info->csum_shash); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. They checksum file and metadata bios @@ -247,47 +253,27 @@ out: /* * Compute the csum of a btree block and store the result to provided buffer. - * - * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct extent_buffer *buf, u8 *result) +static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; + const int num_pages = fs_info->nodesize >> PAGE_SHIFT; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - unsigned long len; - unsigned long cur_len; - unsigned long offset = BTRFS_CSUM_SIZE; char *kaddr; - unsigned long map_start; - unsigned long map_len; - int err; + int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); + kaddr = page_address(buf->pages[0]); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - len = buf->len - offset; - - while (len > 0) { - /* - * Note: we don't need to check for the err == 1 case here, as - * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' - * and 'min_len = 32' and the currently implemented mapping - * algorithm we cannot cross a page boundary. 
- */ - err = map_private_extent_buffer(buf, offset, 32, - &kaddr, &map_start, &map_len); - if (WARN_ON(err)) - return err; - cur_len = min(len, map_len - (offset - map_start)); - crypto_shash_update(shash, kaddr + offset - map_start, cur_len); - len -= cur_len; - offset += cur_len; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(buf->pages[i]); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); - - return 0; } /* @@ -535,10 +521,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) return -EUCLEAN; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); - if (csum_tree_block(eb, result)) - return -EINVAL; + csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); @@ -565,7 +551,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); while (fs_devices) { u8 *metadata_uuid; @@ -596,9 +583,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 found_start; int found_level; struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_fs_info *fs_info = root->fs_info; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + struct btrfs_fs_info *fs_info; + u16 csum_size; int ret = 0; u8 result[BTRFS_CSUM_SIZE]; int reads_done; @@ -607,6 +593,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto out; eb = (struct extent_buffer *)page->private; + fs_info = eb->fs_info; + csum_size = btrfs_super_csum_size(fs_info->super_copy); /* the pending IO might have been the only thing that kept this buffer * in memory. 
Make sure we have a ref for all this other checks @@ -647,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(eb, result); - if (ret) - goto err; + csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { u32 val; @@ -972,9 +958,7 @@ static int btree_writepages(struct address_space *mapping, static int btree_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent, 0); + return extent_read_full_page(page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -1100,36 +1084,11 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) } } -static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) -{ - struct btrfs_subvolume_writers *writers; - int ret; - - writers = kmalloc(sizeof(*writers), GFP_NOFS); - if (!writers) - return ERR_PTR(-ENOMEM); - - ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); - if (ret < 0) { - kfree(writers); - return ERR_PTR(ret); - } - - init_waitqueue_head(&writers->wait); - return writers; -} - -static void -btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) -{ - percpu_counter_destroy(&writers->counter); - kfree(writers); -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + root->fs_info = fs_info; root->node = NULL; root->commit_root = NULL; root->state = 0; @@ -1173,7 +1132,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; @@ -1195,14 +1153,20 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&root->leak_list); + spin_lock(&fs_info->fs_roots_radix_lock); + list_add_tail(&root->leak_list, &fs_info->allocated_roots); + spin_unlock(&fs_info->fs_roots_radix_lock); +#endif } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - gfp_t flags) + u64 objectid, gfp_t flags) { struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) - root->fs_info = fs_info; + __setup_root(root, fs_info, objectid); return root; } @@ -1215,12 +1179,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) if (!fs_info) return ERR_PTR(-EINVAL); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1237,19 +1200,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; unsigned int nofs_flag; int ret = 0; - uuid_le uuid = NULL_UUID_LE; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. 
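 *
 * (Editor-added note, not part of the patch: the same NOFS pattern is applied
 *  to the delayed-inode paths earlier in this diff. In sketch form it is
 *
 *	unsigned int nofs_flag = memalloc_nofs_save();
 *	ret = allocation_that_may_reclaim();
 *	memalloc_nofs_restore(nofs_flag);
 *
 *  which prevents direct reclaim from re-entering the filesystem while a
 *  transaction handle is held; allocation_that_may_reclaim() is a
 *  hypothetical stand-in for the allocating call being protected.)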
*/ nofs_flag = memalloc_nofs_save(); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; @@ -1277,8 +1238,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) - uuid_le_gen(&uuid); - memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root->root_item.uuid); + else + export_guid(root->root_item.uuid, &guid_null); root->root_item.drop_level = 0; key.objectid = objectid; @@ -1293,12 +1255,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; fail: - if (leaf) { + if (leaf) btrfs_tree_unlock(leaf); - free_extent_buffer(root->commit_root); - free_extent_buffer(leaf); - } - kfree(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1309,12 +1268,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID); - root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; @@ -1331,7 +1288,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - kfree(root); + btrfs_put_root(root); return ERR_CAST(leaf); } @@ -1387,8 +1344,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; @@ -1401,14 +1358,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (!path) return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); if (!root) { ret = -ENOMEM; goto alloc_fail; } - __setup_root(root, fs_info, key->objectid); - ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { @@ -1424,10 +1379,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); + root->node = NULL; goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - free_extent_buffer(root->node); goto find_fail; } root->commit_root = btrfs_root_node(root); @@ -1436,33 +1391,16 @@ out: return root; find_fail: - kfree(root); + btrfs_put_root(root); alloc_fail: root = ERR_PTR(ret); goto out; } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, - struct btrfs_key *location) -{ - struct btrfs_root *root; - - root = btrfs_read_tree_root(tree_root, location); - if (IS_ERR(root)) - return root; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - set_bit(BTRFS_ROOT_REF_COWS, &root->state); - 
btrfs_check_and_init_root_item(&root->root_item); - } - - return root; -} - -int btrfs_init_fs_root(struct btrfs_root *root) +static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; - struct btrfs_subvolume_writers *writers; + unsigned int nofs_flag; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1472,12 +1410,20 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } - writers = btrfs_alloc_subvolume_writers(); - if (IS_ERR(writers)) { - ret = PTR_ERR(writers); + /* + * We might be called under a transaction (e.g. indirect backref + * resolution) which could deadlock if it triggers memory reclaim + */ + nofs_flag = memalloc_nofs_save(); + ret = btrfs_drew_lock_init(&root->snapshot_lock); + memalloc_nofs_restore(nofs_flag); + if (ret) goto fail; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + set_bit(BTRFS_ROOT_REF_COWS, &root->state); + btrfs_check_and_init_root_item(&root->root_item); } - root->subv_writers = writers; btrfs_init_free_ino_ctl(root); spin_lock_init(&root->ino_cache_lock); @@ -1505,14 +1451,16 @@ fail: return ret; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); + if (root) + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1530,14 +1478,62 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { + btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_root *root; + + while (!list_empty(&fs_info->allocated_roots)) { + root = list_first_entry(&fs_info->allocated_roots, + struct btrfs_root, leak_list); + btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", + root->root_key.objectid, root->root_key.offset, + refcount_read(&root->refs)); + while (refcount_read(&root->refs) > 1) + btrfs_put_root(root); + btrfs_put_root(root); + } +#endif +} + +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); + btrfs_free_csum_hash(fs_info); + btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + btrfs_put_root(fs_info->extent_root); + btrfs_put_root(fs_info->tree_root); + btrfs_put_root(fs_info->chunk_root); + btrfs_put_root(fs_info->dev_root); + btrfs_put_root(fs_info->csum_root); + btrfs_put_root(fs_info->quota_root); + btrfs_put_root(fs_info->uuid_root); + btrfs_put_root(fs_info->free_space_root); + btrfs_put_root(fs_info->fs_root); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); +} + + struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, bool 
check_ref) @@ -1548,33 +1544,35 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; + return btrfs_grab_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; + return btrfs_grab_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return fs_info->chunk_root; + return btrfs_grab_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return fs_info->dev_root; + return btrfs_grab_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return fs_info->csum_root; + return btrfs_grab_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return fs_info->quota_root ? fs_info->quota_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return fs_info->uuid_root ? fs_info->uuid_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return fs_info->free_space_root ? fs_info->free_space_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (check_ref && btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_root(root); return ERR_PTR(-ENOENT); + } return root; } - root = btrfs_read_fs_root(fs_info->tree_root, location); + root = btrfs_read_tree_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; @@ -1605,15 +1603,14 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - if (ret == -EEXIST) { - btrfs_free_fs_root(root); + btrfs_put_root(root); + if (ret == -EEXIST) goto again; - } goto fail; } return root; fail: - btrfs_free_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1985,11 +1982,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); + free_root_extent_buffers(info->fs_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } +void btrfs_put_root(struct btrfs_root *root) +{ + if (!root) + return; + + if (refcount_dec_and_test(&root->refs)) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + btrfs_drew_lock_destroy(&root->snapshot_lock); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif + kfree(root); + } +} + void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; @@ -2001,13 +2022,9 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - 
free_extent_buffer(gang[0]->commit_root); - btrfs_put_fs_root(gang[0]); - } + btrfs_put_root(gang[0]); } while (1) { @@ -2020,10 +2037,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log_root_tree(NULL, fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); - } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -2069,7 +2084,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = fs_info->tree_root; + BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); @@ -2189,11 +2204,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) return 0; } -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - crypto_free_shash(fs_info->csum_shash); -} - static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2208,24 +2218,23 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -EIO; } - log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); + log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, + GFP_KERNEL); if (!log_tree_root) return -ENOMEM; - __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - log_tree_root->node = read_tree_block(fs_info, bytenr, fs_info->generation + 1, level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - kfree(log_tree_root); + log_tree_root->node = NULL; + btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2233,8 +2242,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } @@ -2624,67 +2632,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - u32 sectorsize; - u32 nodesize; - u32 stripesize; - u64 generation; - u64 features; - u16 csum_type; - struct btrfs_key location; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - int ret; - int err = -EINVAL; - int clear_free_space_tree = 0; - int level; - - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); - if (!tree_root || !chunk_root) { - err = -ENOMEM; - goto fail; - } - - ret = init_srcu_struct(&fs_info->subvol_srcu); - if (ret) { - err = ret; - goto fail; - } - - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = 
percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_dio_bytes; - } - fs_info->dirty_metadata_batch = PAGE_SIZE * - (1 + ilog2(nr_cpu_ids)); - - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_dirty_metadata_bytes; - } - - ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, - GFP_KERNEL); - if (ret) { - err = ret; - goto fail_delalloc_bytes; - } - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2711,6 +2660,11 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&fs_info->allocated_roots); + INIT_LIST_HEAD(&fs_info->allocated_ebs); + spin_lock_init(&fs_info->eb_leak_lock); +#endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); @@ -2727,7 +2681,6 @@ int __cold open_ctree(struct super_block *sb, atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2746,21 +2699,6 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) { - err = -ENOMEM; - goto fail_iput; - } - btrfs_init_delayed_root(fs_info->delayed_root); - btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2768,20 +2706,12 @@ int __cold open_ctree(struct super_block *sb, btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; - sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - - btrfs_init_btree_inode(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - fs_info->pinned_extents = &fs_info->freed_extents[0]; + extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS, NULL); set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); @@ -2817,23 +2747,135 @@ int __cold open_ctree(struct super_block *sb, fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + int ret; + + fs_info->sb = sb; + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); + + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + fs_info->dirty_metadata_batch = 
PAGE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); + if (ret) + return ret; - ret = btrfs_alloc_stripe_hash_table(fs_info); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_KERNEL); + if (!fs_info->delayed_root) + return -ENOMEM; + btrfs_init_delayed_root(fs_info->delayed_root); + + return btrfs_alloc_stripe_hash_table(fs_info); +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info); + if (ret < 0) { + if (ret != -EINTR) + btrfs_warn(fs_info, "iterating uuid_tree failed %d", + ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + u16 csum_type; + struct btrfs_key location; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int err = -EINVAL; + int clear_free_space_tree = 0; + int level; + + ret = init_mount_fs_info(fs_info, sb); if (ret) { err = ret; - goto fail_alloc; + goto fail; } - __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + /* These need to be init'ed before we start creating inodes and such. 
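 *
 * (Editor-added aside, illustrative only: the per-fs statistics set up in
 *  init_mount_fs_info() above use the generic percpu_counter API, which in
 *  minimal form looks like
 *
 *	struct percpu_counter c;
 *	s64 total;
 *
 *	percpu_counter_init(&c, 0, GFP_KERNEL);
 *	percpu_counter_add(&c, nbytes);
 *	total = percpu_counter_sum(&c);
 *	percpu_counter_destroy(&c);
 *
 *  here the destroy step is deferred to btrfs_free_fs_info(), so a failure
 *  partway through init_mount_fs_info() can simply return.)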
*/ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { + err = -ENOMEM; + goto fail; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + btrfs_init_btree_inode(fs_info); invalidate_bdev(fs_devices->latest_bdev); /* * Read super block and check the signature bytes only */ - bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); + disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + if (IS_ERR(disk_super)) { + err = PTR_ERR(disk_super); goto fail_alloc; } @@ -2841,18 +2883,19 @@ int __cold open_ctree(struct super_block *sb, * Verify the type first, if that or the the checksum value are * corrupted, we'll find out */ - csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + csum_type = btrfs_super_csum_type(disk_super); if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2860,11 +2903,11 @@ int __cold open_ctree(struct super_block *sb, * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, bh->b_data)) { + if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; - brelse(bh); - goto fail_csum; + btrfs_release_disk_super(disk_super); + goto fail_alloc; } /* @@ -2872,8 +2915,8 @@ int __cold open_ctree(struct super_block *sb, * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ - memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - brelse(bh); + memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); + btrfs_release_disk_super(disk_super); disk_super = fs_info->super_copy; @@ -2901,11 +2944,11 @@ int __cold open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } if (!btrfs_super_root(disk_super)) - goto fail_csum; + goto fail_alloc; /* check FS state, whether FS is broken. 
*/ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2920,7 +2963,7 @@ int __cold open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -2930,7 +2973,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); @@ -2974,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_csum; + goto fail_alloc; } /* @@ -2990,7 +3033,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3021,8 +3064,6 @@ int __cold open_ctree(struct super_block *sb, generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); - chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), generation, level, NULL); @@ -3038,7 +3079,8 @@ int __cold open_ctree(struct super_block *sb, chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { @@ -3061,6 +3103,18 @@ int __cold open_ctree(struct super_block *sb, if (ret) goto fail_tree_roots; + /* + * If we have a uuid root and we're not being told to rescan we need to + * check the generation here so we can set the + * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the + * transaction during a balance or the log replay without updating the + * uuid generation, and then if we crash we would rescan the uuid tree, + * even though it was perfectly fine. 
+ */ + if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && + fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, @@ -3196,7 +3250,7 @@ int __cold open_ctree(struct super_block *sb, location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; - fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); @@ -3285,8 +3339,6 @@ int __cold open_ctree(struct super_block *sb, close_ctree(fs_info); return ret; } - } else { - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); } set_bit(BTRFS_FS_OPEN, &fs_info->flags); @@ -3329,90 +3381,78 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); -fail_csum: - btrfs_free_csum_hash(fs_info); fail_alloc: -fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_bio_counter: - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -fail_delalloc_bytes: - percpu_counter_destroy(&fs_info->delalloc_bytes); -fail_dirty_metadata_bytes: - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_dio_bytes: - percpu_counter_destroy(&fs_info->dio_bytes); -fail_srcu: - cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +static void btrfs_end_super_write(struct bio *bio) { - if (uptodate) { - set_buffer_uptodate(bh); - } else { - struct btrfs_device *device = (struct btrfs_device *) - bh->b_private; - - btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s", - rcu_str_deref(device->name)); - /* note, we don't set_buffer_write_io_error because we have - * our own ways of dealing with the IO errors - */ - clear_buffer_uptodate(bh); - btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); + struct btrfs_device *device = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + struct page *page; + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + + if (bio->bi_status) { + btrfs_warn_rl_in_rcu(device->fs_info, + "lost page write due to IO error on %s (%d)", + rcu_str_deref(device->name), + blk_status_to_errno(bio->bi_status)); + ClearPageUptodate(page); + SetPageError(page); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_WRITE_ERRS); + } else { + SetPageUptodate(page); + } + + put_page(page); + unlock_page(page); } - unlock_buffer(bh); - put_bh(bh); + + bio_put(bio); } -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret) +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num) { - struct buffer_head *bh; struct btrfs_super_block *super; + struct page *page; u64 bytenr; + struct address_space *mapping = bdev->bd_inode->i_mapping; bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); - bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); - /* - * If we fail to read from the underlying devices, as of now - * the best option 
we have is to mark it EIO. - */ - if (!bh) - return -EIO; + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return ERR_CAST(page); - super = (struct btrfs_super_block *)bh->b_data; + super = page_address(page); if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - return -EINVAL; + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); } - *bh_ret = bh; - return 0; + return super; } -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) { - struct buffer_head *bh; - struct buffer_head *latest = NULL; - struct btrfs_super_block *super; + struct btrfs_super_block *super, *latest = NULL; int i; u64 transid = 0; - int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3420,48 +3460,41 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - ret = btrfs_read_dev_one_super(bdev, i, &bh); - if (ret) + super = btrfs_read_dev_one_super(bdev, i); + if (IS_ERR(super)) continue; - super = (struct btrfs_super_block *)bh->b_data; - if (!latest || btrfs_super_generation(super) > transid) { - brelse(latest); - latest = bh; + if (latest) + btrfs_release_disk_super(super); + + latest = super; transid = btrfs_super_generation(super); - } else { - brelse(bh); } } - if (!latest) - return ERR_PTR(ret); - - return latest; + return super; } /* * Write superblock @sb to the @device. Do not wait for completion, all the - * buffer heads we write are pinned. + * pages we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when buffer head is not found or submission fails. + * Return number of errors when page is not found or submission fails. 
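 *
 * (Editor-added sketch, illustrative only: the submission loop below follows
 *  the usual raw-bio pattern,
 *
 *	bio = bio_alloc(GFP_NOFS, 1);
 *	bio_set_dev(bio, device->bdev);
 *	bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
 *	bio->bi_end_io = end_io_fn;
 *	__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, offset_in_page(bytenr));
 *	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 *	submit_bio(bio);
 *
 *  plus an extra get_page() per copy so wait_dev_supers() can drop both the
 *  writer's and the waiter's references once the IO completes; end_io_fn is a
 *  stand-in for btrfs_end_super_write().)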
*/ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; + struct address_space *mapping = device->bdev->bd_inode->i_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct buffer_head *bh; int i; - int ret; int errors = 0; u64 bytenr; - int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3469,6 +3502,10 @@ static int write_dev_supers(struct btrfs_device *device, shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { + struct page *page; + struct bio *bio; + struct btrfs_super_block *disk_super; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) @@ -3481,37 +3518,45 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); crypto_shash_final(shash, sb->csum); - /* One reference for us, and we leave it for the caller */ - bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, + GFP_NOFS); + if (!page) { btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", + "couldn't get super block page for bytenr %llu", bytenr); errors++; continue; } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + /* Bump the refcount for wait_dev_supers() */ + get_page(page); - /* one reference for submit_bh */ - get_bh(bh); + disk_super = page_address(page); + memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; + /* + * Directly use bios here instead of relying on the page cache + * to do I/O, so we don't lose the ability to do integrity + * checking. + */ + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, device->bdev); + bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; + bio->bi_private = device; + bio->bi_end_io = btrfs_end_super_write; + __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, + offset_in_page(bytenr)); /* - * we fua the first super. The others we allow - * to go down lazy. + * We FUA only the first super block. The others we allow to + * go down lazy and there's a short window where the on-disk + * copies might still contain the older version. */ - op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) - op_flags |= REQ_FUA; - ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); - if (ret) - errors++; + bio->bi_opf |= REQ_FUA; + + btrfsic_submit_bio(bio); } return errors < i ? 0 : -1; } @@ -3520,12 +3565,11 @@ static int write_dev_supers(struct btrfs_device *device, * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when buffer head is not found or not marked up to + * Return number of errors when page is not found or not marked up to * date. 
*/ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { - struct buffer_head *bh; int i; int errors = 0; bool primary_failed = false; @@ -3535,32 +3579,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { + struct page *page; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - bh = __find_get_block(device->bdev, - bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_get_page(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + if (!page) { errors++; if (i == 0) primary_failed = true; continue; } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + /* Page is submitted locked and unlocked once the IO completes */ + wait_on_page_locked(page); + if (PageError(page)) { errors++; if (i == 0) primary_failed = true; } - /* drop our reference */ - brelse(bh); + /* Drop our reference */ + put_page(page); - /* drop the reference from the writing run */ - brelse(bh); + /* Drop the reference from the writing run */ + put_page(page); } /* log error, force error return */ @@ -3832,20 +3878,19 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + bool drop_ref = false; + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (btrfs_root_refs(&root->root_item) == 0) - synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { - free_extent_buffer(root->reloc_root->node); - free_extent_buffer(root->reloc_root->commit_root); - btrfs_put_fs_root(root->reloc_root); + btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } } @@ -3854,22 +3899,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); - btrfs_free_fs_root(root); -} - -void btrfs_free_fs_root(struct btrfs_root *root) -{ - iput(root->ino_cache_inode); - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_dev) - free_anon_bdev(root->anon_dev); - if (root->subv_writers) - btrfs_free_subvolume_writers(root->subv_writers); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); - btrfs_put_fs_root(root); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } + if (drop_ref) + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3879,15 +3914,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int i = 0; int err = 0; unsigned int ret = 0; - int index; while (1) { - index = srcu_read_lock(&fs_info->subvol_srcu); + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!ret) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); break; } root_objectid = gang[ret - 1]->root_key.objectid + 1; @@ -3899,9 +3933,9 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) continue; } /* 
grab all the search result for later use */ - gang[i] = btrfs_grab_fs_root(gang[i]); + gang[i] = btrfs_grab_root(gang[i]); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); for (i = 0; i < ret; i++) { if (!gang[i]) @@ -3910,7 +3944,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) err = btrfs_orphan_cleanup(gang[i]); if (err) break; - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } root_objectid++; } @@ -3918,7 +3952,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } return err; } @@ -3990,6 +4024,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_delete_unused_bgs(fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. + */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); @@ -4020,8 +4067,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); - btrfs_free_fs_roots(fs_info); - btrfs_put_block_group_cache(fs_info); /* @@ -4033,6 +4078,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + btrfs_free_fs_roots(fs_info); /* * We must free the block groups after dropping the fs_roots as we could @@ -4052,16 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); - percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); - cleanup_srcu_struct(&fs_info->subvol_srcu); - - btrfs_free_csum_hash(fs_info); - btrfs_free_stripe_hash_table(fs_info); - btrfs_free_ref_cache(fs_info); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4235,7 +4271,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - btrfs_info(fs_info, "delayed_refs has NO entry"); + btrfs_debug(fs_info, "delayed_refs has NO entry"); return ret; } @@ -4269,9 +4305,30 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + if (pin_bytes) { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, head->num_bytes); + cache->reserved -= head->num_bytes; + 
cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + + btrfs_put_block_group(cache); + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); @@ -4327,12 +4384,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->delalloc_root_lock); } @@ -4373,16 +4430,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, } static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents) + struct extent_io_tree *unpin) { - struct extent_io_tree *unpin; u64 start; u64 end; int ret; - bool loop = true; - unpin = pinned_extents; -again: while (1) { struct extent_state *cached_state = NULL; @@ -4407,15 +4460,6 @@ again: cond_resched(); } - if (loop) { - if (unpin == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - loop = false; - goto again; - } - return 0; } @@ -4506,8 +4550,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); - btrfs_destroy_pinned_extent(fs_info, - fs_info->pinned_extents); + btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); @@ -4559,7 +4602,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); btrfs_destroy_all_delalloc_inodes(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8c2d6cf1ce59..cd629113f61c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -54,15 +56,12 @@ int __cold open_ctree(struct super_block *sb, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret); +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num); int btrfs_commit_super(struct btrfs_fs_info *fs_info); -struct btrfs_root 
*btrfs_read_fs_root(struct btrfs_root *tree_root, - struct btrfs_key *location); -int btrfs_init_fs_root(struct btrfs_root *root); -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id); +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -70,19 +69,13 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, bool check_ref); -static inline struct btrfs_root * -btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) -{ - return btrfs_get_fs_root(fs_info, location, true); -} +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); @@ -95,19 +88,16 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); * If you want to ensure the whole tree is safe, you should use * fs_info->subvol_srcu */ -static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { + if (!root) + return NULL; if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } -static inline void btrfs_put_fs_root(struct btrfs_root *root) -{ - if (refcount_dec_and_test(&root->refs)) - kfree(root); -} - +void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 72e312cae69d..2bb25d2dc44b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,16 +57,14 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; - int index; - int err = 0; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -75,25 +73,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } + root = btrfs_get_fs_root(fs_info, &key, true); + if (IS_ERR(root)) + return ERR_CAST(root); key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto fail; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(root); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (check_generation && generation != 
inode->i_generation) { iput(inode); @@ -101,9 +92,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } return d_obtain_alias(inode); -fail: - srcu_read_unlock(&fs_info->subvol_srcu, index); - return ERR_PTR(err); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -152,7 +140,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = d_inode(child); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 57488ecd7d4e..f32f4113c976 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -18,4 +18,9 @@ struct btrfs_fid { u64 parent_root_objectid; } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); + #endif diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a3febe746c79..b4a7bad3e82e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -36,13 +36,14 @@ struct io_failure_record; #define CHUNK_TRIMMED EXTENT_DEFRAG enum { - IO_TREE_FS_INFO_FREED_EXTENTS0, - IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_FS_PINNED_EXTENTS, + IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_INODE_FILE_EXTENT, IO_TREE_SELFTEST, }; @@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits); +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7bc66121330..54a64d1e18c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - set_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + set_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); return 0; } @@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) start = cache->start; end = start + cache->length - 1; - clear_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + clear_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); } static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) @@ -1193,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < 
BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, @@ -1469,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, @@ -1494,11 +1471,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + } if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1583,7 +1566,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int err = 0; int metadata = !extent_op->is_data; - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1604,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { @@ -1703,10 +1685,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) { + if (TRANS_ABORTED(trans)) { if (insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return 0; } @@ -1721,8 +1702,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return ret; } @@ -1867,8 +1847,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, fs_info->csum_root, head->bytenr, head->num_bytes); @@ -2191,7 +2170,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2238,7 +2217,7 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; @@ -2254,7 +2233,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? 
true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -2588,7 +2567,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2607,22 +2587,20 @@ static int pin_down_extent(struct btrfs_block_group *cache, percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(fs_info->pinned_extents, bytenr, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; - ASSERT(fs_info->running_transaction); - - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -2631,13 +2609,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(fs_info, bytenr); + btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); + + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2649,7 +2629,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, */ btrfs_cache_block_group(cache, 1); - pin_down_extent(cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -2763,11 +2743,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) } } - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->commit_root_sem); btrfs_update_global_block_rsv(fs_info); @@ -2908,12 +2883,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 end; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; + unpin = &trans->transaction->pinned_extents; - while (!trans->aborted) { + while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2923,6 +2895,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + clear_extent_bits(&fs_info->excluded_extents, start, + end, EXTENT_UPTODATE); if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = 
btrfs_discard_extent(fs_info, start, @@ -2950,7 +2925,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 trimmed = 0; ret = -EROFS; - if (!trans->aborted) + if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, @@ -3000,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -3301,7 +3275,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); + pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -3345,7 +3319,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) (ref->type == BTRFS_REF_DATA && ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3438,6 +3412,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, +}; + /* * Structure used internally for find_free_extent() function. Wraps needed * parameters. @@ -3454,6 +3432,8 @@ struct find_free_extent_ctl { /* For clustered allocation */ u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; bool have_caching_bg; bool orig_have_caching_bg; @@ -3489,6 +3469,12 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; }; @@ -3501,11 +3487,11 @@ struct find_free_extent_ctl { * Return 0 means we have found a location and set ffe_ctl->found_offset. 
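As an aside to the hunks above: extent pinning moves from the global fs_info state (the old freed_extents[0]/[1] pair that was swapped at commit time) to the per-transaction tree at trans->transaction->pinned_extents, so commit and cleanup walk exactly the ranges their own transaction pinned. A minimal userspace sketch of that ownership model, with invented names and a plain linked list standing in for the extent io tree (not the btrfs implementation):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins: each transaction owns the ranges it pinned. */
struct pinned_range {
    unsigned long long start, len;
    struct pinned_range *next;
};

struct transaction {
    struct pinned_range *pinned;
};

static void pin_range(struct transaction *trans,
                      unsigned long long start, unsigned long long len)
{
    struct pinned_range *r = malloc(sizeof(*r));

    if (!r)
        exit(1);
    r->start = start;
    r->len = len;
    r->next = trans->pinned;
    trans->pinned = r;
}

/* Commit (or error cleanup) unpins only this transaction's ranges. */
static void unpin_all(struct transaction *trans)
{
    while (trans->pinned) {
        struct pinned_range *r = trans->pinned;

        trans->pinned = r->next;
        printf("unpin [%llu, %llu)\n", r->start, r->start + r->len);
        free(r);
    }
}

int main(void)
{
    struct transaction trans = { .pinned = NULL };

    pin_range(&trans, 1ULL << 20, 4096);
    pin_range(&trans, 2ULL << 20, 8192);
    unpin_all(&trans);
    return 0;
}

The design point is that nothing has to guess, or swap, which global tree currently holds the pinned ranges of the transaction being committed.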
*/ static int find_free_extent_clustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group **cluster_bg_ret) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; @@ -3605,9 +3591,9 @@ refill_cluster: * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* @@ -3663,16 +3649,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, return 0; } +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); + if (ret >= 0 || ret == -EAGAIN) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl); +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + default: + BUG(); + } +} + +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + /* + * If we can't allocate a new chunk we've already looped through + * at least once, move on to the NO_EMPTY_SIZE case. + */ + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return 0; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. 
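Another aside: the refactor above moves the clustered-allocation state (last_ptr, use_cluster, hint_byte) into struct find_free_extent_ctl and makes each step of find_free_extent() (do_allocation(), release_block_group(), found_extent(), chunk_allocation_failed()) dispatch on ffe_ctl->policy, leaving room for additional allocation policies later. A minimal sketch of that dispatch pattern, using hypothetical names rather than the real btrfs types:

#include <stdio.h>

enum alloc_policy { ALLOC_CLUSTERED };

/* Invented control structure mirroring the "policy plus state" idea. */
struct alloc_ctl {
    enum alloc_policy policy;
    unsigned long long num_bytes;
};

static int do_allocation_clustered(const struct alloc_ctl *ctl)
{
    printf("clustered allocation of %llu bytes\n", ctl->num_bytes);
    return 0;
}

/* Every step switches on the policy carried by the control structure. */
static int do_allocation(const struct alloc_ctl *ctl)
{
    switch (ctl->policy) {
    case ALLOC_CLUSTERED:
        return do_allocation_clustered(ctl);
    }
    return -1;
}

int main(void)
{
    struct alloc_ctl ctl = { .policy = ALLOC_CLUSTERED, .num_bytes = 4096 };

    return do_allocation(&ctl);
}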
*/ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, - int full_search, bool use_cluster) + bool full_search) { struct btrfs_root *root = fs_info->extent_root; int ret; @@ -3689,11 +3760,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 1; if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } + found_extent(ffe_ctl, ins); return 0; } @@ -3739,16 +3806,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE); - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - /* Do not bail out on ENOSPC since we can do more. */ - if (ret < 0 && ret != -ENOSPC) + if (ret == -ENOSPC) + ret = chunk_allocation_failed(ffe_ctl); + else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; @@ -3759,6 +3820,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + /* * Don't loop again if we already have no empty_size and * no empty_cluster. @@ -3774,6 +3838,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (space_info->max_extent_size) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + ffe_ctl->num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + ffe_ctl->use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl->empty_cluster); + if (ffe_ctl->last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + ffe_ctl->hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. 
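prepare_allocation_clustered() above also keeps the existing fragmentation fast path: when the largest extent recorded for the space_info is smaller than the request, it returns -ENOSPC right away and reports that size through ins->offset, so the caller can retry with a smaller allocation instead of scanning every block group. A rough standalone sketch of the same idea (names and return codes invented):

#include <stdio.h>

static int prepare_allocation(unsigned long long request,
                              unsigned long long max_extent_size,
                              unsigned long long *suggested)
{
    if (max_extent_size && request > max_extent_size) {
        *suggested = max_extent_size;   /* hint for a smaller retry */
        return -1;                      /* stand-in for -ENOSPC */
    }
    return 0;
}

int main(void)
{
    unsigned long long suggested = 0;

    if (prepare_allocation(1ULL << 20, 64ULL << 10, &suggested))
        printf("no 1 MiB extent available, largest free chunk ~%llu bytes\n",
               suggested);
    return 0;
}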
+ */ + ffe_ctl->hint_byte = last_ptr->window_start; + ffe_ctl->use_cluster = false; + } + spin_unlock(&last_ptr->lock); + } + + return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + default: + BUG(); + } +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -3801,16 +3930,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, + u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { int ret = 0; int cache_block_group_error = 0; - struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - bool use_cluster = true; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); @@ -3819,13 +3946,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; ffe_ctl.delalloc = delalloc; ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); ffe_ctl.have_caching_bg = false; ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; + ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + + /* For clustered allocation */ + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.last_ptr = NULL; + ffe_ctl.use_cluster = true; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; @@ -3839,51 +3972,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - /* - * If our free space is heavily fragmented we may not be able to make - * big contiguous allocations, so instead of doing the expensive search - * for free space, simply return ENOSPC with our max_extent_size so we - * can go ahead and search for a more manageable chunk. - * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. - */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - use_cluster = false; - } - spin_unlock(&space_info->lock); - } - - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. 
- */ - hint_byte = last_ptr->window_start; - use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + if (ret < 0) + return ret; ffe_ctl.search_start = max(ffe_ctl.search_start, first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { + ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); + if (ffe_ctl.search_start == ffe_ctl.hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl.search_start); /* @@ -3924,6 +4020,8 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl.index], list) { + struct btrfs_block_group *bg_ret; + /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; @@ -3984,39 +4082,20 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr && use_cluster) { - struct btrfs_block_group *cluster_bg = NULL; - - ret = find_free_extent_clustered(block_group, last_ptr, - &ffe_ctl, &cluster_bg); - - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; + bg_ret = NULL; + ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + if (ret == 0) { + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, delalloc); + block_group = bg_ret; } - /* ret == -ENOENT case falls through */ - } - - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); - if (ret == -EAGAIN) + } else if (ret == -EAGAIN) { goto have_block_group; - else if (ret > 0) + } else if (ret > 0) { goto loop; - /* ret == 0 case falls through */ -checks: + } + + /* Checks */ ffe_ctl.search_start = round_up(ffe_ctl.found_offset, fs_info->stripesize); @@ -4050,17 +4129,12 @@ checks: btrfs_release_block_group(block_group, delalloc); break; loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); + release_block_group(block_group, &ffe_ctl, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); + ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); if (ret > 0) goto search; @@ -4189,18 +4263,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, start); if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); + btrfs_err(trans->fs_info, "unable to find block group for %llu", + start); return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + ret = pin_down_extent(trans, cache, start, len, 1); btrfs_put_block_group(cache); return ret; } @@ -4431,7 +4507,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = 
alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); if (ret) - btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } @@ -4750,8 +4826,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb->start, - eb->len, flag, + ret = btrfs_set_disk_extent_flags(trans, eb, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; @@ -5209,9 +5284,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -5250,9 +5323,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - if (block_rsv) - trans->block_rsv = block_rsv; - /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being @@ -5380,8 +5450,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, err = PTR_ERR(trans); goto out_free; } - if (block_rsv) - trans->block_rsv = block_rsv; } } btrfs_release_path(path); @@ -5413,13 +5481,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - btrfs_put_fs_root(root); - } + else + btrfs_put_root(root); root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans); @@ -5749,47 +5814,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return bg_ret; return dev_ret; } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ - percpu_counter_dec(&root->subv_writers->counter); - cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ - if (atomic_read(&root->will_be_snapshotted)) - return 0; - - percpu_counter_inc(&root->subv_writers->counter); - /* - * Make sure counter is updated before we check for snapshot creation. 
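The btrfs_{start,end}_write_no_snapshotting() helpers being removed above relied on a classic publish-then-recheck scheme: bump a counter, issue a full memory barrier, then re-read the snapshot flag, so that either the snapshot side observes the writer or the writer observes the flag. A small C11 sketch of that pattern, simplified to plain atomics and a single flag rather than the per-cpu counter the kernel used:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int writers;
static atomic_int snapshot_pending;

static int start_write(void)
{
    if (atomic_load(&snapshot_pending))
        return 0;                       /* do not start a new write */

    atomic_fetch_add(&writers, 1);      /* publish our intent */
    /* Mirrors the smp_mb() in the removed code (redundant with seq_cst). */
    atomic_thread_fence(memory_order_seq_cst);
    if (atomic_load(&snapshot_pending)) {
        atomic_fetch_sub(&writers, 1);  /* lost the race, back off */
        return 0;
    }
    return 1;
}

static void end_write(void)
{
    atomic_fetch_sub(&writers, 1);
}

int main(void)
{
    if (start_write()) {
        puts("write admitted");
        end_write();
    }
    return 0;
}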
- */ - smp_mb(); - if (atomic_read(&root->will_be_snapshotted)) { - btrfs_end_write_no_snapshotting(root); - return 0; - } - return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshotting(root); - if (ret) - break; - wait_var_event(&root->will_be_snapshotted, - !atomic_read(&root->will_be_snapshotted)); - } -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c0f202741e09..39e45b8a5031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state) } #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(buffers); static LIST_HEAD(states); - static DEFINE_SPINLOCK(leak_lock); -static inline -void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +static inline void btrfs_leak_debug_add(spinlock_t *lock, + struct list_head *new, + struct list_head *head) { unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_add(new, head); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline -void btrfs_leak_debug_del(struct list_head *entry) +static inline void btrfs_leak_debug_del(spinlock_t *lock, + struct list_head *entry) { unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_del(entry); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline void btrfs_extent_buffer_leak_debug_check(void) +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) { struct extent_buffer *eb; + unsigned long flags; - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); + /* + * If we didn't get into open_ctree our allocated_ebs will not be + * initialized, so just skip this. 
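The leak-debugging rework above moves the extent buffer leak list from a single global into fs_info->allocated_ebs guarded by fs_info->eb_leak_lock, so every mounted filesystem reports only its own leftover buffers. A compact userspace sketch of such a tracked-allocation list, with hypothetical names and a mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

struct tracked {
    struct tracked *next, *prev;
    const char *what;
};

/* Sentinel head node plus a lock, one pair per "filesystem" instance. */
static struct tracked head = { &head, &head, "head" };
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void leak_add(struct tracked *t)
{
    pthread_mutex_lock(&lock);
    t->next = head.next;
    t->prev = &head;
    head.next->prev = t;
    head.next = t;
    pthread_mutex_unlock(&lock);
}

static void leak_del(struct tracked *t)
{
    pthread_mutex_lock(&lock);
    t->prev->next = t->next;
    t->next->prev = t->prev;
    pthread_mutex_unlock(&lock);
}

/* At teardown, anything still on the list was never freed. */
static void leak_check(void)
{
    pthread_mutex_lock(&lock);
    for (struct tracked *t = head.next; t != &head; t = t->next)
        fprintf(stderr, "leaked object: %s\n", t->what);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    struct tracked a = { .what = "extent buffer A" };
    struct tracked b = { .what = "extent buffer B" };

    leak_add(&a);
    leak_add(&b);
    leak_del(&a);
    leak_check();   /* reports "extent buffer B" */
    return 0;
}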
+ */ + if (!fs_info->allocated_ebs.next) + return; + + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + while (!list_empty(&fs_info->allocated_ebs)) { + eb = list_first_entry(&fs_info->allocated_ebs, + struct extent_buffer, leak_list); + pr_err( + "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_header_owner(eb)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } static inline void btrfs_extent_state_leak_debug_check(void) @@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, } } #else -#define btrfs_leak_debug_add(new, head) do {} while (0) -#define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_extent_buffer_leak_debug_check() do {} while (0) +#define btrfs_leak_debug_add(lock, new, head) do {} while (0) +#define btrfs_leak_debug_del(lock, entry) do {} while (0) #define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif @@ -122,7 +133,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; - struct extent_io_tree *tree; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void) void __cold extent_io_exit(void) { - btrfs_extent_buffer_leak_debug_check(); - /* * Make sure all delayed rcu free are flushed before we * destroy caches. @@ -257,6 +265,15 @@ void __cold extent_io_exit(void) bioset_exit(&btrfs_bioset); } +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner, void *private_data) @@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, spin_lock_init(&tree->lock); tree->private_data = private_data; tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); } void extent_io_tree_release(struct extent_io_tree *tree) @@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&state->leak_list, &states); + btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&state->leak_list); + btrfs_leak_debug_del(&leak_lock, &state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -1053,6 +1072,16 @@ hit_next: goto out; } + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. 
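The early-skip added to set_extent_bit() above avoids splitting or touching a state that already has every requested bit. Reduced to a flat array of ranges instead of the rb-tree the io tree actually uses, the check amounts to:

#include <stdio.h>

struct range_state {
    unsigned long long start, end;  /* inclusive range */
    unsigned int bits;
};

static void set_bits(struct range_state *states, int nr, unsigned int bits)
{
    for (int i = 0; i < nr; i++) {
        if ((states[i].bits & bits) == bits)
            continue;               /* nothing to split or modify */
        states[i].bits |= bits;     /* the real code may split first */
    }
}

int main(void)
{
    struct range_state states[] = {
        { 0, 4095, 0x1 },
        { 4096, 8191, 0x3 },        /* already has both bits */
    };

    set_bits(states, 2, 0x3);
    for (int i = 0; i < 2; i++)
        printf("[%llu, %llu] bits 0x%x\n",
               states[i].start, states[i].end, states[i].bits);
    return 0;
}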
+ */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); @@ -1568,6 +1597,43 @@ out: } /** + * find_contiguous_extent_bit: find a contiguous area of bits + * @tree - io tree to check + * @start - offset to start the search from + * @start_ret - the first offset we found with the bits set + * @end_ret - the final contiguous range of the bits that were set + * @bits - bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/** * find_first_clear_extent_bit - find the first range that has @bits not set. * This range could start before @start. * @@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) /* * @opf: bio REQ_OP_* and REQ_* flags as one value - * @tree: tree so we can call our merge_bio hook * @wbc: optional writeback control for io accounting * @page: page to add to the bio * @pg_offset: offset of the new bio or to check whether we are adding @@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them */ -static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, +static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, @@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct bio *bio; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); @@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct extent_io_tree *tree, - struct page *page, +static int __do_readpage(struct page *page, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, @@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree, size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; set_page_extent_mapped(page); @@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree, 
continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, + ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, end_bio_extent_readpage, mirror_num, @@ -3269,8 +3335,7 @@ out: return ret; } -static inline void contiguous_readpages(struct extent_io_tree *tree, - struct page *pages[], int nr_pages, +static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, struct bio **bio, @@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + __do_readpage(pages[index], btrfs_get_extent, em_cached, bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, +static int __extent_read_full_page(struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, unsigned long *bio_flags, @@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret; - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); return ret; } -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, mirror_num, bio_flags); @@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, unsigned long nr_written, int *nr_ret) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, offset, iosize, pg_offset, &epd->bio, end_bio_extent_writepage, @@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; int i, num_pages; @@ -3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, @@ -3897,14 +3959,13 @@ static 
noinline_for_stack int write_one_eb(struct extent_buffer *eb, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -4018,7 +4079,39 @@ retry: end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + /* + * If something went wrong, don't allow any metadata write bio to be + * submitted. + * + * This would prevent use-after-free if we had dirty pages not + * cleaned up, which can still happen by fuzzed images. + * + * - Bad extent tree + * Allowing existing tree block to be allocated for other trees. + * + * - Log tree operations + * Exiting tree blocks get allocated to log tree, bumps its + * generation, then get cleaned in tree re-balance. + * Such tree block will not be written back, since it's clean, + * thus no WRITTEN flag set. + * And after log writes back, this tree block is not traced by + * any dirty extent_io_tree. + * + * - Offending tree block gets re-dirtied from its original owner + * Since it has bumped generation, no WRITTEN flag, it can be + * reused without COWing. This tree block will not be traced + * by btrfs_transaction::dirty_pages. + * + * Now such dirty tree block will not be cleaned by any dirty + * extent io tree. Thus we don't want to submit such wild eb + * if the fs already has error. + */ + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = flush_write_bio(&epd); + } else { + ret = -EUCLEAN; + end_write_bio(&epd, ret); + } return ret; } @@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) int ret; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, { int ret = 0; struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping, int ret = 0; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; @@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - contiguous_readpages(tree, pagepool, nr, contig_start, + contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); } @@ -4796,7 +4884,6 @@ out_free_ulist: static void __free_extent_buffer(struct extent_buffer *eb) { - btrfs_leak_debug_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, 
eb); } @@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); __free_extent_buffer(eb); } @@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); - btrfs_leak_debug_add(&eb->leak_list, &buffers); + btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, + &fs_info->allocated_ebs); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } static int release_extent_buffer(struct extent_buffer *eb) + __releases(&eb->refs_lock) { lockdep_assert_held(&eb->refs_lock); @@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; - struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(tree, page, + err = __extent_read_full_page(page, btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d205bbaafdc..2ed65bd0760e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); + +#ifdef CONFIG_BTRFS_DEBUG +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); +#else +#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) +#endif + #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c2f365662d55..b618ad5339ba 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,6 +23,97 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) +/** + * @inode - the inode we want to update the disk_i_size for + * @new_i_size - the i_size we want to set to, 0 if we use i_size + * + * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() + * returns as it is perfectly fine with a file that has holes without hole file + * extent items. 
+ * + * However without NO_HOLES we need to only return the area that is contiguous + * from the 0 offset of the file. Otherwise we could end up adjust i_size up + * to an extent that has a gap in between. + * + * Finally new_i_size should only be set in the case of truncate where we're not + * ready to use i_size_read() as the limiter yet. + */ +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u64 start, end, i_size; + int ret; + + i_size = new_i_size ?: i_size_read(inode); + if (btrfs_fs_incompat(fs_info, NO_HOLES)) { + BTRFS_I(inode)->disk_i_size = i_size; + return; + } + + spin_lock(&BTRFS_I(inode)->lock); + ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0, + &start, &end, EXTENT_DIRTY); + if (!ret && start == 0) + i_size = min(i_size, end + 1); + else + i_size = 0; + BTRFS_I(inode)->disk_i_size = i_size; + spin_unlock(&BTRFS_I(inode)->lock); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Call when we are inserting a new file extent where there was none before. + * Does not need to call this in the case where we're replacing an existing file + * extent, however if not sure it's fine to call this multiple times. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. + */ +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Called when we drop a file extent, for example when we truncate. Doesn't + * need to be called for cases where we're replacing a file extent, like when + * we've COWed a file extent. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. 
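(Editor's aside, not part of the patch: the rule described in the comment above can be seen in miniature below. Ranges covered by file extent items are tracked per inode, and disk_i_size is only allowed to advance to the end of the contiguous covered prefix starting at offset 0, clamped to i_size. This is a self-contained userspace model; the helper name and the fixed array are inventions for the sketch, not btrfs code.)

/*
 * Userspace model of the safe disk_i_size rule: ranges covered by file
 * extent items are [start, end) pairs sorted by start; disk_i_size may
 * only advance to the end of the contiguous cover that begins at 0.
 */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };          /* [start, end), sorted */

static uint64_t safe_disk_i_size(const struct range *r, int nr, uint64_t i_size)
{
        uint64_t covered = 0;                   /* contiguous cover from offset 0 */

        for (int i = 0; i < nr; i++) {
                if (r[i].start > covered)       /* hole with no extent item: stop */
                        break;
                if (r[i].end > covered)
                        covered = r[i].end;
        }
        return covered < i_size ? covered : i_size;
}

int main(void)
{
        /* extent items at [0,64K) and [128K,192K); the gap is an untracked hole */
        struct range items[] = { { 0, 65536 }, { 131072, 196608 } };

        /* prints 65536: only 64K is safe even though i_size is 200000 */
        printf("%llu\n", (unsigned long long)safe_disk_i_size(items, 2, 200000));
        return 0;
}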
+ */ +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || + len == (u64)-1); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return clear_extent_bit(&inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, 0, 0, NULL); +} + static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, u16 csum_size) { @@ -949,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, fi); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - size = btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - } - + extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -1007,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, root->root_key.objectid); } } + +/* + * Returns the end offset (non inclusive) of the file extent item the given path + * points to. If it points to an inline extent, the returned offset is rounded + * up to the sector size. + */ +u64 btrfs_file_extent_end(const struct btrfs_path *path) +{ + const struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + end = btrfs_file_extent_ram_bytes(leaf, fi); + end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); + } else { + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + } + + return end; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a16da274c9aa..8a144f9cb7ac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,6 +27,7 @@ #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" +#include "reflink.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; - int index; int ret; /* get the inode */ @@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + inode_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); goto cleanup; @@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root); + btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; } - srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; cleanup: - 
srcu_read_unlock(&fs_info->subvol_srcu, index); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; } @@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_write_no_snapshotting(root); - if (!ret) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); num_bytes = lockend - lockstart + 1; @@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, data_reserved, pos, write_bytes); else - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); break; } @@ -1778,7 +1774,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1807,7 +1803,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { @@ -2071,6 +2067,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx, inode); /* + * Set the range to full if the NO_HOLES feature is not enabled. + * This is to avoid missing file extent items representing holes after + * replaying the log. + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { + start = 0; + end = LLONG_MAX; + } + + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. See @@ -2092,19 +2098,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. Do this while holding the inode lock, to avoid races - * with other tasks. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - - /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. 
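(Editor's aside, not part of the patch: the check_can_nocow() and buffered-write hunks above swap the will_be_snapshotted counter and subv_writers waitqueue for a single "drew" lock on the root. The toy below models that exclusion in userspace: any number of NOCOW writers, or any number of snapshot creators, may hold their side at once, but never both; only the try variant used by check_can_nocow() is shown. All names and types are invented, not the kernel's btrfs_drew_lock API.)

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct drew {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int readers;                    /* snapshot creations in progress */
        int writers;                    /* NOCOW writes in progress */
};

static bool drew_try_write_lock(struct drew *d)
{
        bool ok;

        pthread_mutex_lock(&d->lock);
        ok = (d->readers == 0);         /* caller returns -EAGAIN when this fails */
        if (ok)
                d->writers++;
        pthread_mutex_unlock(&d->lock);
        return ok;
}

static void drew_write_unlock(struct drew *d)
{
        pthread_mutex_lock(&d->lock);
        if (--d->writers == 0)
                pthread_cond_broadcast(&d->cond);
        pthread_mutex_unlock(&d->lock);
}

static void drew_read_lock(struct drew *d)     /* blocks until no writers remain */
{
        pthread_mutex_lock(&d->lock);
        while (d->writers > 0)
                pthread_cond_wait(&d->cond, &d->lock);
        d->readers++;
        pthread_mutex_unlock(&d->lock);
}

static void drew_read_unlock(struct drew *d)
{
        pthread_mutex_lock(&d->lock);
        d->readers--;
        pthread_mutex_unlock(&d->lock);
}

int main(void)
{
        struct drew d = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };

        if (drew_try_write_lock(&d))            /* NOCOW write proceeds, then drops */
                drew_write_unlock(&d);

        drew_read_lock(&d);                     /* snapshot side now holds the lock */
        printf("nocow write allowed during snapshot: %s\n",
               drew_try_write_lock(&d) ? "yes" : "no");         /* prints "no" */
        drew_read_unlock(&d);
        return 0;
}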
We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + clone_info->file_offset, clone_len); + if (ret) + return ret; + /* If it's a hole, nothing more needs to be done. */ if (clone_info->disk_offset == 0) return 0; @@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } + } else if (!clone_info && cur_offset < drop_end) { + /* + * We are past the i_size here, but since we didn't + * insert holes we need to clear the mapped area so we + * know to not set disk_i_size in this area until a new + * file extent is inserted here. + */ + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + cur_offset, drop_end - cur_offset); + if (ret) { + /* + * We couldn't clear our area, so we could + * presumably adjust up and corrupt the fs, so + * we need to abort. + */ + btrfs_abort_transaction(trans, ret); + break; + } } if (clone_info && drop_end > clone_info->file_offset) { @@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } + } else if (!clone_info && cur_offset < drop_end) { + /* See the comment in the loop above for the reasoning here. */ + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + cur_offset, drop_end - cur_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } if (clone_info) { ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, @@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); - btrfs_ordered_update_i_size(inode, end, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode(trans, root, inode); ret2 = btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0598fd3c6e3f..3613da065a73 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, - int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct page *page; + struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, readahead_cache(inode); - ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; @@ -1067,6 +1067,7 @@ fail: } static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) @@ -1085,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries( * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = block_group->fs_info->pinned_extents; + unpin = 
&trans->transaction->pinned_extents; start = block_group->start; @@ -1190,7 +1191,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; if (block_group) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -1291,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Lock all pages first so we can lock the extent safely. */ - ret = io_ctl_prepare_pages(io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; @@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(block_group, io_ctl, &entries); + ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1366,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return 0; -out: - io_ctl->inode = NULL; - io_ctl_free(io_ctl); - if (ret) { - invalidate_inode_pages2(inode->i_mapping); - BTRFS_I(inode)->generation = 0; - } - btrfs_update_inode(trans, root, inode); - if (must_iput) - iput(inode); - return ret; - out_nospc_locked: cleanup_bitmap_list(&bitmap_list); spin_unlock(&ctl->tree_lock); @@ -1390,7 +1379,17 @@ out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); - goto out; +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); + return ret; } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, @@ -1416,7 +1415,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -4036,7 +4035,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), inode->i_size, true); -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", root->root_key.objectid); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 258cb3fae17a..8b1f5c8897b7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1251,9 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); - free_extent_buffer(free_space_root->node); - free_extent_buffer(free_space_root->commit_root); - kfree(free_space_root); + btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d5c9c69d8263..6009e0e939b5 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out_release: trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); + trans->bytes_reserved, NULL); out: trans->block_rsv = rsv; trans->bytes_reserved = num_bytes; diff --git a/fs/btrfs/inode.c 
b/fs/btrfs/inode.c index d267eb5caa7b..320d1062068d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -28,6 +28,7 @@ #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> #include "misc.h" @@ -242,6 +243,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* + * We align size to sectorsize for inline extents just for simplicity + * sake. + */ + size = ALIGN(size, root->fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + if (ret) + goto fail; + + /* * we're an inline extent, so nobody can * extend the file past i_size without locking * a page we already have locked. @@ -2446,6 +2456,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos, + ram_bytes); + if (ret) + goto out; + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head @@ -2536,7 +2551,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) */ btrfs_qgroup_free_data(inode, NULL, start, ordered_extent->num_bytes); - btrfs_ordered_update_i_size(inode, 0, ordered_extent); + btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -2607,7 +2622,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - btrfs_ordered_update_i_size(inode, 0, ordered_extent); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -3187,6 +3202,8 @@ static int btrfs_read_locked_inode(struct inode *inode, i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -4158,6 +4175,8 @@ search_again: } while (1) { + u64 clear_start = 0, clear_len = 0; + fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -4208,6 +4227,8 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; + + clear_start = found_key.offset; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); if (!del_item) { u64 orig_num_bytes = @@ -4215,6 +4236,7 @@ search_again: extent_num_bytes = ALIGN(new_size - found_key.offset, fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -4240,6 +4262,7 @@ search_again: inode_sub_bytes(inode, num_dec); } } + clear_len = num_dec; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * we can't truncate inline items that have had @@ -4261,12 +4284,33 @@ search_again: */ ret = NEED_TRUNCATE_BLOCK; break; + } else { + /* + * Inline extents are special, we just treat + * them as a full sector worth in the file + * extent tree just for simplicity sake. 
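(Editor's aside, not part of the patch: as the comment above says, inline extents are tracked as one full sector in the file extent tree. The tiny program below shows the sector rounding that insert_inline_extent() applies before marking the range; ALIGN mirrors the kernel macro and the numbers are made up.)

#include <stdint.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t inline_len = 300;      /* bytes stored inline in the leaf */

        /* the range marked in the file extent tree is sector-granular: [0, 4096) */
        printf("tracked range: [0, %llu)\n",
               (unsigned long long)ALIGN(inline_len, sectorsize));
        return 0;
}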
+ */ + clear_len = fs_info->sectorsize; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); } delete: + /* + * We use btrfs_truncate_inode_items() to clean up log trees for + * multiple fsyncs, and in this case we don't want to clear the + * file extent range because it's just the log. + */ + if (root == BTRFS_I(inode)->root) { + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + if (del_item) last_size = found_key.offset; else @@ -4368,7 +4412,7 @@ out: ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, last_size); unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, &cached_state); } @@ -4576,7 +4620,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { @@ -4589,14 +4633,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); + hole_size = last_byte - cur_offset; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - hole_size = last_byte - cur_offset; err = maybe_insert_hole(root, inode, cur_offset, hole_size); if (err) break; + + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4628,6 +4679,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); + } else { + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; } next: free_extent_map(em); @@ -4671,24 +4727,24 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. 
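(Editor's aside, not part of the patch: the btrfs_cont_expand() hunks above record every hole created by growing i_size in the same tracked-range set, so a later safe disk_i_size computation can advance across those holes. Below is a userspace model of that bookkeeping; all names are invented, and the real code walks extent maps rather than fixed sectors.)

#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_to(uint64_t x, uint64_t a)
{
        return (x + a - 1) / a * a;
}

/* stand-in for maybe_insert_hole() + btrfs_inode_set_file_extent_range() */
static void insert_and_mark_hole(uint64_t start, uint64_t len)
{
        printf("hole item + tracked range [%llu, %llu)\n",
               (unsigned long long)start, (unsigned long long)(start + len));
}

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t oldsize = 1000, newsize = 9000;
        uint64_t cur = round_up_to(oldsize, sectorsize);
        uint64_t end = round_up_to(newsize, sectorsize);

        /* grows the file from 1000 to 9000 bytes; two holes get recorded */
        for (; cur < end; cur += sectorsize)
                insert_and_mark_hole(cur, sectorsize);
        return 0;
}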
*/ - btrfs_wait_for_snapshot_creation(root); + btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5098,7 +5154,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, btrfs_release_path(path); - new_root = btrfs_read_fs_root_no_name(fs_info, location); + new_root = btrfs_get_fs_root(fs_info, location, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; @@ -5179,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->location->objectid; memcpy(&BTRFS_I(inode)->location, args->location, sizeof(*args->location)); - BTRFS_I(inode)->root = args->root; + BTRFS_I(inode)->root = btrfs_grab_root(args->root); + BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; } @@ -5260,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -5307,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; u8 di_type = 0; - int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) @@ -5334,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); if (ret < 0) { @@ -5345,7 +5400,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } else { inode = btrfs_iget(dir->i_sb, &location, sub_root); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root != sub_root) + btrfs_put_root(sub_root); if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); @@ -5826,7 +5882,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6463,6 +6519,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; + extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ @@ -6473,18 +6530,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, btrfs_ino(inode)); goto out; } - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, item); - trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 
extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); @@ -8211,9 +8259,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent, 0); + return extent_read_full_page(page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8272,6 +8318,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags); } +#ifdef CONFIG_MIGRATION +static int btrfs_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (page_has_private(page)) { + ClearPagePrivate(page); + get_page(newpage); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + SetPagePrivate(newpage); + } + + if (PagePrivate2(page)) { + ClearPagePrivate2(page); + SetPagePrivate2(newpage); + } + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { @@ -8647,7 +8726,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1); + btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ @@ -8672,7 +8751,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { @@ -8776,6 +8855,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); extent_io_tree_init(fs_info, &ei->io_failure_tree, IO_TREE_INODE_IO_FAILURE, inode); + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT, inode); ei->io_tree.track_uptodate = true; ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); @@ -8842,6 +8923,8 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); + btrfs_put_root(BTRFS_I(inode)->root); } int btrfs_drop_inode(struct inode *inode) @@ -9669,14 +9752,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, nr, false); - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret < 0) goto out; @@ 
-9938,7 +10021,7 @@ next: else i_size = cur_offset; i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } ret = btrfs_update_inode(trans, root, inode); @@ -10474,6 +10557,9 @@ static const struct address_space_operations btrfs_aops = { .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = btrfs_migratepage, +#endif .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4f4b13830b25..40b729dce91c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" +#include "export.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" @@ -86,10 +87,6 @@ struct btrfs_ioctl_send_args_32 { struct btrfs_ioctl_send_args_32) #endif -static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff, - int no_time_update); - /* Mask out flags that are inappropriate for the given type of inode. */ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, unsigned int flags) @@ -554,7 +551,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) static noinline int create_subvol(struct inode *dir, struct dentry *dentry, const char *name, int namelen, - u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -573,7 +569,6 @@ static noinline int create_subvol(struct inode *dir, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -643,8 +638,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_generation_v2(root_item, btrfs_root_generation(root_item)); - uuid_le_gen(&new_uuid); - memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root_item->uuid); btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); root_item->ctime = root_item->otime; @@ -666,7 +660,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(fs_info, &key); + new_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -676,6 +670,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, ret); @@ -727,14 +722,7 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(fs_info, &block_rsv); - if (async_transid) { - *async_transid = trans->transid; - err = btrfs_commit_transaction_async(trans, 1); - if (err) - err = btrfs_commit_transaction(trans); - } else { - err = btrfs_commit_transaction(trans); - } + err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; @@ -752,8 +740,7 @@ fail_free: } static int create_snapshot(struct btrfs_root *root, struct inode *dir, - struct dentry *dentry, - u64 *async_transid, bool readonly, + struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct 
btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -789,11 +776,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * possible. This is to avoid later writeback (running dealloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ - atomic_inc(&root->will_be_snapshotted); - smp_mb__after_atomic(); - /* wait for no snapshot writes */ - wait_event(root->subv_writers->wait, - percpu_counter_sum(&root->subv_writers->counter) == 0); + btrfs_drew_read_lock(&root->snapshot_lock); ret = btrfs_start_delalloc_snapshot(root); if (ret) @@ -841,14 +824,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); spin_unlock(&fs_info->trans_lock); - if (async_transid) { - *async_transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 1); - if (ret) - ret = btrfs_commit_transaction(trans); - } else { - ret = btrfs_commit_transaction(trans); - } + + ret = btrfs_commit_transaction(trans); if (ret) goto fail; @@ -869,12 +846,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (snapshot_force_cow) atomic_dec(&root->snapshot_force_cow); - if (atomic_dec_and_test(&root->will_be_snapshotted)) - wake_up_var(&root->will_be_snapshotted); + btrfs_drew_read_unlock(&root->snapshot_lock); + free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); @@ -953,7 +931,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(const struct path *parent, const char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); @@ -989,13 +967,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; - if (snap_src) { - error = create_snapshot(snap_src, dir, dentry, - async_transid, readonly, inherit); - } else { - error = create_subvol(dir, dentry, name, namelen, - async_transid, inherit); - } + if (snap_src) + error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + else + error = create_subvol(dir, dentry, name, namelen, inherit); + if (!error) fsnotify_mkdir(dir, dentry); out_up_read: @@ -1711,9 +1687,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); - btrfs_info_in_rcu(fs_info, "new size for %s is %llu", - rcu_str_deref(device->name), new_size); - if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -1726,6 +1699,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = btrfs_shrink_device(device, new_size); } /* equal, nothing need to do */ + if (ret == 0 && new_size != old_size) + btrfs_info_in_rcu(fs_info, + "resize device %s (devid %llu) from %llu to %llu", + rcu_str_deref(device->name), device->devid, + old_size, new_size); out_free: kfree(vol_args); out: @@ -1734,9 +1712,9 @@ out: return ret; } -static noinline int btrfs_ioctl_snap_create_transid(struct file *file, +static noinline int __btrfs_ioctl_snap_create(struct file *file, const char *name, unsigned long fd, int subvol, - u64 *transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit 
*inherit) { int namelen; @@ -1763,7 +1741,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly, inherit); + NULL, readonly, inherit); } else { struct fd src = fdget(fd); struct inode *src_inode; @@ -1786,7 +1764,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid, readonly, inherit); + readonly, inherit); } fdput(src); } @@ -1810,9 +1788,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - NULL, false, NULL); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, false, NULL); kfree(vol_args); return ret; @@ -1823,8 +1800,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; - u64 transid = 0; - u64 *ptr = NULL; bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; @@ -1836,22 +1811,11 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | - BTRFS_SUBVOL_QGROUP_INHERIT)) { + if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - btrfs_warn(fs_info, -"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); - - ptr = &transid; - } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -1866,18 +1830,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, ptr, - readonly, inherit); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, readonly, inherit); if (ret) goto free_inherit; - - if (ptr && copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), - ptr, sizeof(*ptr))) - ret = -EFAULT; - free_inherit: kfree(inherit); free_args: @@ -1936,11 +1892,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out_drop_write; - } - if (flags & ~BTRFS_SUBVOL_RDONLY) { ret = -EOPNOTSUPP; goto out_drop_write; @@ -2174,12 +2125,12 @@ static noinline int search_ioctl(struct inode *inode, if (sk->tree_id == 0) { /* search the root of the inode that was passed */ - root = BTRFS_I(inode)->root; + root = btrfs_grab_root(BTRFS_I(inode)->root); } else { key.objectid = sk->tree_id; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); + root = btrfs_get_fs_root(info, &key, true); if (IS_ERR(root)) { btrfs_free_path(path); return PTR_ERR(root); @@ -2208,6 +2159,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; err: sk->nr_items = num_found; + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2314,9 +2266,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.objectid = tree_id; key.type = 
BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); + root = btrfs_get_fs_root(info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); + root = NULL; goto out; } @@ -2367,6 +2320,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, name[total_len] = '\0'; ret = 0; out: + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2383,7 +2337,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, unsigned long item_len; struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; - struct btrfs_root *root; + struct btrfs_root *root = NULL; struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; @@ -2408,7 +2362,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, key.objectid = treeid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &key); + root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -2420,15 +2374,15 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_REF_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } } @@ -2442,7 +2396,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, total_len += len + 1; if (ptr < args->path) { ret = -ENAMETOOLONG; - goto out; + goto out_put; } *(ptr + len) = '/'; @@ -2453,10 +2407,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_ITEM_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } leaf = path->nodes[0]; @@ -2464,26 +2418,26 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, btrfs_item_key_to_cpu(leaf, &key2, slot); if (key2.objectid != dirid) { ret = -ENOENT; - goto out; + goto out_put; } temp_inode = btrfs_iget(sb, &key2, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); - goto out; + goto out_put; } ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; - goto out; + goto out_put; } if (key.offset == upper_limit.objectid) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; - goto out; + goto out_put; } btrfs_release_path(path); @@ -2494,15 +2448,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; + btrfs_put_root(root); + root = NULL; btrfs_release_path(path); } /* Get the bottom subvolume's name from ROOT_REF */ - root = fs_info->tree_root; key.objectid = treeid; key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2529,6 +2484,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, read_extent_buffer(leaf, args->name, item_off, item_len); args->name[item_len] = 0; +out_put: + btrfs_put_root(root); out: btrfs_free_path(path); return ret; @@ -2653,10 +2610,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) key.objectid = BTRFS_I(inode)->root->root_key.objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - 
root = btrfs_read_fs_root_no_name(fs_info, &key); + root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); - goto out; + goto out_free; } root_item = &root->root_item; @@ -2689,16 +2646,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) if (key.objectid != BTRFS_FS_TREE_OBJECTID) { /* Search root tree for ROOT_BACKREF of this subvolume */ - root = fs_info->tree_root; - key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2733,6 +2688,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) ret = -EFAULT; out: + btrfs_put_root(root); +out_free: btrfs_free_path(path); kzfree(subvol_info); return ret; @@ -2836,7 +2793,8 @@ out: } static noinline int btrfs_ioctl_snap_destroy(struct file *file, - void __user *arg) + void __user *arg, + bool destroy_v2) { struct dentry *parent = file->f_path.dentry; struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); @@ -2845,34 +2803,120 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; - struct btrfs_ioctl_vol_args *vol_args; - int namelen; + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + char *subvol_name, *subvol_name_ptr = NULL; + int subvol_namelen; int err = 0; + bool destroy_parent = false; - if (!S_ISDIR(dir->i_mode)) - return -ENOTDIR; + if (destroy_v2) { + vol_args2 = memdup_user(arg, sizeof(*vol_args2)); + if (IS_ERR(vol_args2)) + return PTR_ERR(vol_args2); - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { + err = -EOPNOTSUPP; + goto out; + } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/') || - strncmp(vol_args->name, "..", namelen) == 0) { - err = -EINVAL; - goto out; + /* + * If SPEC_BY_ID is not set, we are looking for the subvolume by + * name, same as v1 currently does. + */ + if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { + vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + subvol_name = vol_args2->name; + + err = mnt_want_write_file(file); + if (err) + goto out; + } else { + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write_file(file); + if (err) + goto out; + + dentry = btrfs_get_dentry(fs_info->sb, + BTRFS_FIRST_FREE_OBJECTID, + vol_args2->subvolid, 0, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_drop_write; + } + + /* + * Change the default parent since the subvolume being + * deleted can be outside of the current mount point. + */ + parent = btrfs_get_parent(dentry); + + /* + * At this point dentry->d_name can point to '/' if the + * subvolume we want to destroy is outsite of the + * current mount point, so we need to release the + * current dentry and execute the lookup to return a new + * one with ->d_name pointing to the + * <mount point>/subvol_name. 
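(Editor's aside, not part of the patch: a sketch of how userspace might drive the new delete-by-id path described above. It assumes the uapi header exposes the v2 ioctl as BTRFS_IOC_SNAP_DESTROY_V2 along with BTRFS_SUBVOL_SPEC_BY_ID and the subvolid member of struct btrfs_ioctl_vol_args_v2; those definitions are not shown in this diff, so treat the names as assumptions.)

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

static int destroy_subvol_by_id(const char *any_dir_on_fs, __u64 subvolid)
{
        struct btrfs_ioctl_vol_args_v2 args;
        int fd, ret;

        fd = open(any_dir_on_fs, O_RDONLY | O_DIRECTORY);
        if (fd < 0)
                return -1;

        memset(&args, 0, sizeof(args));
        args.flags = BTRFS_SUBVOL_SPEC_BY_ID;   /* identify by id, not by name */
        args.subvolid = subvolid;               /* may live outside this directory */

        ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY_V2, &args);
        close(fd);
        return ret;
}

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <dir-on-btrfs> <subvolid>\n", argv[0]);
                return 1;
        }
        return destroy_subvol_by_id(argv[1], strtoull(argv[2], NULL, 0)) ? 1 : 0;
}

The kernel side then resolves its own dentry and parent, since the subvolume named by id can live outside the directory the caller opened.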
+ */ + dput(dentry); + if (IS_ERR(parent)) { + err = PTR_ERR(parent); + goto out_drop_write; + } + dir = d_inode(parent); + + /* + * If v2 was used with SPEC_BY_ID, a new parent was + * allocated since the subvolume can be outside of the + * current mount point. Later on we need to release this + * new parent dentry. + */ + destroy_parent = true; + + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( + fs_info, vol_args2->subvolid); + if (IS_ERR(subvol_name_ptr)) { + err = PTR_ERR(subvol_name_ptr); + goto free_parent; + } + /* subvol_name_ptr is already NULL termined */ + subvol_name = (char *)kbasename(subvol_name_ptr); + } + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + subvol_name = vol_args->name; + + err = mnt_want_write_file(file); + if (err) + goto out; } - err = mnt_want_write_file(file); - if (err) - goto out; + subvol_namelen = strlen(subvol_name); + + if (strchr(subvol_name, '/') || + strncmp(subvol_name, "..", subvol_namelen) == 0) { + err = -EINVAL; + goto free_subvol_name; + } + if (!S_ISDIR(dir->i_mode)) { + err = -ENOTDIR; + goto free_subvol_name; + } err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) - goto out_drop_write; - dentry = lookup_one_len(vol_args->name, parent, namelen); + goto free_subvol_name; + dentry = lookup_one_len(subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2941,9 +2985,15 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); +free_subvol_name: + kfree(subvol_name_ptr); +free_parent: + if (destroy_parent) + dput(parent); out_drop_write: mnt_drop_write_file(file); out: + kfree(vol_args2); kfree(vol_args); return err; } @@ -3069,8 +3119,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; } - /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { + if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -3220,733 +3269,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) -{ - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - int ret; - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. 
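(Editor's aside, not part of the patch: the btrfs_double_extent_lock() helper being moved out of this file always takes the two ranges lowest-first, by inode and then by offset, so concurrent clone and dedupe calls on the same pair of files cannot deadlock ABBA-style. A userspace model of that ordering rule, with invented names, follows.)

#include <pthread.h>
#include <stdio.h>

struct file_range {
        pthread_mutex_t lock;           /* stands in for the extent io tree lock */
        const char *name;
};

static void double_range_lock(struct file_range *a, struct file_range *b)
{
        if (b < a) {                    /* order by address, like inode1 < inode2 */
                struct file_range *tmp = a;
                a = b;
                b = tmp;
        }
        pthread_mutex_lock(&a->lock);
        if (a != b)                     /* same-file dedupe locks only once here */
                pthread_mutex_lock(&b->lock);
        printf("locked %s then %s\n", a->name, b->name);
}

int main(void)
{
        struct file_range src = { PTHREAD_MUTEX_INITIALIZER, "src" };
        struct file_range dst = { PTHREAD_MUTEX_INITIALIZER, "dst" };

        /* either argument order acquires the two locks in the same global order */
        double_range_lock(&src, &dst);
        pthread_mutex_unlock(&src.lock);
        pthread_mutex_unlock(&dst.lock);
        return 0;
}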
- */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - - return ret; -} - -#define BTRFS_MAX_DEDUPE_LEN SZ_16M - -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff) -{ - int ret; - u64 i, tail_len, chunk_count; - struct btrfs_root *root_dst = BTRFS_I(dst)->root; - - spin_lock(&root_dst->root_item_lock); - if (root_dst->send_in_progress) { - btrfs_warn_rl(root_dst->fs_info, -"cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, - root_dst->send_in_progress); - spin_unlock(&root_dst->root_item_lock); - return -EAGAIN; - } - root_dst->dedupe_in_progress++; - spin_unlock(&root_dst->root_item_lock); - - tail_len = olen % BTRFS_MAX_DEDUPE_LEN; - chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - - for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); - if (ret) - goto out; - - loff += BTRFS_MAX_DEDUPE_LEN; - dst_loff += BTRFS_MAX_DEDUPE_LEN; - } - - if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff); -out: - spin_lock(&root_dst->root_item_lock); - root_dst->dedupe_in_progress--; - spin_unlock(&root_dst->root_item_lock); - - return ret; -} - -static int clone_finish_inode_update(struct btrfs_trans_handle *trans, - struct inode *inode, - u64 endoff, - const u64 destoff, - const u64 olen, - int no_time_update) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); - /* - * We round up to the block size at eof when determining which - * extents to clone above, but shouldn't round up the file size. - */ - if (endoff > destoff + olen) - endoff = destoff + olen; - if (endoff > inode->i_size) - btrfs_i_size_write(BTRFS_I(inode), endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - ret = btrfs_end_transaction(trans); -out: - return ret; -} - -/* - * Make sure we do not end up inserting an inline extent into a file that has - * already other (non-inline) extents. If a file has an inline extent it can - * not have any other extents and the (single) inline extent must start at the - * file offset 0. Failing to respect these rules will lead to file corruption, - * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc - * - * We can have extents that have been already written to disk or we can have - * dirty ranges still in delalloc, in which case the extent maps and items are - * created only when we run delalloc, and the delalloc ranges might fall outside - * the range we are currently locking in the inode's io tree. So we check the - * inode's i_size because of that (i_size updates are done while holding the - * i_mutex, which we are holding here). - * We also check to see if the inode has a size not greater than "datal" but has - * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are - * protected against such concurrent fallocate calls by the i_mutex). - * - * If the file has no extents but a size greater than datal, do not allow the - * copy because we would need turn the inline extent into a non-inline one (even - * with NO_HOLES enabled). 
If we find our destination inode only has one inline - * extent, just overwrite it with the source inline extent if its size is less - * than the source extent's size, or we could copy the source inline extent's - * data into the destination inode's inline extent if the later is greater then - * the former. - */ -static int clone_copy_inline_extent(struct inode *dst, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_key *new_key, - const u64 drop_start, - const u64 datal, - const u64 skip, - const u64 size, - char *inline_data) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); - struct btrfs_root *root = BTRFS_I(dst)->root; - const u64 aligned_end = ALIGN(new_key->offset + datal, - fs_info->sectorsize); - int ret; - struct btrfs_key key; - - if (new_key->offset > 0) - return -EOPNOTSUPP; - - key.objectid = btrfs_ino(BTRFS_I(dst)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - else if (ret > 0) - goto copy_inline_extent; - } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) { - ASSERT(key.offset > 0); - return -EOPNOTSUPP; - } - } else if (i_size_read(dst) <= datal) { - struct btrfs_file_extent_item *ei; - u64 ext_len; - - /* - * If the file size is <= datal, make sure there are no other - * extents following (can happen do to an fallocate call with - * the flag FALLOC_FL_KEEP_SIZE). - */ - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - /* - * If it's an inline extent, it can not have other extents - * following it. - */ - if (btrfs_file_extent_type(path->nodes[0], ei) == - BTRFS_FILE_EXTENT_INLINE) - goto copy_inline_extent; - - ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - if (ext_len > aligned_end) - return -EOPNOTSUPP; - - ret = btrfs_next_item(root, path); - if (ret < 0) { - return ret; - } else if (ret == 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) - return -EOPNOTSUPP; - } - } - -copy_inline_extent: - /* - * We have no extent items, or we have an extent at offset 0 which may - * or may not be inlined. All these cases are dealt the same way. - */ - if (i_size_read(dst) > datal) { - /* - * If the destination inode has an inline extent... - * This would require copying the data from the source inline - * extent into the beginning of the destination's inline extent. - * But this is really complex, both extents can be compressed - * or just one of them, which would require decompressing and - * re-compressing data (which could increase the new compressed - * size, not allowing the compressed data to fit anymore in an - * inline extent). - * So just don't support this case for now (it should be rare, - * we are not really saving space when cloning inline extents). 
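(Editor's aside, not part of the patch: the long comment above boils down to an acceptance rule for cloning an inline extent, modelled below as a predicate: the destination offset must be 0, and the destination must not already extend past the cloned data, otherwise the operation is refused with -EOPNOTSUPP. Userspace sketch with invented names; the extra "extents beyond i_size" check is omitted for brevity.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool inline_clone_allowed(uint64_t dst_offset, uint64_t dst_i_size,
                                 uint64_t inline_data_len)
{
        if (dst_offset != 0)
                return false;           /* inline data must start the file */
        if (dst_i_size > inline_data_len)
                return false;           /* would need conversion to a regular extent */
        return true;
}

int main(void)
{
        printf("%d\n", inline_clone_allowed(0, 100, 300));      /* 1: fits */
        printf("%d\n", inline_clone_allowed(4096, 100, 300));   /* 0: not at offset 0 */
        printf("%d\n", inline_clone_allowed(0, 8192, 300));     /* 0: dst larger than data */
        return 0;
}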
- */ - return -EOPNOTSUPP; - } - - btrfs_release_path(path); - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); - if (ret) - return ret; - ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) - return ret; - - if (skip) { - const u32 start = btrfs_file_extent_calc_inline_size(0); - - memmove(inline_data + start, inline_data + start + skip, datal); - } - - write_extent_buffer(path->nodes[0], inline_data, - btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]), - size); - inode_add_bytes(dst, datal); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); - - return 0; -} - -/** - * btrfs_clone() - clone a range from inode file to another - * - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode - */ -static int btrfs_clone(struct inode *src, struct inode *inode, - const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path = NULL; - struct extent_buffer *leaf; - struct btrfs_trans_handle *trans; - char *buf = NULL; - struct btrfs_key key; - u32 nritems; - int slot; - int ret; - const u64 len = olen_aligned; - u64 last_dest_end = destoff; - - ret = -ENOMEM; - buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!buf) - return ret; - - path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); - return ret; - } - - path->reada = READA_FORWARD; - /* clone data */ - key.objectid = btrfs_ino(BTRFS_I(src)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = off; - - while (1) { - u64 next_key_min_offset = key.offset + 1; - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - /* - * note the key will change type as we walk through the - * tree. - */ - path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, - 0, 0); - if (ret < 0) - goto out; - /* - * First search, if no extent item that starts at offset off was - * found but the previous item is an extent item, it's possible - * it might overlap our target range, therefore process it. 
- */ - if (key.offset == off && ret > 0 && path->slots[0] > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0] - 1); - if (key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - - nritems = btrfs_header_nritems(path->nodes[0]); -process_slot: - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(BTRFS_I(src)->root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - nritems = btrfs_header_nritems(path->nodes[0]); - } - leaf = path->nodes[0]; - slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.type > BTRFS_EXTENT_DATA_KEY || - key.objectid != btrfs_ino(BTRFS_I(src))) - break; - - ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* Take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, extent); - } - - /* - * The first search might have left us at an extent item that - * ends before our target range's start, can happen if we have - * holes and NO_HOLES feature enabled. - */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; - } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), - size); - - btrfs_release_path(path); - path->leave_spinning = 0; - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; - - /* - * Deal with a hole that doesn't have an extent item that - * represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning range or at - * the beginning (fully overlaps it or partially overlaps it). 
- */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; - - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* Subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* Subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - clone_info.disk_offset = disko; - clone_info.disk_len = diskl; - clone_info.data_offset = datao; - clone_info.data_len = datal; - clone_info.file_offset = new_key.offset; - clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, - drop_start, - new_key.offset + datal - 1, - &clone_info, &trans); - if (ret) - goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - /* - * If our extent is inline, we know we will drop or - * adjust at most 1 extent item in the destination root. - * - * 1 - adjusting old extent (we may have to split it) - * 1 - add new extent - * 1 - inode update - */ - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = clone_copy_inline_extent(inode, trans, path, - &new_key, drop_start, - datal, skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - } - - btrfs_release_path(path); - - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen, no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - - btrfs_release_path(path); - key.offset = next_key_min_offset; - - if (fatal_signal_pending(current)) { - ret = -EINTR; - goto out; - } - } - ret = 0; - - if (last_dest_end < destoff + len) { - /* - * We have an implicit hole that fully or partially overlaps our - * cloning range at its end. This means that we either have the - * NO_HOLES feature enabled or the implicit hole happened due to - * mixing buffered and direct IO writes against this file. - */ - btrfs_release_path(path); - path->leave_spinning = 0; - - ret = btrfs_punch_hole_range(inode, path, - last_dest_end, destoff + len - 1, - NULL, &trans); - if (ret) - goto out; - - ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen, no_time_update); - } - -out: - btrfs_free_path(path); - kvfree(buf); - return ret; -} - -static noinline int btrfs_clone_files(struct file *file, struct file *file_src, - u64 off, u64 olen, u64 destoff) -{ - struct inode *inode = file_inode(file); - struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; - - /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. 
- * - * - split destination inode's inline extents. The inline extents can - * be either compressed or non-compressed. - */ - - /* - * VFS's generic_remap_file_range_prep() protects us from cloning the - * eof block into the middle of a file, which would result in corruption - * if the file size is not blocksize aligned. So we don't need to check - * for that case here. - */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; - - if (destoff > inode->i_size) { - const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - return ret; - /* - * We may have truncated the last block if the inode's size is - * not sector size aligned, so we need to wait for writeback to - * complete before proceeding further, otherwise we can race - * with cloning and attempt to increment a reference to an - * extent that no longer exists (writeback completed right after - * we found the previous extent covering eof and before we - * attempted to increment its reference count). - */ - ret = btrfs_wait_ordered_range(inode, wb_start, - destoff - wb_start); - if (ret) - return ret; - } - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. - */ - btrfs_double_extent_lock(src, off, inode, destoff, len); - ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); - /* - * Truncate page cache pages so that future reads will see the cloned - * data immediately and not the previous data. - */ - truncate_inode_pages_range(&inode->i_data, - round_down(destoff, PAGE_SIZE), - round_up(destoff + len, PAGE_SIZE) - 1); - - return ret; -} - -static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; - u64 wb_len; - int ret; - - if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; - - if (btrfs_root_readonly(root_out)) - return -EROFS; - - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; - } - - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { - return -EINVAL; - } - - /* - * Now that the inodes are locked, we need to start writeback ourselves - * and can not rely on the writeback from the VFS's generic helper - * generic_remap_file_range_prep() because: - * - * 1) For compression we must call filemap_fdatawrite_range() range - * twice (btrfs_fdatawrite_range() does it for us), and the generic - * helper only calls it once; - * - * 2) filemap_fdatawrite_range(), called by the generic helper only - * waits for the writeback to complete, i.e. for IO to be done, and - * not for the ordered extents to complete. We need to wait for them - * to complete so that new file extent items are in the fs tree. 
- */ - if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); - else - wb_len = ALIGN(*len, bs); - - /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* - * Workaround to make sure NOCOW buffered write reach disk as NOCOW. - * - * Btrfs' back references do not have a block level granularity, they - * work at the whole extent level. - * NOCOW buffered write without data space reserved may not be able - * to fall back to CoW due to lack of data space, thus could cause - * data loss. - * - * Here we take a shortcut by flushing the whole inode, so that all - * nocow write should reach disk as nocow before we increase the - * reference of the extent. We could do better by only flushing NOCOW - * data, but that needs extra accounting. - * - * Also we don't need to check ASYNC_EXTENT, as async extent will be - * CoWed anyway, not affecting nocow part. - */ - ret = filemap_flush(inode_in->i_mapping); - if (ret < 0) - return ret; - - ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), - wb_len); - if (ret < 0) - return ret; - ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), - wb_len); - if (ret < 0) - return ret; - - return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); -} - -loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, loff_t len, - unsigned int remap_flags) -{ - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); - bool same_inode = dst_inode == src_inode; - int ret; - - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) - return -EINVAL; - - if (same_inode) - inode_lock(src_inode); - else - lock_two_nondirectories(src_inode, dst_inode); - - ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, - &len, remap_flags); - if (ret < 0 || len == 0) - goto out_unlock; - - if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); - else - ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - -out_unlock: - if (same_inode) - inode_unlock(src_inode); - else - unlock_two_nondirectories(src_inode, dst_inode); - - return ret < 0 ? 
ret : len; -} - static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); @@ -3955,7 +3277,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key location; struct btrfs_disk_key disk_key; u64 objectid = 0; @@ -3981,49 +3303,51 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.type = BTRFS_ROOT_ITEM_KEY; location.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(fs_info, &location); + new_root = btrfs_get_fs_root(fs_info, &location, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; } if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; - goto out; + goto out_free; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out; + goto out_free; } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_free_path(path); ret = PTR_ERR(trans); - goto out; + goto out_free; } dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { - btrfs_free_path(path); + btrfs_release_path(path); btrfs_end_transaction(trans); btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; - goto out; + goto out_free; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); +out_free: + btrfs_put_root(new_root); + btrfs_free_path(path); out: mnt_drop_write_file(file); return ret; @@ -5465,7 +4789,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SUBVOL_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: - return btrfs_ioctl_snap_destroy(file, argp); + return btrfs_ioctl_snap_destroy(file, argp, false); + case BTRFS_IOC_SNAP_DESTROY_V2: + return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: return btrfs_ioctl_subvol_getflags(file, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 571c4826c428..fb647d8cf527 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -523,3 +523,138 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level) path->locks[i] = 0; } } + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with write lock held + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. 
+ * + * Return: root extent buffer with read lock held + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * DREW locks + * ========== + * + * DREW stands for double-reader-writer-exclusion lock. It's used in situation + * where you want to provide A-B exclusion but not AA or BB. + * + * Currently implementation gives more priority to reader. If a reader and a + * writer both race to acquire their respective sides of the lock the writer + * would yield its lock as soon as it detects a concurrent reader. Additionally + * if there are pending readers no new writers would be allowed to come in and + * acquire the lock. + */ + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +{ + int ret; + + ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); + if (ret) + return ret; + + atomic_set(&lock->readers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); + + return 0; +} + +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +{ + percpu_counter_destroy(&lock->writers); +} + +/* Return true if acquisition is successful, false otherwise */ +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) +{ + if (atomic_read(&lock->readers)) + return false; + + percpu_counter_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ + smp_mb(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; + } + + return true; +} + +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) +{ + while (true) { + if (btrfs_drew_try_write_lock(lock)) + return; + wait_event(lock->pending_writers, !atomic_read(&lock->readers)); + } +} + +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) +{ + percpu_counter_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); +} + +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) +{ + atomic_inc(&lock->readers); + + /* + * Ensure the pending reader count is perceieved BEFORE this reader + * goes to sleep in case of active writers. This guarantees new writers + * won't be allowed and that the current reader will be woken up when + * the last active writer finishes its jobs. 
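[Editor's note] The DREW (double reader-writer exclusion) lock added above provides A-B exclusion without A-A or B-B exclusion: any number of "readers" may run together, any number of "writers" may run together, but the two sides exclude each other. The following is a rough userspace analogue only, an assumption-laden sketch rather than the kernel implementation; it serializes all bookkeeping under one mutex and deliberately drops the percpu writer counter, the lock-free fast paths and the reader-priority wakeup ordering that the btrfs version relies on for scalability.

/*
 * Userspace analogue (assumption, not the kernel code) of a DREW lock:
 * A-side holders exclude B-side holders and vice versa, while holders of
 * the same side may run concurrently.
 */
#include <pthread.h>

struct drew_lock {
	pthread_mutex_t lock;
	pthread_cond_t  idle;      /* signalled when one side drains to zero */
	unsigned int    readers;   /* active A-side holders */
	unsigned int    writers;   /* active B-side holders */
};

static void drew_init(struct drew_lock *d)
{
	pthread_mutex_init(&d->lock, NULL);
	pthread_cond_init(&d->idle, NULL);
	d->readers = 0;
	d->writers = 0;
}

static void drew_read_lock(struct drew_lock *d)
{
	pthread_mutex_lock(&d->lock);
	while (d->writers)                 /* wait until no B-side holder */
		pthread_cond_wait(&d->idle, &d->lock);
	d->readers++;
	pthread_mutex_unlock(&d->lock);
}

static void drew_read_unlock(struct drew_lock *d)
{
	pthread_mutex_lock(&d->lock);
	if (--d->readers == 0)
		pthread_cond_broadcast(&d->idle);  /* wake pending writers */
	pthread_mutex_unlock(&d->lock);
}

static void drew_write_lock(struct drew_lock *d)
{
	pthread_mutex_lock(&d->lock);
	while (d->readers)                 /* wait until no A-side holder */
		pthread_cond_wait(&d->idle, &d->lock);
	d->writers++;
	pthread_mutex_unlock(&d->lock);
}

static void drew_write_unlock(struct drew_lock *d)
{
	pthread_mutex_lock(&d->lock);
	if (--d->writers == 0)
		pthread_cond_broadcast(&d->idle);  /* wake pending readers */
	pthread_mutex_unlock(&d->lock);
}

A thread taking only one side can freely overlap with other holders of that side; the kernel version additionally guarantees that a pending reader blocks new writers, a detail the sketch does not reproduce.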
+ */ + smp_mb__after_atomic(); + + wait_event(lock->pending_readers, + percpu_counter_sum(&lock->writers) == 0); +} + +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +{ + /* + * atomic_dec_and_test implies a full barrier, so woken up writers + * are guaranteed to see the decrement + */ + if (atomic_dec_and_test(&lock->readers)) + wake_up(&lock->pending_writers); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 21a285883e89..d715846c10b8 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,9 @@ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H +#include <linux/atomic.h> +#include <linux/wait.h> +#include <linux/percpu_counter.h> #include "extent_io.h" #define BTRFS_WRITE_LOCK 1 @@ -13,6 +16,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +struct btrfs_path; + void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); @@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) BUG(); } +struct btrfs_drew_lock { + atomic_t readers; + struct percpu_counter writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; +}; + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a65f189a5b94..e13b3d28c063 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->ordered_root, &fs_info->ordered_roots); @@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->ordered_root_lock); if (nr != U64_MAX) { @@ -786,134 +786,6 @@ out: } /* - * After an extent is done, call this to conditionally update the on disk - * i_size. i_size is updated to cover any fully written part of the file. - */ -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - u64 disk_i_size; - u64 new_i_size; - u64 i_size = i_size_read(inode); - struct rb_node *node; - struct rb_node *prev = NULL; - struct btrfs_ordered_extent *test; - int ret = 1; - u64 orig_offset = offset; - - spin_lock_irq(&tree->lock); - if (ordered) { - offset = entry_end(ordered); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) - offset = min(offset, - ordered->file_offset + - ordered->truncated_len); - } else { - offset = ALIGN(offset, btrfs_inode_sectorsize(inode)); - } - disk_i_size = BTRFS_I(inode)->disk_i_size; - - /* - * truncate file. - * If ordered is not NULL, then this is called from endio and - * disk_i_size will be updated by either truncate itself or any - * in-flight IOs which are inside the disk_i_size. 
- * - * Because btrfs_setsize() may set i_size with disk_i_size if truncate - * fails somehow, we need to make sure we have a precise disk_i_size by - * updating it as usual. - * - */ - if (!ordered && disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = orig_offset; - ret = 0; - goto out; - } - - /* - * if the disk i_size is already at the inode->i_size, or - * this ordered extent is inside the disk i_size, we're done - */ - if (disk_i_size == i_size) - goto out; - - /* - * We still need to update disk_i_size if outstanding_isize is greater - * than disk_i_size. - */ - if (offset <= disk_i_size && - (!ordered || ordered->outstanding_isize <= disk_i_size)) - goto out; - - /* - * walk backward from this ordered extent to disk_i_size. - * if we find an ordered extent then we can't update disk i_size - * yet - */ - if (ordered) { - node = rb_prev(&ordered->rb_node); - } else { - prev = tree_search(tree, offset); - /* - * we insert file extents without involving ordered struct, - * so there should be no ordered struct cover this offset - */ - if (prev) { - test = rb_entry(prev, struct btrfs_ordered_extent, - rb_node); - BUG_ON(offset_in_entry(test, offset)); - } - node = prev; - } - for (; node; node = rb_prev(node)) { - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - - /* We treat this entry as if it doesn't exist */ - if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) - continue; - - if (entry_end(test) <= disk_i_size) - break; - if (test->file_offset >= i_size) - break; - - /* - * We don't update disk_i_size now, so record this undealt - * i_size. Or we will not know the real i_size. - */ - if (test->outstanding_isize < offset) - test->outstanding_isize = offset; - if (ordered && - ordered->outstanding_isize > test->outstanding_isize) - test->outstanding_isize = ordered->outstanding_isize; - goto out; - } - new_i_size = min_t(u64, offset, i_size); - - /* - * Some ordered extents may completed before the current one, and - * we hold the real i_size in ->outstanding_isize. - */ - if (ordered && ordered->outstanding_isize > new_i_size) - new_i_size = min_t(u64, ordered->outstanding_isize, i_size); - BTRFS_I(inode)->disk_i_size = new_i_size; - ret = 0; -out: - /* - * We need to do this because we can't remove ordered extents until - * after the i_disk_size has been updated and then the inode has been - * updated to reflect the change, so we need to tell anybody who finds - * this ordered extent that we've already done all the real work, we - * just haven't completed all the other work. - */ - if (ordered) - set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); - spin_unlock_irq(&tree->lock); - return ret; -} - -/* * search the ordered extents for one corresponding to 'offset' and * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree @@ -963,7 +835,6 @@ out: * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. * - * @tree: IO tree used for locking out other users of the range * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock @@ -973,8 +844,7 @@ out: * This function always returns with the given range locked, ensuring after it's * called no order extent can be pending. 
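[Editor's note] btrfs_lock_and_flush_ordered_range(), whose signature is trimmed in the hunk below (the io tree is now derived from the inode argument), is built around a retry loop: lock the range, look for a pending ordered extent, and if one exists drop the lock, wait for it to finish, and start over. A generic userspace sketch of that loop follows; the flush_range structure and its fields are hypothetical stand-ins, with a plain mutex playing the role of the extent range lock.

#include <pthread.h>

/* Hypothetical model: one range lock plus a count of outstanding I/Os in
 * the range, completed (and signalled) by other threads. */
struct flush_range {
	pthread_mutex_t range_lock;   /* stands in for lock_extent_bits() */
	pthread_mutex_t pend_lock;
	pthread_cond_t  pend_done;
	unsigned int    pending;      /* outstanding "ordered extents" */
};

/*
 * Return with range_lock held and nothing pending inside the range,
 * mirroring the structure of btrfs_lock_and_flush_ordered_range().
 */
static void lock_and_flush_range(struct flush_range *r)
{
	for (;;) {
		unsigned int pending;

		pthread_mutex_lock(&r->range_lock);

		pthread_mutex_lock(&r->pend_lock);
		pending = r->pending;
		pthread_mutex_unlock(&r->pend_lock);

		if (!pending)
			return;            /* locked and nothing pending */

		/*
		 * We must not sleep with the range lock held, since the
		 * completion path may need it. Drop it, wait, retry.
		 */
		pthread_mutex_unlock(&r->range_lock);

		pthread_mutex_lock(&r->pend_lock);
		while (r->pending)
			pthread_cond_wait(&r->pend_done, &r->pend_lock);
		pthread_mutex_unlock(&r->pend_lock);
	}
}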
*/ -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state) { @@ -986,7 +856,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, cachedp = cached_state; while (1) { - lock_extent_bits(tree, start, end, cachedp); + lock_extent_bits(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -999,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, refcount_dec(&cache->refs); break; } - unlock_extent_cached(tree, start, end, cachedp); + unlock_extent_cached(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3beb4da4ab41..c01c9698250b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -52,11 +52,6 @@ enum { BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, - /* - * indicates whether this ordered extent has done its due diligence in - * updating the isize - */ - BTRFS_ORDERED_UPDATED_ISIZE, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ @@ -182,16 +177,13 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int __init ordered_data_init(void); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index deb59e7cfcac..ff1ff90e48b1 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (need_reserve) { btrfs_block_rsv_release(fs_info, trans->block_rsv, - num_bytes); + num_bytes, NULL); if (ret) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff1870ff3474..c3888fb367e7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1030,6 +1030,7 @@ out_add_root: ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } @@ -1037,11 +1038,8 @@ out_add_root: out_free_path: btrfs_free_path(path); out_free_root: - if (ret) { - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); - kfree(quota_root); - } + if (ret) + btrfs_put_root(quota_root); out: if (ret) { ulist_free(fs_info->qgroup_ulist); @@ -1104,9 +1102,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); - kfree(quota_root); 
+ btrfs_put_root(quota_root); end_trans: ret = btrfs_end_transaction(trans); @@ -3237,7 +3233,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, } mutex_lock(&fs_info->qgroup_rescan_lock); - spin_lock(&fs_info->qgroup_lock); if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { @@ -3252,7 +3247,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, } if (ret) { - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } @@ -3263,9 +3257,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); - fs_info->qgroup_rescan_running = true; - - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, @@ -3326,8 +3317,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); return 0; } @@ -3339,9 +3333,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); - spin_lock(&fs_info->qgroup_lock); running = fs_info->qgroup_rescan_running; - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); if (!running) @@ -3363,9 +3355,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); + } } /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a8e53c8e7b01..c870ef70f817 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash *h; int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; int i; - int table_size; if (info->stripe_hash_table) return 0; @@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount. 
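[Editor's note] Two cleanups in the hunks that follow are worth a note: raid56.c switches the stripe hash table allocation from the open-coded sizeof(*table) + sizeof(*h) * num_entries to struct_size(), and rcu-string.h converts the zero-length char str[0] into a C99 flexible array member. struct_size() is a kernel macro that computes the same header-plus-array size with overflow checking. The struct and helper below are an illustrative userspace sketch of the pattern, not the kernel code.

#include <stdlib.h>
#include <string.h>

/* Illustrative struct with a flexible array member (C99), mirroring the
 * `char str[0]` -> `char str[]` change in rcu-string.h. */
struct name_buf {
	size_t len;
	char   str[];     /* flexible array member: unsized, must be last */
};

static struct name_buf *name_buf_dup(const char *src)
{
	size_t len = strlen(src) + 1;
	/*
	 * The allocation covers the header plus the trailing array. In the
	 * kernel this is what struct_size(buf, str, len) evaluates to, with
	 * the added benefit of overflow checking on the arithmetic.
	 */
	struct name_buf *buf = malloc(sizeof(*buf) + len * sizeof(buf->str[0]));

	if (!buf)
		return NULL;
	buf->len = len;
	memcpy(buf->str, src, len);
	return buf;
}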
*/ - table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kvzalloc(table_size, GFP_KERNEL); + table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM; @@ -1196,22 +1194,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct bio_list bio_list; struct bio *bio; int ret; bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } /* at this point we either have a full stripe, * or we've read the full stripe from the drive. @@ -1255,7 +1250,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) SetPageUptodate(p); pointers[stripe++] = kmap(p); - if (q_stripe != -1) { + if (has_qstripe) { /* * raid6, add the qstripe and call the @@ -2353,8 +2348,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct page *p_page = NULL; struct page *q_page = NULL; struct bio_list bio_list; @@ -2364,14 +2358,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { is_replace = 1; @@ -2393,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; SetPageUptodate(p_page); - if (q_stripe != -1) { + if (has_qstripe) { q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!q_page) { __free_page(p_page); @@ -2416,8 +2408,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* then add the parity stripe */ pointers[stripe++] = kmap(p_page); - if (q_stripe != -1) { - + if (has_qstripe) { /* * raid6, add the qstripe and call the * library function to fill in our p/q diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index a97dc74a4d3d..5c1a617eb25d 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -8,7 +8,7 @@ struct rcu_string { struct rcu_head rcu; - char str[0]; + char str[]; }; static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 454a1015d026..7887317033c9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -803,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, kfree(ref); kfree(ra); goto out_unlock; + } else if (be->num_refs == 0) { + btrfs_err(fs_info, + "trying to do action %d for a bytenr that has 0 total references", + action); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; } if (!parent) { diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c new file mode 100644 index 
000000000000..d1973141d3bb --- /dev/null +++ b/fs/btrfs/reflink.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blkdev.h> +#include <linux/iversion.h> +#include "compression.h" +#include "ctree.h" +#include "delalloc-space.h" +#include "reflink.h" +#include "transaction.h" + +#define BTRFS_MAX_DEDUPE_LEN SZ_16M + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen, + int no_time_update) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + if (!no_time_update) + inode->i_mtime = inode->i_ctime = current_time(inode); + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) { + i_size_write(inode, endoff); + btrfs_inode_safe_disk_i_size_write(inode, 0); + } + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + ret = btrfs_end_transaction(trans); +out: + return ret; +} + +static int copy_inline_to_page(struct inode *inode, + const u64 file_offset, + char *inline_data, + const u64 size, + const u64 datal, + const u8 comp_type) +{ + const u64 block_size = btrfs_inode_sectorsize(inode); + const u64 range_end = file_offset + block_size - 1; + const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); + char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); + struct extent_changeset *data_reserved = NULL; + struct page *page = NULL; + int ret; + + ASSERT(IS_ALIGNED(file_offset, block_size)); + + /* + * We have flushed and locked the ranges of the source and destination + * inodes, we also have locked the inodes, so we are safe to do a + * reservation here. Also we must not do the reservation while holding + * a transaction open, otherwise we would deadlock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); + if (ret) + goto out; + + page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(inode->i_mapping)); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + + set_page_extent_mapped(page); + clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); + if (ret) + goto out_unlock; + + if (comp_type == BTRFS_COMPRESS_NONE) { + char *map; + + map = kmap(page); + memcpy(map, data_start, datal); + flush_dcache_page(page); + kunmap(page); + } else { + ret = btrfs_decompress(comp_type, data_start, page, 0, + inline_size, datal); + if (ret) + goto out_unlock; + flush_dcache_page(page); + } + + /* + * If our inline data is smaller then the block/page size, then the + * remaining of the block/page is equivalent to zeroes. We had something + * like the following done: + * + * $ xfs_io -f -c "pwrite -S 0xab 0 500" file + * $ sync # (or fsync) + * $ xfs_io -c "falloc 0 4K" file + * $ xfs_io -c "pwrite -S 0xcd 4K 4K" + * + * So what's in the range [500, 4095] corresponds to zeroes. 
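[Editor's note] To put numbers on the comment above (block size assumed 4096 for the example): with 500 bytes of inline data only bytes [0, 499] of the block carry data, and bytes [500, 4095] must read back as zeroes, which is what the memset just below handles. A tiny standalone sketch of that tail zeroing:

#include <string.h>

#define BLOCK_SIZE 4096u   /* assumed sector/block size for the example */

/* Fill one block's worth of page data from `datal` bytes of inline data,
 * zeroing the remainder so reads beyond the data see zeroes. */
static void fill_block(char *page, const char *inline_data, size_t datal)
{
	memcpy(page, inline_data, datal);                  /* e.g. [0, 499]   */
	if (datal < BLOCK_SIZE)
		memset(page + datal, 0, BLOCK_SIZE - datal);   /* [500, 4095] = 0 */
}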
+ */ + if (datal < block_size) { + char *map; + + map = kmap(page); + memset(map + datal, 0, block_size - datal); + flush_dcache_page(page); + kunmap(page); + } + + SetPageUptodate(page); + ClearPageChecked(page); + set_page_dirty(page); +out_unlock: + if (page) { + unlock_page(page); + put_page(page); + } + if (ret) + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); +out: + extent_changeset_free(data_reserved); + + return ret; +} + +/* + * Deal with cloning of inline extents. We try to copy the inline extent from + * the source inode to destination inode when possible. When not possible we + * copy the inline extent's data into the respective page of the inode. + */ +static int clone_copy_inline_extent(struct inode *dst, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 size, + const u8 comp_type, + char *inline_data, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + fs_info->sectorsize); + struct btrfs_trans_handle *trans = NULL; + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) { + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + + key.objectid = btrfs_ino(BTRFS_I(dst)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + key.type == BTRFS_EXTENT_DATA_KEY) { + /* + * There's an implicit hole at file offset 0, copy the + * inline extent's data to the page. + */ + ASSERT(key.offset > 0); + ret = copy_inline_to_page(dst, new_key->offset, + inline_data, size, datal, + comp_type); + goto out; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent replace it with the source inline + * extent, otherwise copy the source inline extent data into + * the respective page at the destination inode. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + +copy_inline_extent: + ret = 0; + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * At the destination offset 0 we have either a hole, a regular + * extent or an inline extent larger then the one we want to + * clone. Deal with all these cases by copying the inline extent + * data into the respective page at the destination inode. + */ + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + + btrfs_release_path(path); + /* + * If we end up here it means were copy the inline extent into a leaf + * of the destination inode. 
We know we will drop or adjust at most one + * extent item in the destination root. + * + * 1 unit - adjusting old extent (we may have to split it) + * 1 unit - add new extent + * 1 unit - inode update + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + goto out; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + goto out; + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); +out: + if (!ret && !trans) { + /* + * No transaction here means we copied the inline extent into a + * page of the destination inode. + * + * 1 unit to update inode item + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + } + } + if (ret && trans) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + if (!ret) + *trans_out = trans; + + return ret; +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff, int no_time_update) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + const u64 len = olen_aligned; + u64 last_dest_end = destoff; + + ret = -ENOMEM; + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + kvfree(buf); + return ret; + } + + path->reada = READA_FORWARD; + /* Clone data */ + key.objectid = btrfs_ino(BTRFS_I(src)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = off; + + while (1) { + u64 next_key_min_offset = key.offset + 1; + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; + + /* Note the key will change type as we walk through the tree */ + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); + if (ret < 0) + goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. 
+ */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(BTRFS_I(src))) + break; + + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_clone_extent_info clone_info; + + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.item_size = size; + ret = btrfs_punch_hole_range(inode, path, drop_start, + new_key.offset + datal - 1, &clone_info, + &trans); + if (ret) + goto out; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* + * Inline extents always have to start at file offset 0 + * and can never be bigger then the sector size. 
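[Editor's note] Stepping back to the REG/PREALLOC branch just above: the extent item found in the source tree may stick out past either end of the requested clone range (the "a" and "b" parts in the diagram), so its data offset and length are clamped before the punch-hole-and-insert. A worked sketch of that clamping with made-up offsets:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Requested clone range [off, off + len) and the extent item found. */
	uint64_t off = 8192, len = 16384;      /* clone [8192, 24576)          */
	uint64_t key_offset = 4096;            /* extent starts before range   */
	uint64_t datao = 0, datal = 32768;     /* extent covers [4096, 36864)  */

	/* Subtract range b: extent runs past the end of the clone range. */
	if (key_offset + datal > off + len)
		datal = off + len - key_offset;    /* 24576 - 4096 = 20480 */

	/* Subtract range a: extent starts before the clone range. */
	if (off > key_offset) {
		datao += off - key_offset;         /* skip 4096 bytes of data */
		datal -= off - key_offset;         /* 20480 - 4096 = 16384    */
	}

	/* Result: clone 16384 bytes starting 4096 bytes into the extent data. */
	printf("datao=%llu datal=%llu\n",
	       (unsigned long long)datao, (unsigned long long)datal);
	return 0;
}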
We can + * never clone only parts of an inline extent, since all + * reflink operations must start at a sector size aligned + * offset, and the length must be aligned too or end at + * the i_size (which implies the whole inlined data). + */ + ASSERT(key.offset == 0); + ASSERT(datal <= fs_info->sectorsize); + if (key.offset != 0 || datal > fs_info->sectorsize) + return -EUCLEAN; + + ret = clone_copy_inline_extent(inode, path, &new_key, + drop_start, datal, size, + comp, buf, &trans); + if (ret) + goto out; + } + + btrfs_release_path(path); + + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; + + btrfs_release_path(path); + key.offset = next_key_min_offset; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + } + ret = 0; + + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. + */ + btrfs_release_path(path); + path->leave_spinning = 0; + + ret = btrfs_punch_hole_range(inode, path, last_dest_end, + destoff + len - 1, NULL, &trans); + if (ret) + goto out; + + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen, no_time_update); + } + +out: + btrfs_free_path(path); + kvfree(buf); + return ret; +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + } + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) +{ + const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + int ret; + + /* + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. 
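[Editor's note] btrfs_double_extent_lock() above avoids an ABBA deadlock by always taking the two range locks in one consistent order, keyed on the inode pointers and, when both ranges live in the same inode, on the offsets. A generic userspace sketch of the convention follows; the object type is hypothetical, one mutex per object stands in for the extent range lock, and the same-inode two-range case is collapsed into "lock once". Any consistent direction works; the point is that every caller agrees on it.

#include <pthread.h>

struct object {
	pthread_mutex_t lock;
	/* ... payload ... */
};

/* Always acquire the pair in the same global order to avoid ABBA. */
static void lock_pair(struct object *a, struct object *b)
{
	if (a == b) {                      /* same object: take the lock once */
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a > b) {                       /* canonical order: lower address first */
		struct object *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}

static void unlock_pair(struct object *a, struct object *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}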
+ */ + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff) +{ + int ret; + u64 i, tail_len, chunk_count; + struct btrfs_root *root_dst = BTRFS_I(dst)->root; + + spin_lock(&root_dst->root_item_lock); + if (root_dst->send_in_progress) { + btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", + root_dst->root_key.objectid, + root_dst->send_in_progress); + spin_unlock(&root_dst->root_item_lock); + return -EAGAIN; + } + root_dst->dedupe_in_progress++; + spin_unlock(&root_dst->root_item_lock); + + tail_len = olen % BTRFS_MAX_DEDUPE_LEN; + chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + + for (i = 0; i < chunk_count; i++) { + ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, + dst, dst_loff); + if (ret) + goto out; + + loff += BTRFS_MAX_DEDUPE_LEN; + dst_loff += BTRFS_MAX_DEDUPE_LEN; + } + + if (tail_len > 0) + ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: + spin_lock(&root_dst->root_item_lock); + root_dst->dedupe_in_progress--; + spin_unlock(&root_dst->root_item_lock); + + return ret; +} + +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + int wb_ret; + u64 len = olen; + u64 bs = fs_info->sb->s_blocksize; + + /* + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. + */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; + } + + /* + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. + */ + btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); + btrfs_double_extent_unlock(src, off, inode, destoff, len); + + /* + * We may have copied an inline extent into a page of the destination + * range, so wait for writeback to complete before truncating pages + * from the page cache. This is a rare case. + */ + wb_ret = btrfs_wait_ordered_range(inode, destoff, len); + ret = ret ? ret : wb_ret; + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. 
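[Editor's note] btrfs_extent_same() above caps each deduplication pass at BTRFS_MAX_DEDUPE_LEN (16 MiB) and walks the request as whole chunks plus a tail. A quick standalone sketch of the split: a hypothetical 40 MiB request becomes two 16 MiB chunks followed by an 8 MiB tail.

#include <stdint.h>
#include <stdio.h>

#define MAX_DEDUPE_LEN (16ULL * 1024 * 1024)   /* BTRFS_MAX_DEDUPE_LEN */

int main(void)
{
	uint64_t olen = 40ULL * 1024 * 1024;        /* example request: 40 MiB */
	uint64_t chunk_count = olen / MAX_DEDUPE_LEN;   /* -> 2 */
	uint64_t tail_len = olen % MAX_DEDUPE_LEN;      /* -> 8 MiB */
	uint64_t loff = 0, dst_loff = 0;

	for (uint64_t i = 0; i < chunk_count; i++) {
		/* dedupe [loff, loff + 16 MiB) against [dst_loff, ...) here */
		loff += MAX_DEDUPE_LEN;
		dst_loff += MAX_DEDUPE_LEN;
	}
	if (tail_len > 0) {
		/* dedupe the remaining tail_len bytes here */
	}
	printf("chunks=%llu tail=%llu bytes\n",
	       (unsigned long long)chunk_count, (unsigned long long)tail_len);
	return 0;
}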
+ */ + truncate_inode_pages_range(&inode->i_data, + round_down(destoff, PAGE_SIZE), + round_up(destoff + len, PAGE_SIZE) - 1); + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + /* Don't make the dst file partly checksummed */ + if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + return -EINVAL; + } + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. 
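[Editor's note] The wb_len computation above decides how much data to write back and wait on before the remap: for a whole-file clone request (*len == 0 and not a dedupe) it spans from the block containing pos_in up to i_size rounded up to a block, otherwise it is simply *len rounded up. A small worked sketch of the alignment arithmetic, with ALIGN/ALIGN_DOWN re-implemented for illustration and a power-of-two block size of 4096 assumed:

#include <stdint.h>
#include <stdio.h>

/* Power-of-two alignment helpers, as used for block-size rounding. */
static uint64_t align_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
static uint64_t align_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }

int main(void)
{
	const uint64_t bs = 4096;      /* assumed block size */
	uint64_t i_size = 10000;       /* example source file size */
	uint64_t pos_in = 5000;        /* example clone source offset */
	uint64_t len = 0;              /* 0 => clone to EOF */

	uint64_t wb_len;
	if (len == 0)
		wb_len = align_up(i_size, bs) - align_down(pos_in, bs); /* 12288 - 4096 = 8192 */
	else
		wb_len = align_up(len, bs);

	printf("wb_len = %llu\n", (unsigned long long)wb_len);
	return 0;
}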
+ */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + return ret; + + return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); +} + +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (same_inode) + inode_lock(src_inode); + else + lock_two_nondirectories(src_inode, dst_inode); + + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + +out_unlock: + if (same_inode) + inode_unlock(src_inode); + else + unlock_two_nondirectories(src_inode, dst_inode); + + return ret < 0 ? ret : len; +} diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h new file mode 100644 index 000000000000..ecb309b4dad0 --- /dev/null +++ b/fs/btrfs/reflink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_REFLINK_H +#define BTRFS_REFLINK_H + +#include <linux/fs.h> + +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); + +#endif /* BTRFS_REFLINK_H */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 995d4b8b1cfd..f65595602aa8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -22,6 +23,54 @@ #include "print-tree.h" #include "delalloc-space.h" #include "block-group.h" +#include "backref.h" + +/* + * Relocation overview + * + * [What does relocation do] + * + * The objective of relocation is to relocate all extents of the target block + * group to other block groups. + * This is utilized by resize (shrink only), profile converting, compacting + * space, or balance routine to spread chunks over devices. + * + * Before | After + * ------------------------------------------------------------------ + * BG A: 10 data extents | BG A: deleted + * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) + * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) + * + * [How does relocation work] + * + * 1. Mark the target block group read-only + * New extents won't be allocated from the target block group. + * + * 2.1 Record each extent in the target block group + * To build a proper map of extents to be relocated. + * + * 2.2 Build data reloc tree and reloc trees + * Data reloc tree will contain an inode, recording all newly relocated + * data extents. + * There will be only one data reloc tree for one data block group. + * + * Reloc tree will be a special snapshot of its source tree, containing + * relocated tree blocks. + * Each tree referring to a tree block in target block group will get its + * reloc tree built. 
+ * + * 2.3 Swap source tree with its corresponding reloc tree + * Each involved tree only refers to new extents after swap. + * + * 3. Cleanup reloc trees and data reloc tree. + * As old extents in the target block group are still referenced by reloc + * trees, we need to clean them up before really freeing the target block + * group. + * + * The main complexity is in steps 2.2 and 2.3. + * + * The entry point of relocation is relocate_block_group() function. + */ /* * backref_node, mapping_node and tree_block start with this @@ -256,6 +305,7 @@ static void free_backref_node(struct backref_cache *cache, { if (node) { cache->nr_nodes--; + btrfs_put_root(node->root); kfree(node); } } @@ -589,22 +639,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, root = (struct btrfs_root *)node->data; } spin_unlock(&rc->reloc_root_tree.lock); - return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID || - root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID || - root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return 1; - return 0; + return btrfs_grab_root(root); } static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, @@ -614,10 +649,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, key.objectid = root_objectid; key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; + key.offset = (u64)-1; return btrfs_get_fs_root(fs_info, &key, false); } @@ -711,8 +743,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = READA_FORWARD; - path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -899,10 +929,12 @@ again: /* tree root */ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_root(root); list_add(&cur->list, &useless); - else + } else { cur->root = root; + } break; } @@ -915,6 +947,7 @@ again: ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); path2->lowest_level = 0; if (ret < 0) { + btrfs_put_root(root); err = ret; goto out; } @@ -930,6 +963,7 @@ again: root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); + btrfs_put_root(root); err = -ENOENT; goto out; } @@ -941,15 +975,18 @@ again: if (!path2->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_root(root); list_add(&lower->list, &useless); - else + } else { lower->root = root; + } break; } edge = alloc_backref_edge(cache); if (!edge) { + btrfs_put_root(root); err = -ENOMEM; goto out; } @@ -959,6 +996,7 @@ again: if (!rb_node) { upper = alloc_backref_node(cache); if (!upper) { + btrfs_put_root(root); free_backref_edge(cache, edge); err = -ENOMEM; goto out; @@ -1006,8 +1044,10 @@ again: edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (rb_node) + if (rb_node) { + btrfs_put_root(root); break; + } lower = upper; upper = NULL; } @@ -1186,7 +1226,7 @@ out: free_backref_node(cache, lower); } - free_backref_node(cache, node); + remove_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node 
|| !node->detached); @@ -1244,7 +1284,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->level = node->level; new_node->lowest = node->lowest; new_node->checked = 1; - new_node->root = dest; + new_node->root = btrfs_grab_root(dest); + ASSERT(new_node->root); if (!node->lowest) { list_for_each_entry(edge, &node->lower, list[UPPER]) { @@ -1298,7 +1339,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (!node) return -ENOMEM; - node->bytenr = root->node->start; + node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); @@ -1325,14 +1366,16 @@ static void __del_reloc_root(struct btrfs_root *root) struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; + bool put_ref = false; if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) @@ -1340,9 +1383,22 @@ static void __del_reloc_root(struct btrfs_root *root) BUG_ON((struct btrfs_root *)node->data != root); } + /* + * We only put the reloc root here if it's on the list. There's a lot + * of places where the pattern is to splice the rc->reloc_roots, process + * the reloc roots, and then add the reloc root back onto + * rc->reloc_roots. If we call __del_reloc_root while it's off of the + * list we don't want the reference being dropped, because the guy + * messing with the list is in charge of the reference. + */ spin_lock(&fs_info->trans_lock); - list_del_init(&root->root_list); + if (!list_empty(&root->root_list)) { + put_ref = true; + list_del_init(&root->root_list); + } spin_unlock(&fs_info->trans_lock); + if (put_ref) + btrfs_put_root(root); kfree(node); } @@ -1350,7 +1406,7 @@ static void __del_reloc_root(struct btrfs_root *root) * helper to update the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1359,7 +1415,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1371,7 +1427,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = new_bytenr; + node->bytenr = root->node->start; rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); @@ -1447,8 +1503,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); + set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; } @@ -1456,6 +1513,9 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans, /* * create reloc tree for a given fs tree. reloc tree is just a * snapshot of the fs tree with special root objectid. + * + * The reloc_root comes out of here with two references, one for + * root->reloc_root, and another for being on the rc->reloc_roots list. */ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1467,6 +1527,10 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + /* * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree @@ -1480,10 +1544,6 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, return 0; } - if (!rc || !rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - if (!trans->reloc_reserved) { rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; @@ -1495,7 +1555,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - root->reloc_root = reloc_root; + root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -1516,6 +1576,13 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; + /* + * We are probably ok here, but __del_reloc_root() will drop its ref of + * the root. We have the ref for root->reloc_root, but just in case + * hold it while we update the reloc root. + */ + btrfs_grab_root(reloc_root); + /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { @@ -1529,6 +1596,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -1537,7 +1605,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); - + btrfs_put_root(reloc_root); out: return 0; } @@ -2211,7 +2279,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (list_empty(&root->reloc_dirty_list)) { - btrfs_grab_fs_root(root); + btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } } @@ -2231,24 +2299,34 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; - if (reloc_root) { - - ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; - } /* * Need barrier to ensure clear_bit() only happens after * root->reloc_root = NULL. Pairs with have_reloc_root. */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); - btrfs_put_fs_root(root); + if (reloc_root) { + /* + * btrfs_drop_snapshot drops our ref we hold for + * ->reloc_root. If it fails however we must + * drop the ref ourselves. 
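A pattern worth calling out in the relocation hunks above and below: every pointer to a root now pins a reference, lookups hand back a grabbed root, and each early-exit path has a matching btrfs_put_root(). The sketch below is a toy userspace model of that discipline; the names and structures are illustrative and are not the btrfs implementation.

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct root {
	atomic_int refs;
	int id;
};

/* Taking a new long-lived pointer to the object takes a reference. */
static struct root *root_grab(struct root *r)
{
	if (r)
		atomic_fetch_add(&r->refs, 1);
	return r;
}

/* Dropping the last reference frees the object. */
static void root_put(struct root *r)
{
	if (r && atomic_fetch_sub(&r->refs, 1) == 1) {
		printf("freeing root %d\n", r->id);
		free(r);
	}
}

/* A lookup hands the caller its own reference; the caller must put it. */
static struct root *lookup_root(struct root *table[], int n, int id)
{
	for (int i = 0; i < n; i++)
		if (table[i] && table[i]->id == id)
			return root_grab(table[i]);
	return NULL;
}

static int use_root(struct root *table[], int n, int id, int fail)
{
	struct root *r = lookup_root(table, n, id);

	if (!r)
		return -1;
	if (fail) {		/* The error path still owns the reference... */
		root_put(r);	/* ...so it must drop it before bailing out. */
		return -1;
	}
	printf("using root %d\n", r->id);
	root_put(r);
	return 0;
}

int main(void)
{
	struct root *r = malloc(sizeof(*r));

	atomic_init(&r->refs, 1);	/* the table's own reference */
	r->id = 5;
	struct root *table[1] = { r };

	use_root(table, 1, 5, 0);
	use_root(table, 1, 5, 1);
	root_put(table[0]);		/* drop the table's reference: frees */
	return 0;
}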
+ */ + ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(reloc_root); + if (!ret) + ret = ret2; + } + } + btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; + ret2 = btrfs_drop_snapshot(root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(root); + if (!ret) + ret = ret2; + } } } return ret; @@ -2325,6 +2403,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans = NULL; goto out; } + + /* + * At this point we no longer have a reloc_control, so we can't + * depend on btrfs_init_reloc_root to update our last_trans. + * + * But that's ok, we started the trans handle on our + * corresponding fs_root, which means it's been added to the + * dirty list. At commit time we'll still call + * btrfs_update_reloc_root() and update our root item + * appropriately. + */ + reloc_root->last_trans = trans->transid; trans->block_rsv = rc->block_rsv; replaced = 0; @@ -2435,7 +2525,7 @@ again: if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(fs_info, rc->block_rsv, - num_bytes); + num_bytes, NULL); return PTR_ERR(trans); } @@ -2443,7 +2533,7 @@ again: if (num_bytes != rc->merging_rsv_size) { btrfs_end_transaction(trans); btrfs_block_rsv_release(fs_info, rc->block_rsv, - num_bytes); + num_bytes, NULL); goto again; } } @@ -2468,6 +2558,7 @@ again: btrfs_update_reloc_root(trans, root); list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); } list_splice(&reloc_roots, &rc->reloc_roots); @@ -2488,10 +2579,6 @@ void free_reloc_roots(struct list_head *list) reloc_root = list_entry(list->next, struct btrfs_root, root_list); __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->node = NULL; - reloc_root->commit_root = NULL; } } @@ -2529,6 +2616,7 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); + btrfs_put_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, @@ -2561,7 +2649,21 @@ out: free_reloc_roots(&reloc_roots); } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. 
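clean_dirty_subvols() above keeps iterating even when an individual drop fails, and it remembers only the first error it saw. A minimal sketch of that "record the first failure, finish the cleanup anyway" pattern, using made-up error values:

#include <stdio.h>

/* Pretend per-item cleanup that fails for item 2. */
static int drop_one(int id)
{
	return id == 2 ? -5 : 0;	/* -EIO, say */
}

static int cleanup_all(const int *ids, int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		int ret2 = drop_one(ids[i]);

		/* Keep going so every item is cleaned up, but report the
		 * first error we hit rather than the last one. */
		if (ret2 < 0 && !ret)
			ret = ret2;
	}
	return ret;
}

int main(void)
{
	int ids[] = { 1, 2, 3 };

	printf("cleanup_all -> %d\n", cleanup_all(ids, 3));
	return 0;
}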
+ */ } static void free_block_list(struct rb_root *blocks) @@ -2580,6 +2682,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; + int ret; if (reloc_root->last_trans == trans->transid) return 0; @@ -2587,8 +2690,10 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); + ret = btrfs_record_root_in_trans(trans, root); + btrfs_put_root(root); - return btrfs_record_root_in_trans(trans, root); + return ret; } static noinline_for_stack @@ -2621,7 +2726,9 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(next->new_bytenr); BUG_ON(!list_empty(&next->list)); next->new_bytenr = root->node->start; - next->root = root; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); __mark_block_processed(rc, next); @@ -3040,7 +3147,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - BUG_ON(block->key_ready); eb = read_tree_block(fs_info, block->bytenr, block->key.offset, block->level, NULL); if (IS_ERR(eb)) { @@ -3073,6 +3179,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, if (!node) return 0; + /* + * If we fail here we want to drop our backref_node because we are going + * to start over and regenerate the tree for it. + */ + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + BUG_ON(node->processed); root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { @@ -3080,12 +3194,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - ret = reserve_metadata_space(trans, rc, node); - if (ret) - goto out; - } - if (root) { if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); @@ -3093,7 +3201,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_record_root_in_trans(trans, root); root = root->reloc_root; node->new_bytenr = root->node->start; - node->root = root; + btrfs_put_root(node->root); + node->root = btrfs_grab_root(root); + ASSERT(node->root); list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; @@ -3161,9 +3271,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) - err = ret; - goto out; + err = ret; + break; } } out: @@ -3264,6 +3373,15 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, return ret; } +/* + * Allow error injection to test balance cancellation + */ +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +{ + return atomic_read(&fs_info->balance_cancel_req); +} +ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); + static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { @@ -3385,6 +3503,10 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); + if (btrfs_should_cancel_balance(fs_info)) { + ret = -ECANCELED; + goto out; + } } WARN_ON(nr != cluster->nr); out: @@ -3556,31 +3678,6 @@ out: return ret; } 
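btrfs_should_cancel_balance() above is just an atomic read of the cancel request counter, checked at points in the relocation loops where it is safe to unwind, so that a cancel request (or the error-injection hook) turns into -ECANCELED promptly. A userspace sketch of the same shape, with the flag set from a signal handler instead of an ioctl; it is illustrative only.

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <errno.h>

static volatile sig_atomic_t cancel_req;

static void on_sigint(int sig)
{
	(void)sig;
	cancel_req = 1;		/* request cancellation */
}

static int should_cancel(void)
{
	return cancel_req != 0;
}

static int long_running_work(void)
{
	for (int pass = 0; pass < 1000; pass++) {
		usleep(10000);		/* one "cluster" of work */

		/* Check only at points where it is safe to unwind. */
		if (should_cancel())
			return -ECANCELED;
	}
	return 0;
}

int main(void)
{
	signal(SIGINT, on_sigint);
	printf("working; press Ctrl-C to cancel\n");
	printf("result: %d\n", long_running_work());
	return 0;
}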
-/* - * helper to check if the block use full backrefs for pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, - eb->start, btrfs_header_level(eb), 1, - NULL, &flags); - BUG_ON(ret); - - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - return ret; -} - static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group *block_group, struct inode *inode, @@ -3624,172 +3721,40 @@ out: } /* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans fs tree to find blocks reference the data extent + * Locate the free space cache EXTENT_DATA in root tree leaf and delete the + * cache inode, to avoid free space cache data extent blocking data relocation. */ -static int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) +static int delete_v1_space_cache(struct extent_buffer *leaf, + struct btrfs_block_group *block_group, + u64 data_bytenr) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; + u64 space_cache_ino; + struct btrfs_file_extent_item *ei; struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; + bool found = false; + int i; int ret; - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); - - /* - * This is an extent belonging to the free space cache, lets just delete - * it and redo the search. 
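delete_v1_space_cache(), added above, scans the items of a root tree leaf for the EXTENT_DATA item whose disk bytenr matches the data extent being relocated, and returns the owning free-space-cache inode, or -ENOENT if nothing matches. A toy, self-contained model of that scan follows; the structs and the key-type constant are simplified stand-ins, not the on-disk btrfs format.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define KEY_EXTENT_DATA 108	/* stand-in for BTRFS_EXTENT_DATA_KEY */

struct item_key {
	uint64_t objectid;	/* here: inode number of the cache file */
	uint8_t  type;
	uint64_t offset;
};

struct file_extent {
	uint64_t disk_bytenr;	/* where the extent's data lives on disk */
};

struct leaf_item {
	struct item_key key;
	struct file_extent fe;
};

/* Return the inode owning a free space cache extent at @data_bytenr. */
static int64_t find_space_cache_inode(const struct leaf_item *items, int nritems,
				      uint64_t data_bytenr)
{
	for (int i = 0; i < nritems; i++) {
		if (items[i].key.type != KEY_EXTENT_DATA)
			continue;
		if (items[i].fe.disk_bytenr == data_bytenr)
			return (int64_t)items[i].key.objectid;
	}
	return -ENOENT;
}

int main(void)
{
	const struct leaf_item leaf[] = {
		{ { 256, 1, 0 }, { 0 } },			/* some other item */
		{ { 257, KEY_EXTENT_DATA, 0 }, { 13631488 } },	/* cache inode 257 */
	};

	printf("owner: %lld\n",
	       (long long)find_space_cache_inode(leaf, 2, 13631488));
	printf("owner: %lld\n",
	       (long long)find_space_cache_inode(leaf, 2, 42));
	return 0;
}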
- */ - if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { - ret = delete_block_group_cache(fs_info, rc->block_group, - NULL, ref_objectid); - if (ret != -ENOENT) - return ret; - ret = 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - root = read_fs_root(fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - key.objectid = ref_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - if (ref_offset > ((u64)-1 << 32)) - key.offset = 0; - else - key.offset = ref_offset; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted in - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (WARN_ON(ret > 0)) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } + if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) + return 0; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (WARN_ON(key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY)) + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { + found = true; + space_cache_ino = key.objectid; break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - block->bytenr); } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - } -out: - btrfs_free_path(path); - return err; + if (!found) + return -ENOENT; + ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, + space_cache_ino); + return ret; } /* @@ -3801,91 +3766,41 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize = 
rc->extent_root->fs_info->nodesize; + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct ulist *leaves = NULL; + struct ulist_iterator leaf_uiter; + struct ulist_node *ref_node = NULL; + const u32 blocksize = fs_info->nodesize; int ret = 0; - int err = 0; - - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); - ptr += sizeof(struct btrfs_extent_item); - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_get_extent_inline_ref_type(eb, iref, - BTRFS_REF_TYPE_DATA); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = -EUCLEAN; - btrfs_err(rc->extent_root->fs_info, - "extent %llu slot %d has an invalid inline ref type", - eb->start, path->slots[0]); - } - if (ret) { - err = ret; - goto out; - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); + btrfs_release_path(path); + ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, + 0, &leaves, NULL, true); + if (ret < 0) + return ret; - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } + ULIST_ITER_INIT(&leaf_uiter); + while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + struct extent_buffer *eb; - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) + eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); break; - - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - ret = -EINVAL; - } else { - ret = 0; } - if (ret) { - err = ret; + ret = delete_v1_space_cache(eb, rc->block_group, + extent_key->objectid); + free_extent_buffer(eb); + if (ret < 0) + break; + ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); + if (ret < 0) break; - } - path->slots[0]++; } -out: - btrfs_release_path(path); - if (err) + if (ret < 0) free_block_list(blocks); - return err; + ulist_free(leaves); + return ret; } /* @@ -4137,12 +4052,6 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { - /* - * if we fail to relocate tree blocks, force to update - * backref cache when committing transaction. 
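The rewritten add_data_references() above no longer walks inline backrefs by hand; it asks btrfs_find_all_leafs() for every leaf referencing the extent and gets the answer back in a ulist, essentially a list of unique u64 values that it then iterates once. A toy model of such a container in plain C (deliberately naive, not the kernel's ulist):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct u64_set {
	uint64_t *vals;
	size_t len, cap;
};

/* Add a value, ignoring duplicates; returns 0, or -1 on allocation failure. */
static int u64_set_add(struct u64_set *s, uint64_t v)
{
	for (size_t i = 0; i < s->len; i++)
		if (s->vals[i] == v)
			return 0;	/* already present */
	if (s->len == s->cap) {
		size_t ncap = s->cap ? s->cap * 2 : 8;
		uint64_t *nv = realloc(s->vals, ncap * sizeof(*nv));

		if (!nv)
			return -1;
		s->vals = nv;
		s->cap = ncap;
	}
	s->vals[s->len++] = v;
	return 0;
}

int main(void)
{
	/* Leaf bytenrs referencing some data extent; duplicates collapse. */
	uint64_t refs[] = { 30408704, 30425088, 30408704, 30457856 };
	struct u64_set leaves = { 0 };

	for (size_t i = 0; i < sizeof(refs) / sizeof(refs[0]); i++)
		u64_set_add(&leaves, refs[i]);

	for (size_t i = 0; i < leaves.len; i++)
		printf("queue tree block at %llu\n",
		       (unsigned long long)leaves.vals[i]);

	free(leaves.vals);
	return 0;
}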
- */ - rc->backref_cache.last_trans = trans->transid - 1; - if (ret != -EAGAIN) { err = ret; break; @@ -4166,6 +4075,10 @@ restart: break; } } + if (btrfs_should_cancel_balance(fs_info)) { + err = -ECANCELED; + break; + } } if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); @@ -4195,15 +4108,23 @@ restart: set_reloc_control(rc); backref_cache_cleanup(&rc->backref_cache); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); + /* + * Even in the case when the relocation is cancelled, we should all go + * through prepare_to_merge() and merge_reloc_roots(). + * + * For error (including cancelled balance), prepare_to_merge() will + * mark all reloc trees orphan, then queue them for cleanup in + * merge_reloc_roots() + */ err = prepare_to_merge(rc, err); merge_reloc_roots(rc); rc->merge_reloc_tree = 0; unset_reloc_control(rc); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); @@ -4212,10 +4133,10 @@ restart: goto out_free; } btrfs_commit_transaction(trans); +out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); return err; @@ -4271,8 +4192,10 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(root); trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_put_root(root); return ERR_CAST(trans); + } err = btrfs_find_free_objectid(root, &objectid); if (err) @@ -4290,6 +4213,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: + btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { @@ -4317,6 +4241,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) return rc; } +static void free_reloc_control(struct reloc_control *rc) +{ + struct mapping_node *node, *tmp; + + free_reloc_roots(&rc->reloc_roots); + rbtree_postorder_for_each_entry_safe(node, tmp, + &rc->reloc_root_tree.rb_root, rb_node) + kfree(node); + + kfree(rc); +} + /* * Print the block group being relocated */ @@ -4461,7 +4397,7 @@ out: btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); - kfree(rc); + free_reloc_control(rc); return err; } @@ -4537,12 +4473,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root(root, &key); + reloc_root = btrfs_read_tree_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; } + set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); list_add(&reloc_root->root_list, &reloc_roots); if (btrfs_root_refs(&reloc_root->root_item) > 0) { @@ -4559,6 +4496,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } + } else { + btrfs_put_root(fs_root); } } @@ -4584,9 +4523,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - unset_reloc_control(rc); err = PTR_ERR(trans); - goto out_free; + goto out_unset; } rc->merge_reloc_tree = 1; @@ -4606,17 +4544,18 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); 
list_add_tail(&reloc_root->root_list, &reloc_roots); - goto out_free; + goto out_unset; } err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ - fs_root->reloc_root = reloc_root; + fs_root->reloc_root = btrfs_grab_root(reloc_root); + btrfs_put_root(fs_root); } err = btrfs_commit_transaction(trans); if (err) - goto out_free; + goto out_unset; merge_reloc_roots(rc); @@ -4625,15 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_free; + goto out_clean; } err = btrfs_commit_transaction(trans); - +out_clean: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: - kfree(rc); +out_unset: + unset_reloc_control(rc); + free_reloc_control(rc); out: if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -4643,10 +4583,12 @@ out: if (err == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) + if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); - else + } else { err = btrfs_orphan_cleanup(fs_root); + btrfs_put_root(fs_root); + } } return err; } @@ -4720,11 +4662,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (buf == root->node) - __update_reloc_root(root, cow->start); - } - level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) @@ -4795,6 +4732,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, /* * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot + * + * This is similar to btrfs_init_reloc_root(), we come out of here with two + * references held on the reloc_root, one for root->reloc_root and one for + * rc->reloc_roots. */ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) @@ -4827,7 +4768,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - new_root->reloc_root = reloc_root; + new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 612411c74550..668f22844017 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,7 +22,6 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item) { - uuid_le uuid; u32 len; int need_reset = 0; @@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, sizeof(*item) - offsetof(struct btrfs_root_item, generation_v2)); - uuid_le_gen(&uuid); - memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(item->uuid); } } @@ -255,25 +253,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) root_key.objectid = key.offset; key.offset++; - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. 
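Several hunks above only relabel error exits (out_free becomes out_unset, and a new out_clean is added) so that each failure point unwinds exactly the state set up before it, and so that cleanup such as clean_dirty_subvols() still runs when the final commit fails. The staged goto idiom they rely on looks like this in miniature; this is a generic sketch, not the btrfs functions themselves.

#include <stdio.h>
#include <stdlib.h>

static int do_step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

static int run(int fail_at)
{
	int err;

	char *buf = malloc(64);				/* state 1 */
	if (!buf)
		return -1;

	err = do_step("start transaction", fail_at == 1);	/* state 2 */
	if (err)
		goto out_free;

	err = do_step("do the work", fail_at == 2);
	if (err)
		goto out_unset;

	err = do_step("commit", fail_at == 3);
	/* fall through: remaining cleanup must run even if the commit failed */

out_unset:
	do_step("unset control state", 0);	/* undo state 2 */
out_free:
	free(buf);				/* undo state 1 */
	return err;
}

int main(void)
{
	for (int f = 0; f <= 3; f++)
		printf("fail_at=%d -> %d\n\n", f, run(f));
	return 0;
}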
- */ - root = btrfs_lookup_fs_root(fs_info, root_key.objectid); - if (root) { - WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, - &root->state)); - if (btrfs_root_refs(&root->root_item) == 0) { - set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); - btrfs_add_dead_root(root); - } - continue; - } - - root = btrfs_read_fs_root(tree_root, &root_key); + root = btrfs_get_fs_root(fs_info, &root_key, false); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; @@ -300,25 +280,12 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) continue; } - err = btrfs_init_fs_root(root); - if (err) { - btrfs_free_fs_root(root); - break; - } - - set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - - err = btrfs_insert_fs_root(fs_info, root); - if (err) { - BUG_ON(err == -EEXIST); - btrfs_free_fs_root(root); - break; - } - + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } + btrfs_put_root(root); } btrfs_free_path(path); @@ -553,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61b37c56a7fb..adaf8ab694d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -149,7 +149,7 @@ struct scrub_parity { */ unsigned long *ebitmap; - unsigned long bitmap[0]; + unsigned long bitmap[]; }; struct scrub_ctx { @@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + local_root = btrfs_get_fs_root(fs_info, &root_key, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; @@ -668,6 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { + btrfs_put_root(local_root); btrfs_release_path(swarn->path); goto err; } @@ -688,6 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ipath = init_ipath(4096, local_root, swarn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { + btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; goto err; @@ -711,6 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); + btrfs_put_root(local_root); free_ipath(ipath); return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a055b657cb85..c5f41bd86765 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) { struct btrfs_path *path; struct btrfs_root *root = sctx->send_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 extent_end; - u8 type; int ret; path = alloc_path_for_send(); @@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) goto out; - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = 
btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key.offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); out: btrfs_free_path(path); return ret; @@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, break; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(leaf, fi); - - extent_end = ALIGN(key.offset + size, - root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - } + extent_end = btrfs_file_extent_end(path); if (extent_end <= start) goto next; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { @@ -5704,9 +5681,6 @@ out: static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_file_extent_item *fi; - u64 extent_end; - u8 type; int ret = 0; if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) @@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, return ret; } - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key->offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key->offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - if (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset) { /* @@ -5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, else ret = 0; } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } @@ -7066,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; unsigned alloc_size; int sort_clone_roots = 0; - int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -7193,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + clone_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(clone_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -7205,20 +7163,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(clone_root); ret = -EPERM; goto out; } if (clone_root->dedupe_in_progress) { dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(clone_root); ret = -EAGAIN; goto out; } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; @@ -7232,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct 
btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(sctx->parent_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } @@ -7246,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(sctx->parent_root) || btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } if (sctx->parent_root->dedupe_in_progress) { dedupe_in_progress_warn(sctx->parent_root); spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; } spin_unlock(&sctx->parent_root->root_item_lock); - - srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -7267,7 +7217,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * is behind the current send position. This is checked while searching * for possible clone sources. */ - sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + sctx->clone_roots[sctx->clone_roots_cnt++].root = + btrfs_grab_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -7352,18 +7303,24 @@ out: } if (sort_clone_roots) { - for (i = 0; i < sctx->clone_roots_cnt; i++) + for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } } else { - for (i = 0; sctx && i < clone_sources_to_rollback; i++) + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } btrfs_root_dec_send_in_progress(send_root); } - if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_put_root(sctx->parent_root); + } kvfree(clone_sources_tmp); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 01297c5b2666..8b0fe053a25d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,153 @@ #include "transaction.h" #include "block-group.h" +/* + * HOW DOES SPACE RESERVATION WORK + * + * If you want to know about delalloc specifically, there is a separate comment + * for that with the delalloc code. This comment is about how the whole system + * works generally. + * + * BASIC CONCEPTS + * + * 1) space_info. This is the ultimate arbiter of how much space we can use. + * There's a description of the bytes_ fields with the struct declaration, + * refer to that for specifics on each field. Suffice it to say that for + * reservations we care about total_bytes - SUM(space_info->bytes_) when + * determining if there is space to make an allocation. There is a space_info + * for METADATA, SYSTEM, and DATA areas. + * + * 2) block_rsv's. These are basically buckets for every different type of + * metadata reservation we have. You can see the comment in the block_rsv + * code on the rules for each type, but generally block_rsv->reserved is how + * much space is accounted for in space_info->bytes_may_use. + * + * 3) btrfs_calc*_size. These are the worst case calculations we used based + * on the number of items we will want to modify. 
We have one for changing + * items, and one for inserting new items. Generally we use these helpers to + * determine the size of the block reserves, and then use the actual bytes + * values to adjust the space_info counters. + * + * MAKING RESERVATIONS, THE NORMAL CASE + * + * We call into either btrfs_reserve_data_bytes() or + * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with + * num_bytes we want to reserve. + * + * ->reserve + * space_info->bytes_may_reserve += num_bytes + * + * ->extent allocation + * Call btrfs_add_reserved_bytes() which does + * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_reserved += extent_bytes + * + * ->insert reference + * Call btrfs_update_block_group() which does + * space_info->bytes_reserved -= extent_bytes + * space_info->bytes_used += extent_bytes + * + * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) + * + * Assume we are unable to simply make the reservation because we do not have + * enough space + * + * -> __reserve_bytes + * create a reserve_ticket with ->bytes set to our reservation, add it to + * the tail of space_info->tickets, kick async flush thread + * + * ->handle_reserve_ticket + * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set + * on the ticket. + * + * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space + * Flushes various things attempting to free up space. + * + * -> btrfs_try_granting_tickets() + * This is called by anything that either subtracts space from + * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the + * space_info->total_bytes. This loops through the ->priority_tickets and + * then the ->tickets list checking to see if the reservation can be + * completed. If it can the space is added to space_info->bytes_may_use and + * the ticket is woken up. + * + * -> ticket wakeup + * Check if ->bytes == 0, if it does we got our reservation and we can carry + * on, if not return the appropriate error (ENOSPC, but can be EINTR if we + * were interrupted.) + * + * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY + * + * Same as the above, except we add ourselves to the + * space_info->priority_tickets, and we do not use ticket->wait, we simply + * call flush_space() ourselves for the states that are safe for us to call + * without deadlocking and hope for the best. + * + * THE FLUSHING STATES + * + * Generally speaking we will have two cases for each state, a "nice" state + * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to + * reduce the locking over head on the various trees, and even to keep from + * doing any work at all in the case of delayed refs. Each of these delayed + * things however hold reservations, and so letting them run allows us to + * reclaim space so we can make new reservations. + * + * FLUSH_DELAYED_ITEMS + * Every inode has a delayed item to update the inode. Take a simple write + * for example, we would update the inode item at write time to update the + * mtime, and then again at finish_ordered_io() time in order to update the + * isize or bytes. We keep these delayed items to coalesce these operations + * into a single operation done on demand. These are an easy way to reclaim + * metadata space. + * + * FLUSH_DELALLOC + * Look at the delalloc comment to get an idea of how much space is reserved + * for delayed allocation. We can reclaim some of this space simply by + * running delalloc, but usually we need to wait for ordered extents to + * reclaim the bulk of this space. 
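To make the ticket flow described above concrete, here is a deliberately simplified, single-threaded model of the reservation path: a reservation that cannot be satisfied queues a ticket and bumps reclaim_size, the flusher frees space, and a grant pass walks the queue in order, completing tickets and shrinking reclaim_size as it goes. There are no wait queues and no priority list here; it is a sketch of the bookkeeping, not the space_info code.

#include <stdio.h>
#include <stdint.h>

#define MAX_TICKETS 8

struct ticket {
	uint64_t bytes;		/* still owed to this reservation */
	int done;
};

struct space_info {
	uint64_t total_bytes;
	uint64_t bytes_may_use;	/* already promised to reservations */
	uint64_t reclaim_size;	/* sum of bytes owed to queued tickets */
	struct ticket tickets[MAX_TICKETS];
	int nr_tickets;
};

static uint64_t free_bytes(const struct space_info *si)
{
	return si->total_bytes - si->bytes_may_use;
}

/* Try to reserve; on failure queue a ticket and account it in reclaim_size. */
static int reserve(struct space_info *si, uint64_t bytes)
{
	if (free_bytes(si) >= bytes) {
		si->bytes_may_use += bytes;
		return 0;
	}
	if (si->nr_tickets == MAX_TICKETS)
		return -1;	/* toy model: no room to queue */
	si->tickets[si->nr_tickets++] = (struct ticket){ .bytes = bytes };
	si->reclaim_size += bytes;
	return -1;		/* the caller would now wait on the ticket */
}

/* Called whenever space is returned: grant queued tickets in FIFO order. */
static void try_granting_tickets(struct space_info *si)
{
	for (int i = 0; i < si->nr_tickets; i++) {
		struct ticket *t = &si->tickets[i];

		if (t->done)
			continue;
		if (free_bytes(si) < t->bytes)
			break;	/* keep FIFO ordering: don't skip ahead */
		si->bytes_may_use += t->bytes;
		si->reclaim_size -= t->bytes;
		t->done = 1;
		printf("granted ticket %d (%llu bytes)\n", i,
		       (unsigned long long)t->bytes);
	}
}

int main(void)
{
	struct space_info si = { .total_bytes = 100, .bytes_may_use = 90 };

	if (reserve(&si, 30) < 0)
		printf("queued; reclaim_size=%llu\n",
		       (unsigned long long)si.reclaim_size);

	/* The flusher reclaims space (delalloc, delayed refs, ...). */
	si.bytes_may_use -= 50;
	try_granting_tickets(&si);
	printf("reclaim_size now %llu\n", (unsigned long long)si.reclaim_size);
	return 0;
}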
+ * + * FLUSH_DELAYED_REFS + * We have a block reserve for the outstanding delayed refs space, and every + * delayed ref operation holds a reservation. Running these is a quick way + * to reclaim space, but we want to hold this until the end because COW can + * churn a lot and we can avoid making some extent tree modifications if we + * are able to delay for as long as possible. + * + * ALLOC_CHUNK + * We will skip this the first time through space reservation, because of + * overcommit and we don't want to have a lot of useless metadata space when + * our worst case reservations will likely never come true. + * + * RUN_DELAYED_IPUTS + * If we're freeing inodes we're likely freeing checksums, file extent + * items, and extent tree items. Loads of space could be freed up by these + * operations, however they won't be usable until the transaction commits. + * + * COMMIT_TRANS + * may_commit_transaction() is the ultimate arbiter on whether we commit the + * transaction or not. In order to avoid constantly churning we do all the + * above flushing first and then commit the transaction as the last resort. + * However we need to take into account things like pinned space that would + * be freed, plus any delayed work we may not have gotten rid of in the case + * of metadata. + * + * OVERCOMMIT + * + * Because we hold so many reservations for metadata we will allow you to + * reserve more space than is currently free in the currently allocate + * metadata space. This only happens with metadata, data does not allow + * overcommitting. + * + * You can see the current logic for when we allow overcommit in + * btrfs_can_overcommit(), but it only applies to unallocated space. If there + * is no unallocated space to be had, all reservations are kept within the + * free space in the allocated metadata chunks. + * + * Because of overcommitting, you generally want to use the + * btrfs_can_overcommit() logic for metadata allocations, as it does the right + * thing with or without extra unallocated space. + */ + u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -159,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; - u64 used; int factor; - /* Don't overcommit when in mixed mode. 
*/ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); - used = btrfs_space_info_used(space_info, true); avail = atomic64_read(&fs_info->free_chunk_space); /* @@ -198,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + return avail; +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + u64 avail; + u64 used; + + /* Don't overcommit when in mixed mode */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + used = btrfs_space_info_used(space_info, true); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -232,6 +389,8 @@ again: space_info, ticket->bytes); list_del_init(&ticket->list); + ASSERT(space_info->reclaim_size >= ticket->bytes); + space_info->reclaim_size -= ticket->bytes; ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -627,15 +786,26 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { - struct reserve_ticket *ticket; u64 used; + u64 avail; u64 expected; - u64 to_reclaim = 0; + u64 to_reclaim = space_info->reclaim_size; + + lockdep_assert_held(&space_info->lock); + + avail = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + used = btrfs_space_info_used(space_info, true); + + /* + * We may be flushing because suddenly we have less space than we had + * before, and now we're well over-committed based on our current free + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ + if (space_info->total_bytes + avail < used) + to_reclaim += used - (space_info->total_bytes + avail); - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; if (to_reclaim) return to_reclaim; @@ -1020,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ASSERT(space_info->reclaim_size >= 0); ticket.bytes = orig_bytes; ticket.error = 0; + space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); if (flush == BTRFS_RESERVE_FLUSH_ALL) { list_add_tail(&ticket.list, &space_info->tickets); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 24514cd2c6c1..0a5001ef1481 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -54,6 +54,13 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + + /* + * Size of space that needs to be reclaimed in order to satisfy pending + * tickets + */ + u64 reclaim_size; + /* * tickets_id just indicates the next ticket will be handled, so note * it's not stored per ticket. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 67c63858812a..7932d8d07cff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; - trans->aborted = errno; + WRITE_ONCE(trans->aborted, errno); /* Nothing used. 
The other threads that have joined this * transaction may be able to continue. */ if (!trans->dirty && list_empty(&trans->new_bgs)) { @@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; #endif case Opt_err: - btrfs_info(info, "unrecognized mount option '%s'", p); + btrfs_err(info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -1024,11 +1024,11 @@ out: return error; } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *fs_root; + struct btrfs_root *fs_root = NULL; struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; @@ -1096,9 +1096,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.objectid = subvol_objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + fs_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); + fs_root = NULL; goto err; } @@ -1143,6 +1144,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ptr[0] = '/'; btrfs_release_path(path); } + btrfs_put_root(fs_root); + fs_root = NULL; } btrfs_free_path(path); @@ -1155,6 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, return name; err: + btrfs_put_root(fs_root); btrfs_free_path(path); kfree(name); return ERR_PTR(ret); @@ -1438,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, goto out; } } - subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), - subvol_objectid); + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; @@ -1518,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need - * it for searching for existing supers, so this lets us do that and - * then open_ctree will properly initialize everything later. + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. 
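The sysfs changes further down swap snprintf() for scnprintf(). The difference matters when return values are accumulated: snprintf() returns the length the output would have had, which can exceed the remaining space and push the running offset past the buffer, while the kernel's scnprintf() returns only what was actually stored. A userspace illustration follows, with a small stand-in for scnprintf() since libc does not provide one.

#include <stdio.h>
#include <stdarg.h>

/* Userspace stand-in for the kernel's scnprintf(): clamp the return value
 * to the number of characters actually written (excluding the NUL). */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list ap;
	int ret;

	if (size == 0)
		return 0;
	va_start(ap, fmt);
	ret = vsnprintf(buf, size, fmt, ap);
	va_end(ap);
	return ret >= (int)size ? (int)size - 1 : ret;
}

int main(void)
{
	char buf[16];
	const char *words[] = { "zlib", "lzo", "zstd", "averyverylongname" };
	int len;

	/* snprintf: the running length can overshoot the buffer, after which
	 * "buf + len" and "size - len" are no longer meaningful. */
	len = 0;
	for (int i = 0; i < 4 && len < (int)sizeof(buf); i++)
		len += snprintf(buf + len, sizeof(buf) - len, "%s ", words[i]);
	printf("snprintf  total=%d (buffer is only %zu bytes)\n", len, sizeof(buf));

	/* scnprintf-style accumulation never walks past the buffer. */
	len = 0;
	for (int i = 0; i < 4; i++)
		len += my_scnprintf(buf + len, sizeof(buf) - len, "%s ", words[i]);
	printf("scnprintf total=%d, buf=\"%s\"\n", len, buf);
	return 0;
}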
*/ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; } + btrfs_init_fs_info(fs_info); fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); @@ -1571,7 +1578,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, if (s->s_root) { btrfs_close_devices(fs_devices); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { @@ -1594,7 +1601,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error_close_devices: btrfs_close_devices(fs_devices); error_fs_info: - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); error_sec_opts: security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error); @@ -2170,7 +2177,7 @@ static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); } static struct file_system_type btrfs_fs_type = { @@ -2203,7 +2210,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file) } /* - * used by btrfsctl to scan devices when no FS is mounted + * Used by /dev/btrfs-control for devices ioctls. */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3c10e78924d0..a39bff64ff24 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "0\n"); + return scnprintf(buf, PAGE_SIZE, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", (i == 0 ? 
"" : " "), btrfs_super_csum_name(i)); } - ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%d\n", + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } @@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } @@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } @@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static struct attribute *raid_attrs[] = { @@ -613,7 +613,7 @@ static ssize_t 
btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj); s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return snprintf(buf, PAGE_SIZE, "%lld\n", val); + return scnprintf(buf, PAGE_SIZE, "%lld\n", val); } SPACE_INFO_ATTR(flags); @@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", + fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%pU\n", + return scnprintf(buf, PAGE_SIZE, "%pU\n", fs_info->fs_devices->metadata_uuid); } @@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return snprintf(buf, PAGE_SIZE, "%s (%s)\n", + return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(fs_info->csum_shash)); } @@ -960,7 +959,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -992,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; - len += snprintf(str + len, bufsize - len, "%s%s", + len += scnprintf(str + len, bufsize - len, "%s%s", len ? 
"," : "", name); } @@ -1149,7 +1148,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, /* when one_device is NULL, it removes all device links */ -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -1201,11 +1200,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); -static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, +static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; @@ -1214,9 +1213,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } -BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); +BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, @@ -1228,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1241,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); @@ -1269,7 +1268,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -1371,7 +1370,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); - kobject_put(&fs_devs->fsid_kobj); + btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } @@ -1395,13 +1394,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) btrfs_set_fs_info_ptr(fs_info); - error = btrfs_sysfs_add_device_link(fs_devs, NULL); + error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_rm_device_link(fs_devs, NULL); + btrfs_sysfs_remove_devices_dir(fs_devs, NULL); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c68582add92e..718a26c97833 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,9 +14,9 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int 
btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 84fb3fa940a6..999c14e5d0bd 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) kfree(fs_info); return NULL; } + INIT_LIST_HEAD(&fs_info->fs_devices->devices); + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), GFP_KERNEL); if (!fs_info->super_copy) { @@ -128,39 +130,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) return NULL; } + btrfs_init_fs_info(fs_info); + fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; - - if (init_srcu_struct(&fs_info->subvol_srcu)) { - kfree(fs_info->fs_devices); - kfree(fs_info->super_copy); - kfree(fs_info); - return NULL; - } - - spin_lock_init(&fs_info->buffer_lock); - spin_lock_init(&fs_info->qgroup_lock); - spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - mutex_init(&fs_info->qgroup_rescan_lock); - rwlock_init(&fs_info->tree_mod_log_lock); - fs_info->running_transaction = NULL; - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_ulist = NULL; - atomic64_set(&fs_info->tree_mod_seq, 0); - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - INIT_LIST_HEAD(&fs_info->dead_roots); - INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); - INIT_LIST_HEAD(&fs_info->fs_devices->devices); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - extent_map_tree_init(&fs_info->mapping_tree); - fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -210,8 +183,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); - cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->super_copy); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->fs_devices); kfree(fs_info); } @@ -223,11 +197,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) { - /* One for allocate_extent_buffer */ - free_extent_buffer(root->node); - } - kfree(root); + btrfs_put_root(root); } struct btrfs_block_group * diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ac035a6fa003..ce1ca8e73c2d 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { @@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); test_msg("running qgroup tests"); ret = test_no_shared_qgroup(root, sectorsize, 
nodesize); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index beb6c69cd1e5..8cede6eb9843 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -221,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -243,7 +243,7 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { - if (cur_trans->aborted) { + if (TRANS_ABORTED(cur_trans)) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } @@ -336,6 +336,8 @@ loop: list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS, NULL); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -459,7 +461,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans) { return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && - !trans->aborted); + !TRANS_ABORTED(trans)); } /* wait for commit against the current transaction to become unblocked @@ -478,7 +480,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || - cur_trans->aborted); + TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } else { spin_unlock(&fs_info->trans_lock); @@ -673,7 +675,7 @@ join_fail: alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes); + num_bytes, NULL); reserve_fail: btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); return ERR_PTR(ret); @@ -896,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); + trans->bytes_reserved, NULL); trans->bytes_reserved = 0; } @@ -937,7 +939,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (trans->aborted || + if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); err = -EIO; @@ -1262,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); - if (list_empty(&root->root_list)) + if (list_empty(&root->root_list)) { + btrfs_grab_root(root); list_add_tail(&root->root_list, &fs_info->dead_roots); + } spin_unlock(&fs_info->trans_lock); } @@ -1477,7 +1481,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; - uuid_le new_uuid; ASSERT(pending->path); path = pending->path; @@ -1570,8 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_generation_v2(new_root_item, trans->transid); - uuid_le_gen(&new_uuid); - memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(new_root_item->uuid); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { @@ -1633,7 +1635,7 @@ static noinline 
int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); + pending->snap = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); @@ -1682,7 +1684,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, + ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1794,7 +1797,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, struct btrfs_transaction *trans) { wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); + trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(trans)); } /* @@ -1806,7 +1810,8 @@ static void wait_current_trans_commit_start_and_unblock( struct btrfs_transaction *trans) { wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); + trans->state >= TRANS_STATE_UNBLOCKED || + TRANS_ABORTED(trans)); } /* @@ -2026,7 +2031,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trans->dirty = true; /* Stop the commit early if ->aborted is set */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; btrfs_end_transaction(trans); return ret; @@ -2100,7 +2105,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_for_commit(cur_trans); - if (unlikely(cur_trans->aborted)) + if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); @@ -2119,7 +2124,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); - ret = prev_trans->aborted; + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); if (ret) @@ -2173,8 +2178,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - /* ->aborted might be set after the previous check, so check it */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; } @@ -2191,10 +2195,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * We insert the dir indexes of the snapshots and update the inode @@ -2207,16 +2209,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * the nodes and leaves. 
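The transaction.c hunks above and below replace open-coded reads of ->aborted with the TRANS_ABORTED() macro added further down in transaction.h: the abort code is written once, outside of any lock, so readers use a READ_ONCE-style access and only rely on the value being stable after it becomes non-zero. A rough userspace analogue using C11 atomics (the demo_* names are made up):

#include <stdatomic.h>
#include <stdio.h>

struct demo_transaction {
        _Atomic int aborted;    /* 0 while running, a negative errno once aborted */
};

/* Analogue of TRANS_ABORTED(): a lockless check of a write-once status. */
#define DEMO_TRANS_ABORTED(t) \
        (atomic_load_explicit(&(t)->aborted, memory_order_relaxed) != 0)

static void demo_abort_transaction(struct demo_transaction *t, int error)
{
        int expected = 0;

        /* the first abort wins; the status never goes back to zero */
        atomic_compare_exchange_strong(&t->aborted, &expected, error);
}

int main(void)
{
        static struct demo_transaction trans;   /* zero-initialized: not aborted */

        printf("aborted before: %d\n", DEMO_TRANS_ABORTED(&trans));
        demo_abort_transaction(&trans, -5);     /* say, -EIO */
        printf("aborted after:  %d\n", DEMO_TRANS_ABORTED(&trans));
        return 0;
}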
*/ ret = btrfs_run_delayed_items(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * make sure none of the code above managed to slip in a @@ -2242,11 +2240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_lock(&fs_info->tree_log_mutex); ret = commit_fs_roots(trans); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since the transaction is done, we can apply the pending changes @@ -2264,39 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * new delayed refs. Must handle them or qgroup can be wrong. */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ ret = btrfs_qgroup_account_extents(trans); - if (ret < 0) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret < 0) + goto unlock_tree_log; ret = commit_cowonly_roots(trans); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; + goto unlock_tree_log; } btrfs_prepare_extent_commit(fs_info); @@ -2343,6 +2327,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); + /* + * reloc_mutex has been unlocked, tree_log_mutex is still held + * but we can't jump to unlock_tree_log causing double unlock + */ mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2391,6 +2379,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; +unlock_tree_log: + mutex_unlock(&fs_info->tree_log_mutex); +unlock_reloc: + mutex_unlock(&fs_info->reloc_mutex); scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: @@ -2434,13 +2426,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); + ret = btrfs_drop_snapshot(root, 0, 0); else - ret = btrfs_drop_snapshot(root, NULL, 1, 0); + ret = btrfs_drop_snapshot(root, 1, 0); + btrfs_put_root(root); return (ret < 0) ? 
0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 49f7196368f5..31ae8d273065 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -71,6 +71,7 @@ struct btrfs_transaction { */ struct list_head io_bgs; struct list_head dropped_roots; + struct extent_io_tree pinned_extents; /* * we need to make sure block group deletion doesn't race with @@ -115,6 +116,10 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; refcount_t use_count; unsigned int type; + /* + * Error code of transaction abort, set outside of locks and must use + * the READ_ONCE/WRITE_ONCE access + */ short aborted; bool adding_csums; bool allocating_chunk; @@ -126,6 +131,14 @@ struct btrfs_trans_handle { struct list_head new_bgs; }; +/* + * The abort status can be changed between calls and is not protected by locks. + * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's + * set to a non-zero value it does not change, so the macro should be in checks + * but is not necessary for further reads of the value. + */ +#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) + struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7dd7552f53a4..58c111474ba5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,8 @@ #include "compression.h" #include "qgroup.h" #include "inode-map.h" +#include "block-group.h" +#include "space-info.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -94,8 +96,8 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) - ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { @@ -830,6 +832,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, + extent_end - start); + if (ret) + goto out; + inode_add_bytes(inode, nbytes); update_inode: ret = btrfs_update_inode(trans, root, inode); @@ -2659,18 +2666,39 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return ret; } +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return; + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->reserved -= fs_info->nodesize; + cache->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - struct extent_buffer *parent; 
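The btrfs_commit_transaction() changes above funnel the repeated "unlock both mutexes and bail" blocks into shared unlock_tree_log/unlock_reloc labels, so each error site jumps to the first label whose lock it still holds. A simplified sketch of that unwinding shape with pthread mutexes (the names and the step() helper are illustrative, and unlike the real function the success path here simply falls through the labels):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reloc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tree_log_mutex = PTHREAD_MUTEX_INITIALIZER;

static int step(int fail)
{
        return fail ? -1 : 0;
}

/* Unlock in reverse order of acquisition; later error sites jump to
 * earlier labels so every held lock is released exactly once. */
static int demo_commit(int fail_early, int fail_late)
{
        int ret;

        pthread_mutex_lock(&reloc_mutex);

        ret = step(fail_early);
        if (ret)
                goto unlock_reloc;

        pthread_mutex_lock(&tree_log_mutex);

        ret = step(fail_late);
        if (ret)
                goto unlock_tree_log;

        /* success: fall through and drop both locks */
unlock_tree_log:
        pthread_mutex_unlock(&tree_log_mutex);
unlock_reloc:
        pthread_mutex_unlock(&reloc_mutex);
        return ret;
}

int main(void)
{
        printf("ok:    %d\n", demo_commit(0, 0));
        printf("early: %d\n", demo_commit(1, 0));
        printf("late:  %d\n", demo_commit(0, 1));
        return 0;
}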
u32 blocksize; int ret = 0; @@ -2690,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) return PTR_ERR(next); @@ -2720,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + bytenr, blocksize); + if (ret) { + free_extent_buffer(next); + return ret; + } } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - - WARN_ON(root_owner != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; + unaccount_log_buffer(fs_info, bytenr); } } free_extent_buffer(next); @@ -2762,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; int i; int slot; int ret; @@ -2775,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - struct extent_buffer *parent; - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); @@ -2799,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + path->nodes[*level]->start, + path->nodes[*level]->len); + if (ret) + return ret; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2880,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + next->start, next->len); + if (ret) + goto out; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); } - - ret = btrfs_pin_reserved_extent(fs_info, next->start, - next->len); - if (ret) - goto out; } } @@ -3283,8 +3299,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - free_extent_buffer(log->node); - kfree(log); + btrfs_put_root(log); } /* @@ -4518,13 +4533,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, static int btrfs_log_holes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + const u64 start, + const u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; 
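unaccount_log_buffer(), added in the tree-log.c hunk above, drops a node-sized reservation from both the block group and its space_info, taking the space_info lock before the block group lock in the same order as the surrounding reservation code. A toy pthread version of updating the two counters under nested locks (types and names are invented):

#include <pthread.h>
#include <stdio.h>

struct space_info {
        pthread_mutex_t lock;
        long bytes_reserved;
};

struct block_group {
        pthread_mutex_t lock;
        long reserved;
        struct space_info *space_info;
};

/* Take the outer space_info lock, then the block group lock, so an
 * observer holding both never sees the two counters disagree. */
static void demo_unaccount(struct block_group *bg, long nodesize)
{
        pthread_mutex_lock(&bg->space_info->lock);
        pthread_mutex_lock(&bg->lock);
        bg->reserved -= nodesize;
        bg->space_info->bytes_reserved -= nodesize;
        pthread_mutex_unlock(&bg->lock);
        pthread_mutex_unlock(&bg->space_info->lock);
}

int main(void)
{
        struct space_info si = { PTHREAD_MUTEX_INITIALIZER, 16384 };
        struct block_group bg = { PTHREAD_MUTEX_INITIALIZER, 16384, &si };

        demo_unaccount(&bg, 16384);
        printf("bg.reserved=%ld space_info.bytes_reserved=%ld\n",
               bg.reserved, si.bytes_reserved);
        return 0;
}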
struct btrfs_key key; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); - u64 prev_extent_end = 0; + u64 prev_extent_end = start; int ret; if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) @@ -4532,16 +4549,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + while (true) { - struct btrfs_file_extent_item *extent; struct extent_buffer *leaf = path->nodes[0]; - u64 len; + u64 extent_end; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); @@ -4558,9 +4580,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; + extent_end = btrfs_file_extent_end(path); + if (extent_end <= start) + goto next_slot; + /* We have a hole, log it. */ if (prev_extent_end < key.offset) { - const u64 hole_len = key.offset - prev_extent_end; + u64 hole_len; + + if (key.offset >= end) + hole_len = end - prev_extent_end; + else + hole_len = key.offset - prev_extent_end; /* * Release the path to avoid deadlocks with other code @@ -4590,27 +4621,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; } - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - prev_extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(leaf, extent); - prev_extent_end = key.offset + len; - } - + prev_extent_end = min(extent_end, end); + if (extent_end >= end) + break; +next_slot: path->slots[0]++; cond_resched(); } - if (prev_extent_end < i_size) { + if (prev_extent_end < end && prev_extent_end < i_size) { u64 hole_len; btrfs_release_path(path); - hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + hole_len = min(ALIGN(i_size, fs_info->sectorsize), end); + hole_len -= prev_extent_end; ret = btrfs_insert_file_extent(trans, root->log_root, ino, prev_extent_end, 0, 0, hole_len, 0, hole_len, @@ -4938,6 +4962,178 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, return ret; } +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_key *min_key, + const struct btrfs_key *max_key, + struct btrfs_path *path, + struct btrfs_path *dst_path, + const u64 logged_isize, + const bool recursive_logging, + const int inode_only, + const u64 start, + const u64 end, + struct btrfs_log_ctx *ctx, + bool *need_log_inode_item) +{ + struct btrfs_root *root = inode->root; + int ins_start_slot = 0; + int ins_nr = 0; + int ret; + + /* + * We must make sure we don't copy extent items that are entirely out of + * the range [start, end - 1]. This is not just an optimization to avoid + * copying but also needed to avoid a corruption where we end up with + * file extent items in the log tree that have overlapping ranges - this + * can happen if we race with ordered extent completion for ranges that + * are outside our target range. 
For example we copy an extent item and + * when we move to the next leaf, that extent was trimmed and a new one + * covering a subrange of it, but with a higher key, was inserted - we + * would then copy this other extent too, resulting in a log tree with + * 2 extent items that represent overlapping ranges. + * + * We can copy the entire extents at the range bondaries however, even + * if they cover an area outside the target range. That's ok. + */ + while (1) { + ret = btrfs_search_forward(root, min_key, path, trans->transid); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } +again: + /* Note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key->objectid != max_key->objectid) + break; + if (min_key->type > max_key->type) + break; + + if (min_key->type == BTRFS_INODE_ITEM_KEY) + *need_log_inode_item = false; + + if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { + u64 other_ino = 0; + u64 other_parent = 0; + + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], min_key, inode, + &other_ino, &other_parent); + if (ret < 0) { + return ret; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, + inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + + ret = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); + if (ret) + return ret; + btrfs_release_path(path); + goto next_key; + } + } + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + goto next_slot; + } + + if (min_key->type == BTRFS_EXTENT_DATA_KEY) { + const u64 extent_end = btrfs_file_extent_end(path); + + if (extent_end <= start) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, + path, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + goto next_slot; + } + if (extent_end >= end) { + ins_nr++; + if (ins_nr == 1) + ins_start_slot = path->slots[0]; + break; + } + } + + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + btrfs_release_path(path); +next_key: + if (min_key->offset < (u64)-1) { + min_key->offset++; + } else if (min_key->type < max_key->type) { + min_key->type++; + min_key->offset = 0; + } else { + break; + } + } + if (ins_nr) + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + + return ret; +} + /* log a single inode in 
the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4955,8 +5151,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4967,9 +5163,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - int nritems; - int ins_start_slot = 0; - int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -4987,6 +5180,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return -ENOMEM; } + start = ALIGN_DOWN(start, fs_info->sectorsize); + end = ALIGN(end, fs_info->sectorsize); + min_key.objectid = ino; min_key.type = BTRFS_INODE_ITEM_KEY; min_key.offset = 0; @@ -5100,139 +5296,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto out_unlock; } - while (1) { - ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, - path, trans->transid); - if (ret < 0) { - err = ret; - goto out_unlock; - } - if (ret != 0) - break; -again: - /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != ino) - break; - if (min_key.type > max_key.type) - break; - - if (min_key.type == BTRFS_INODE_ITEM_KEY) - need_log_inode_item = false; - - if ((min_key.type == BTRFS_INODE_REF_KEY || - min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { - u64 other_ino = 0; - u64 other_parent = 0; - - ret = btrfs_check_ref_name_override(path->nodes[0], - path->slots[0], &min_key, inode, - &other_ino, &other_parent); - if (ret < 0) { - err = ret; - goto out_unlock; - } else if (ret > 0 && ctx && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - if (ins_nr > 0) { - ins_nr++; - } else { - ins_nr = 1; - ins_start_slot = path->slots[0]; - } - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - - err = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); - if (err) - goto out_unlock; - btrfs_release_path(path); - goto next_key; - } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key.type == BTRFS_XATTR_ITEM_KEY) { - if (ins_nr == 0) - goto next_slot; - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - goto next_slot; - } - - if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { - ins_nr++; - goto next_slot; - } else if (!ins_nr) { - ins_start_slot = path->slots[0]; - ins_nr = 1; - goto next_slot; - } - - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 1; - ins_start_slot = path->slots[0]; -next_slot: - - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(path->nodes[0], &min_key, - path->slots[0]); - goto again; - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; 
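btrfs_log_inode() now takes a plain u64 byte range that is sector-aligned up front (start rounded down, end rounded up), and the big item-copy loop it used to contain lives in the new copy_inode_items_to_log() helper, which skips extent items entirely outside [start, end). A small standalone sketch of the alignment step, with the power-of-two ALIGN/ALIGN_DOWN behaviour written out for a userspace build:

#include <stdint.h>
#include <stdio.h>

/* Round down/up to a power-of-two alignment, equivalent to the kernel
 * macros for power-of-two values of 'a'. */
#define DEMO_ALIGN_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))
#define DEMO_ALIGN(x, a)       (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        const uint64_t sectorsize = 4096;
        uint64_t start = 5000, end = 12000;

        start = DEMO_ALIGN_DOWN(start, sectorsize);     /* 4096 */
        end = DEMO_ALIGN(end, sectorsize);              /* 12288 */

        printf("log range [%llu, %llu)\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}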
- goto out_unlock; - } - ins_nr = 0; - } - btrfs_release_path(path); -next_key: - if (min_key.offset < (u64)-1) { - min_key.offset++; - } else if (min_key.type < max_key.type) { - min_key.type++; - min_key.offset = 0; - } else { - break; - } - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } + err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + path, dst_path, logged_isize, + recursive_logging, inode_only, + start, end, ctx, &need_log_inode_item); + if (err) + goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5243,7 +5312,7 @@ next_key: if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path, start, end); if (err) goto out_unlock; } @@ -6145,7 +6214,7 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root(log_root_tree, &found_key); + log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_handle_fs_error(fs_info, ret, @@ -6157,7 +6226,7 @@ again: tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; - wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); @@ -6173,12 +6242,10 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(fs_info, + ret = btrfs_pin_extent_for_log_replay(trans, log->node->start, log->node->len); - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(log); if (!ret) goto next; @@ -6214,9 +6281,8 @@ again: } wc.replay_dest->log_root = NULL; - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(wc.replay_dest); + btrfs_put_root(log); if (ret) goto error; @@ -6247,10 +6313,9 @@ next: if (ret) return ret; - free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - kfree(log_root_tree); + btrfs_put_root(log_root_tree); return 0; error: diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 76b84f2397b1..76671a6bcb61 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -246,9 +246,53 @@ out: return ret; } -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)) +/* + * Check if there's an matching subvolume for given UUID + * + * Return: + * 0 check succeeded, the entry is not outdated + * > 0 if the check failed, the caller should remove the entry + * < 0 if an error occurred + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subvolid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subvolid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_get_fs_root(fs_info, &key, true); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 
1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + btrfs_put_root(subvol_root); +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; @@ -278,6 +322,10 @@ again_search_slot: } while (1) { + if (btrfs_fs_closing(fs_info)) { + ret = -EINTR; + goto out; + } cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; @@ -305,7 +353,8 @@ again_search_slot: read_extent_buffer(leaf, &subid_le, offset, sizeof(subid_le)); subid_cpu = le64_to_cpu(subid_le); - ret = check_func(fs_info, uuid, key.type, subid_cpu); + ret = btrfs_check_uuid_tree_entry(fs_info, uuid, + key.type, subid_cpu); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9cfc668f91f4..c1909e5f4506 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/kthread.h> @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, - struct buffer_head **bh) + struct btrfs_super_block **disk_super) { int ret; @@ -519,9 +518,9 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, goto error; } invalidate_bdev(*bdev); - *bh = btrfs_read_dev_super(*bdev); - if (IS_ERR(*bh)) { - ret = PTR_ERR(*bh); + *disk_super = btrfs_read_dev_super(*bdev); + if (IS_ERR(*disk_super)) { + ret = PTR_ERR(*disk_super); blkdev_put(*bdev, flags); goto error; } @@ -530,7 +529,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, error: *bdev = NULL; - *bh = NULL; return ret; } @@ -611,7 +609,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, { struct request_queue *q; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -622,17 +619,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); + &bdev, &disk_super); if (ret) return ret; - disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) - goto error_brelse; + goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) - goto error_brelse; + goto error_free_page; device->generation = btrfs_super_generation(disk_super); @@ -641,7 +637,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); - goto error_brelse; + goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -667,12 +663,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } - brelse(bh); + btrfs_release_disk_super(disk_super); return 0; -error_brelse: - brelse(bh); +error_free_page: + btrfs_release_disk_super(disk_super); blkdev_put(bdev, flags); return -EINVAL; @@ -1209,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 
fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; out: return ret; } @@ -1247,9 +1244,10 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -static void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct btrfs_super_block *super) { - kunmap(page); + struct page *page = virt_to_page(super); + put_page(page); } @@ -1277,17 +1275,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); - if (IS_ERR_OR_NULL(*page)) + if (IS_ERR(*page)) return 1; - p = kmap(*page); + p = page_address(*page); /* align our pointer to the offset of the super block */ *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(*page); + btrfs_release_disk_super(p); return 1; } @@ -1350,7 +1348,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, btrfs_free_stale_devices(path, device); } - btrfs_release_disk_super(page); + btrfs_release_disk_super(disk_super); error_bdev_put: blkdev_put(bdev, flags); @@ -1383,6 +1381,59 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +{ + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* + * We don't want to overwrite the superblock on the drive nor + * any area used by the boot loader (grub for example), so we + * make sure to start at an offset of at least 1MB. + */ + return max_t(u64, start, SZ_1M); + default: + BUG(); + } +} + +/** + * dev_extent_hole_check - check if specified hole is suitable for allocation + * @device: the device which we have the hole + * @hole_start: starting position of the hole + * @hole_size: the size of the hole + * @num_bytes: the size of the free space that we need + * + * This function may modify @hole_start and @hole_end to reflect the suitable + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + */ +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, + u64 *hole_size, u64 num_bytes) +{ + bool changed = false; + u64 hole_end = *hole_start + *hole_size; + + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + default: + BUG(); + } + + return changed; +} /* * find_free_dev_extent_start - find free space in the specified device @@ -1429,12 +1480,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device, int slot; struct extent_buffer *l; - /* - * We don't want to overwrite the superblock on the drive nor any area - * used by the boot loader (grub for example), so we make sure to start - * at an offset of at least 1MB. 
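dev_extent_search_start(), added above, picks the minimum allocation offset based on the per-fs_devices chunk_alloc_policy; for BTRFS_CHUNK_ALLOC_REGULAR, the only policy in this diff, it keeps the old rule of never allocating below 1MiB so the primary superblock and the boot loader area stay untouched. A compact userspace sketch of that policy switch (types are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SZ_1M (1024ULL * 1024ULL)

enum chunk_alloc_policy {
        CHUNK_ALLOC_REGULAR,
        /* further policies may be added behind this switch later */
};

static uint64_t dev_extent_search_start(enum chunk_alloc_policy policy,
                                        uint64_t start)
{
        switch (policy) {
        case CHUNK_ALLOC_REGULAR:
                /* never hand out space below 1MiB */
                return start > SZ_1M ? start : SZ_1M;
        default:
                abort();        /* an unknown policy is a programming error */
        }
}

int main(void)
{
        printf("%llu\n", (unsigned long long)
               dev_extent_search_start(CHUNK_ALLOC_REGULAR, 0));
        printf("%llu\n", (unsigned long long)
               dev_extent_search_start(CHUNK_ALLOC_REGULAR, 8 * SZ_1M));
        return 0;
}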
- */ - search_start = max_t(u64, search_start, SZ_1M); + search_start = dev_extent_search_start(device, search_start); path = btrfs_alloc_path(); if (!path) @@ -1492,18 +1538,8 @@ again: if (key.offset > search_start) { hole_size = key.offset - search_start; - - /* - * Have to check before we set max_hole_start, otherwise - * we could end up sending back this offset anyway. - */ - if (contains_pending_extent(device, &search_start, - hole_size)) { - if (key.offset >= search_start) - hole_size = key.offset - search_start; - else - hole_size = 0; - } + dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1542,8 +1578,8 @@ next: */ if (search_end > search_start) { hole_size = search_end - search_start; - - if (contains_pending_extent(device, &search_start, hole_size)) { + if (dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes)) { btrfs_release_path(path); goto again; } @@ -1949,6 +1985,46 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) +{ + struct btrfs_super_block *disk_super; + int copy_num; + + if (!bdev) + return; + + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { + struct page *page; + int ret; + + disk_super = btrfs_read_dev_one_super(bdev, copy_num); + if (IS_ERR(disk_super)) + continue; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + + page = virt_to_page(disk_super); + set_page_dirty(page); + lock_page(page); + /* write_on_page() unlocks the page */ + ret = write_one_page(page); + if (ret) + btrfs_warn(fs_info, + "error clearing superblock number %d (%d)", + copy_num, ret); + btrfs_release_disk_super(disk_super); + + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -2054,7 +2130,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2067,7 +2143,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * supers and free the device. 
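btrfs_scratch_superblocks() is now a static helper in volumes.c that takes the fs_info for its warning messages; it wipes the magic of every superblock copy it can read, writes the page back, then notifies udev and refreshes the device node timestamps. A reduced sketch of the "invalidate all copies by clearing the magic" idea (the struct and the magic constant are placeholders, not the on-disk format):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SUPER_MIRROR_MAX 3
#define DEMO_MAGIC 0x123456789abcdef0ULL        /* placeholder, not the btrfs magic */

struct demo_super {
        uint64_t magic;
        /* ... rest of the superblock ... */
};

/* Invalidate every superblock copy by wiping its magic; a cleared magic
 * is enough for a later scan to stop treating the device as part of a
 * filesystem. */
static void demo_scratch_superblocks(struct demo_super supers[SUPER_MIRROR_MAX])
{
        for (int i = 0; i < SUPER_MIRROR_MAX; i++)
                memset(&supers[i].magic, 0, sizeof(supers[i].magic));
}

int main(void)
{
        struct demo_super supers[SUPER_MIRROR_MAX] = {
                { DEMO_MAGIC }, { DEMO_MAGIC }, { DEMO_MAGIC }
        };

        demo_scratch_superblocks(supers);
        printf("magic[0] after scratch: %llu\n",
               (unsigned long long)supers[0].magic);
        return 0;
}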
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) - btrfs_scratch_superblocks(device->bdev, device->name->str); + btrfs_scratch_superblocks(fs_info, device->bdev, + device->name->str); btrfs_close_bdev(device); synchronize_rcu(); @@ -2135,7 +2212,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + btrfs_scratch_superblocks(fs_info, srcdev->bdev, + srcdev->name->str); } btrfs_close_bdev(srcdev); @@ -2174,7 +2252,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); + btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2194,7 +2272,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) * is already out of device list, so we don't have to hold * the device_list_mutex lock. */ - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, + tgtdev->name->str); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2209,14 +2288,13 @@ static struct btrfs_device *btrfs_find_device_by_path( u64 devid; u8 *dev_uuid; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_device *device; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &bh); + fs_info->bdev_holder, 0, &bdev, &disk_super); if (ret) return ERR_PTR(ret); - disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -2226,7 +2304,7 @@ static struct btrfs_device *btrfs_find_device_by_path( device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, disk_super->fsid, true); - brelse(bh); + btrfs_release_disk_super(disk_super); if (!device) device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); @@ -2522,7 +2600,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path orig_super_num_devices + 1); /* add sysfs device entry */ - btrfs_sysfs_add_device_link(fs_devices, device); + btrfs_sysfs_add_devices_dir(fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2590,7 +2668,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -3723,13 +3801,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -/* Non-zero return value signifies invalidity */ -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, - u64 allowed) +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false. 
+ */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, + const struct btrfs_balance_args *bargs, + u64 allowed, const char *type) { - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl_arg->target, 1) || - (bctl_arg->target & ~allowed))); + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return true; + + /* Profile is valid and does not have bits outside of the allowed set */ + if (alloc_profile_is_valid(bargs->target, 1) && + (bargs->target & ~allowed) == 0) + return true; + + btrfs_err(fs_info, "balance: invalid convert %s profile %s", + type, btrfs_bg_type_to_raid_name(bargs->target)); + return false; } /* @@ -3904,7 +3994,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { + btrfs_should_cancel_balance(fs_info)) { ret = -EINVAL; goto out; } @@ -3945,24 +4035,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; - if (validate_convert_profile(&bctl->data, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert data profile %s", - btrfs_bg_type_to_raid_name(bctl->data.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->meta, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert metadata profile %s", - btrfs_bg_type_to_raid_name(bctl->meta.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->sys, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert system profile %s", - btrfs_bg_type_to_raid_name(bctl->sys.target)); + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } @@ -4274,7 +4349,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -static int btrfs_uuid_scan_kthread(void *data) +int btrfs_uuid_scan_kthread(void *data) { struct btrfs_fs_info *fs_info = data; struct btrfs_root *root = fs_info->tree_root; @@ -4286,6 +4361,7 @@ static int btrfs_uuid_scan_kthread(void *data) struct btrfs_root_item root_item; u32 item_size; struct btrfs_trans_handle *trans = NULL; + bool closing = false; path = btrfs_alloc_path(); if (!path) { @@ -4298,6 +4374,10 @@ static int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { @@ -4397,76 +4477,12 @@ out: btrfs_end_transaction(trans); if (ret) btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else + else if (!closing) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); up(&fs_info->uuid_tree_rescan_sem); return 0; } -/* - * Callback for btrfs_uuid_tree_iterate(). - * returns: - * 0 check succeeded, the entry is not outdated. - * < 0 if an error occurred. - * > 0 if the check failed, which means the caller shall remove the entry. 
- */ -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subid) -{ - struct btrfs_key key; - int ret = 0; - struct btrfs_root *subvol_root; - - if (type != BTRFS_UUID_KEY_SUBVOL && - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; - - key.objectid = subid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(subvol_root)) { - ret = PTR_ERR(subvol_root); - if (ret == -ENOENT) - ret = 1; - goto out; - } - - switch (type) { - case BTRFS_UUID_KEY_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) - ret = 1; - break; - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.received_uuid, - BTRFS_UUID_SIZE)) - ret = 1; - break; - } - -out: - return ret; -} - -static int btrfs_uuid_rescan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; - int ret; - - /* - * 1st step is to iterate through the existing UUID tree and - * to delete all entries that contain outdated data. - * 2nd step is to add all missing entries to the UUID tree. - */ - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); - if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); - up(&fs_info->uuid_tree_rescan_sem); - return ret; - } - return btrfs_uuid_scan_kthread(data); -} - int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; @@ -4509,22 +4525,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct task_struct *task; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_rescan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -4777,96 +4777,111 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID1C34); } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - u64 start, u64 type) -{ - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device *device; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct btrfs_device_info *devices_info = NULL; - u64 total_avail; - int num_stripes; /* total number of stripes to allocate */ - int data_stripes; /* number of stripes that count for - block group size */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int nparity; /* number of stripes worth of bytes to - store parity information */ - int ret; +/* + * Structure used internally for __btrfs_alloc_chunk() function. + * Wraps needed parameters. 
+ */ +struct alloc_chunk_ctl { + u64 start; + u64 type; + /* Total number of stripes to allocate */ + int num_stripes; + /* sub_stripes info for map */ + int sub_stripes; + /* Stripes per device */ + int dev_stripes; + /* Maximum number of devices to use */ + int devs_max; + /* Minimum number of devices to use */ + int devs_min; + /* ndevs has to be a multiple of this */ + int devs_increment; + /* Number of copies */ + int ncopies; + /* Number of stripes worth of bytes to store parity information */ + int nparity; u64 max_stripe_size; u64 max_chunk_size; + u64 dev_extent_min; u64 stripe_size; u64 chunk_size; int ndevs; - int i; - int j; - int index; - - BUG_ON(!alloc_profile_is_valid(type, 0)); - - if (list_empty(&fs_devices->alloc_list)) { - if (btrfs_test_opt(info, ENOSPC_DEBUG)) - btrfs_debug(info, "%s: no writable device", __func__); - return -ENOSPC; - } - - index = btrfs_bg_flags_to_raid_index(type); +}; - sub_stripes = btrfs_raid_array[index].sub_stripes; - dev_stripes = btrfs_raid_array[index].dev_stripes; - devs_max = btrfs_raid_array[index].devs_max; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); - devs_min = btrfs_raid_array[index].devs_min; - devs_increment = btrfs_raid_array[index].devs_increment; - ncopies = btrfs_raid_array[index].ncopies; - nparity = btrfs_raid_array[index].nparity; +static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 type = ctl->type; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = SZ_1G; - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; + ctl->max_stripe_size = SZ_1G; + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ + /* For larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - max_stripe_size = SZ_1G; + ctl->max_stripe_size = SZ_1G; else - max_stripe_size = SZ_256M; - max_chunk_size = max_stripe_size; + ctl->max_stripe_size = SZ_256M; + ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = SZ_32M; - max_chunk_size = 2 * max_stripe_size; - devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + ctl->max_stripe_size = SZ_32M; + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); } else { - btrfs_err(info, "invalid chunk type 0x%llx requested", - type); BUG(); } /* We don't want a chunk larger than 10% of writable space */ - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size); + ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; +} + +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + int index = btrfs_bg_flags_to_raid_index(ctl->type); + + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl->devs_max = btrfs_raid_array[index].devs_max; + if (!ctl->devs_max) + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); + ctl->devs_min = btrfs_raid_array[index].devs_min; + ctl->devs_increment = btrfs_raid_array[index].devs_increment; + ctl->ncopies = btrfs_raid_array[index].ncopies; + ctl->nparity = btrfs_raid_array[index].nparity; + ctl->ndevs = 0; + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + 
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); + break; + default: + BUG(); + } +} - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), - GFP_NOFS); - if (!devices_info) - return -ENOMEM; +static int gather_device_info(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + struct btrfs_device *device; + u64 total_avail; + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; + int ret; + int ndevs = 0; + u64 max_avail; + u64 dev_offset; /* * in the first pass through the devices list, we gather information * about the available holes on each device. */ - ndevs = 0; list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 max_avail; - u64 dev_offset; - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); @@ -4884,24 +4899,23 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, total_avail = 0; /* If there is no space on this device, skip it. */ - if (total_avail == 0) + if (total_avail < ctl->dev_extent_min) continue; - ret = find_free_dev_extent(device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, + &max_avail); if (ret && ret != -ENOSPC) - goto error; + return ret; if (ret == 0) - max_avail = max_stripe_size * dev_stripes; + max_avail = dev_extent_want; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (max_avail < ctl->dev_extent_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, - "%s: devid %llu has no free space, have=%llu want=%u", + "%s: devid %llu has no free space, have=%llu want=%llu", __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * dev_stripes); + ctl->dev_extent_min); continue; } @@ -4916,6 +4930,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].dev = device; ++ndevs; } + ctl->ndevs = ndevs; /* * now sort the devices by hole size / available space @@ -4923,23 +4938,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - /* - * Round down to number of usable stripes, devs_increment can be any - * number so we can't use round_down() - */ - ndevs -= ndevs % devs_increment; - - if (ndevs < devs_min) { - ret = -ENOSPC; - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { - btrfs_debug(info, - "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ndevs, devs_min); - } - goto error; - } + return 0; +} - ndevs = min(ndevs, devs_max); +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + /* Number of stripes that count for block group size */ + int data_stripes; /* * The primary goal is to maximize the number of stripes, so use as @@ -4948,73 +4954,116 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. 
*/ - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); - num_stripes = ndevs * dev_stripes; + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; - /* - * this will have to be fixed for RAID1 and RAID10 over - * more drives - */ - data_stripes = (num_stripes - nparity) / ncopies; + /* This will have to be fixed for RAID1 and RAID10 over more drives */ + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* - * Use the number of data stripes to figure out how big this chunk - * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size. If it's higher, - * we try to reduce stripe_size. + * Use the number of data stripes to figure out how big this chunk is + * really going to be in terms of logical address space, and compare + * that answer with the max chunk size. If it's higher, we try to + * reduce stripe_size. */ - if (stripe_size * data_stripes > max_chunk_size) { + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ - stripe_size = min(round_up(div_u64(max_chunk_size, - data_stripes), SZ_16M), - stripe_size); + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, + data_stripes), SZ_16M), + ctl->stripe_size); } - /* align to BTRFS_STRIPE_LEN */ - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); + /* Align to BTRFS_STRIPE_LEN */ + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); + ctl->chunk_size = ctl->stripe_size * data_stripes; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - ret = -ENOMEM; - goto error; + return 0; +} + +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() that requires power of 2, while + * rounddown is safe. 
+ */ + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); + + if (ctl->ndevs < ctl->devs_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ctl->ndevs, ctl->devs_min); + } + return -ENOSPC; } - map->num_stripes = num_stripes; - for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + return decide_stripe_size_regular(ctl, devices_info); + default: + BUG(); + } +} + +static int create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; + int ret; + int i; + int j; + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { + for (j = 0; j < ctl->dev_stripes; ++j) { + int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + - j * stripe_size; + j * ctl->stripe_size; } } map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->sub_stripes = sub_stripes; - - chunk_size = stripe_size * data_stripes; + map->sub_stripes = ctl->sub_stripes; - trace_btrfs_chunk_alloc(info, map, start, chunk_size); + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); em = alloc_extent_map(); if (!em) { kfree(map); - ret = -ENOMEM; - goto error; + return -ENOMEM; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = chunk_size; + em->len = ctl->chunk_size; em->block_start = 0; em->block_len = em->len; - em->orig_block_len = stripe_size; + em->orig_block_len = ctl->stripe_size; em_tree = &info->mapping_tree; write_lock(&em_tree->lock); @@ -5022,30 +5071,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret) { write_unlock(&em_tree->lock); free_extent_map(em); - goto error; + return ret; } write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); if (ret) goto error_del_extent; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl->stripe_size); if (list_empty(&dev->post_commit_list)) list_add_tail(&dev->post_commit_list, &trans->transaction->dev_update_list); } - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); + atomic64_sub(ctl->stripe_size * map->num_stripes, + &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); - kfree(devices_info); return 0; error_del_extent: @@ -5057,11 +5107,68 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); -error: + + return ret; +} + +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct 
btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); + return -EINVAL; + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); + return -ENOSPC; + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); + return -EINVAL; + } + + ctl.start = find_next_chunk(info); + ctl.type = type; + init_alloc_chunk_ctl(fs_devices, &ctl); + + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + ret = gather_device_info(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = create_chunk(trans, &ctl, devices_info); + +out: kfree(devices_info); return ret; } +/* + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { @@ -5160,39 +5267,19 @@ out: return ret; } -/* - * Chunk allocation falls into two parts. The first part does work - * that makes the new allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. 
- */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) -{ - u64 chunk_offset; - - lockdep_assert_held(&trans->fs_info->chunk_mutex); - chunk_offset = find_next_chunk(trans->fs_info); - return __btrfs_alloc_chunk(trans, chunk_offset, type); -} - static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - u64 chunk_offset; - u64 sys_chunk_offset; u64 alloc_profile; int ret; - chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_metadata_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); if (ret) return ret; - sys_chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_system_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); return ret; } @@ -5389,31 +5476,19 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static inline int parity_smaller(u64 a, u64 b) -{ - return a > b; -} - /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { - struct btrfs_bio_stripe s; int i; - u64 l; int again = 1; while (again) { again = 0; for (i = 0; i < num_stripes - 1; i++) { - if (parity_smaller(bbio->raid_map[i], - bbio->raid_map[i+1])) { - s = bbio->stripes[i]; - l = bbio->raid_map[i]; - bbio->stripes[i] = bbio->stripes[i+1]; - bbio->raid_map[i] = bbio->raid_map[i+1]; - bbio->stripes[i+1] = s; - bbio->raid_map[i+1] = l; - + /* Swap if parity is on a smaller index */ + if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { + swap(bbio->stripes[i], bbio->stripes[i + 1]); + swap(bbio->raid_map[i], bbio->raid_map[i + 1]); again = 1; } } @@ -5914,10 +5989,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_io_geometry geom; ASSERT(bbio_ret); - - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + ASSERT(op != BTRFS_MAP_DISCARD); ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); if (ret < 0) @@ -6147,6 +6219,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + length, bbio_ret); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } @@ -6241,8 +6317,8 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, - (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, - bio->bi_iter.bi_size); + (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); @@ -7317,36 +7393,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) -{ - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - int copy_num; - - if (!bdev) - return; - - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; - copy_num++) { - - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) - continue; - - disk_super = (struct btrfs_super_block 
*)bh->b_data; - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); -} - /* * Update the size and bytes used for each device where it changed. This is * delayed since we would otherwise get errors while writing out the diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f01552a0785e..f067b5934c46 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K -struct buffer_head; - struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; @@ -209,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes); BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); BTRFS_DEVICE_GETSET_FUNCS(bytes_used); +enum btrfs_chunk_allocation_policy { + BTRFS_CHUNK_ALLOC_REGULAR, +}; + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -260,6 +262,8 @@ struct btrfs_fs_devices { struct kobject *devices_kobj; struct kobject *devinfo_kobj; struct completion kobj_unregister; + + enum btrfs_chunk_allocation_policy chunk_alloc_policy; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -461,7 +465,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); @@ -474,7 +478,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, @@ -484,6 +487,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) diff --git a/fs/buffer.c b/fs/buffer.c index b8d28370cfd7..f73276d746bb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -274,8 +274,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -288,8 +287,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors and they are all @@ -301,8 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -371,8 +368,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -384,14 +380,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } EXPORT_SYMBOL(end_buffer_async_write); @@ -3019,49 +3013,6 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -/* - * This allows us to do IO even on the odd last sectors - * of a device, even if the block size is some multiple - * of the physical sector size. - * - * We'll just truncate the bio to the size of the device, - * and clear the end of the buffer head manually. - * - * Truly out-of-range accesses will turn into actual IO - * errors, this only handles the "we need to be able to - * do IO at the final sector" case. - */ -void guard_bio_eod(struct bio *bio) -{ - sector_t maxsector; - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = part_nr_sects_read(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); - - if (!maxsector) - return; - - /* - * If the *whole* IO is past the end of the device, - * let it through, and the IO layer will turn it into - * an EIO. 
- */ - if (unlikely(bio->bi_iter.bi_sector >= maxsector)) - return; - - maxsector -= bio->bi_iter.bi_sector; - if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) - return; - - bio_truncate(bio, maxsector << 9); -} - static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) { @@ -3385,6 +3336,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + spin_lock_init(&ret->b_uptodate_lock); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ab616601141..6f4678d98df7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, if (!PagePrivate(page)) return; - ClearPageChecked(page); - dout("%p invalidatepage %p idx %lu full dirty page\n", inode, page, page->index); @@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g) } /* + * Read some contiguous pages. If we cross a stripe boundary, shorten + * *plen. Return number of bytes read, or error. + */ +static int ceph_sync_readpages(struct ceph_fs_client *fsc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int num_pages, + int page_align) +{ + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + int rc = 0; + + dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, + vino.snap, off, *plen); + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, truncate_seq, truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short read due to an object boundary */ + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + rc = ceph_osdc_start_request(osdc, req, false); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + dout("readpages result %d\n", rc); + return rc; +} + +/* * read a single page, without unlocking it. 
*/ static int ceph_do_readpage(struct file *filp, struct page *page) @@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + err = ceph_sync_readpages(fsc, ceph_vino(inode), &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); @@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode, } /* + * do a synchronous write on N pages + */ +static int ceph_sync_writepages(struct ceph_fs_client *fsc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *snapc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec64 *mtime, + struct page **pages, int num_pages) +{ + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + int rc = 0; + int page_align = off & ~PAGE_MASK; + + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, truncate_seq, truncate_size, + true); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short write due to an object boundary */ + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); + + req->r_mtime = *mtime; + rc = ceph_osdc_start_request(osdc, req, true); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + if (rc == 0) + rc = len; + dout("writepages result %d\n", rc); + return rc; +} + +/* * Write a single page, but leave the page locked. * * If we get a write error, mark the mapping for error, but still adjust the @@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + err = ceph_sync_writepages(fsc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, @@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) do { lock_page(page); - if ((off > size) || (page->mapping != inode->i_mapping)) { + if (page_mkwrite_check_truncate(page, inode) < 0) { unlock_page(page); ret = VM_FAULT_NOPAGE; break; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 270b769607a2..2f5cb6bc78e1 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -32,7 +32,7 @@ struct ceph_fscache_entry { size_t uniq_len; /* The following members must be last */ struct ceph_fsid fsid; - char uniquifier[0]; + char uniquifier[]; }; static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 28ae0c134700..185db76300b3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct ceph_mount_options *opt = mdsc->fsc->mount_options; - - ci->i_hold_caps_min = round_jiffies(jiffies + - opt->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, - ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); + dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + ci->i_hold_caps_max - jiffies); } /* @@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct 
ceph_mds_client *mdsc, * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci, - bool set_timeout) + struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, goto no_change; list_del_init(&ci->i_cap_delay_list); } - if (set_timeout) - __cap_set_timeouts(mdsc, ci); + __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); @@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->cap_delay_lock); } -/* - * Common issue checks for add_cap, handle_cap_grant. - */ +/* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { unsigned had = __ceph_caps_issued(ci, NULL); + lockdep_assert_held(&ci->i_ceph_lock); + /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + if (S_ISREG(ci->vfs_inode.i_mode) && + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } @@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, __ceph_dir_clear_complete(ci); } } + + /* Wipe saved layout if we're losing DIR_CREATE caps */ + if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + !(issued & CEPH_CAP_DIR_CREATE)) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } } /* @@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { @@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode, dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); - /* - * If we are opening the file, include file mode wanted bits - * in wanted. 
- */ - if (fmode >= 0) - wanted |= ceph_caps_for_mode(fmode); - spin_lock(&session->s_gen_ttl_lock); gen = session->s_cap_gen; spin_unlock(&session->s_gen_ttl_lock); @@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode, dout(" issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); - __cap_delay_requeue(mdsc, ci, true); + __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { @@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; - - if (fmode >= 0) - __ceph_get_fmode(ci, fmode); } /* @@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + (S_ISREG(ci->vfs_inode.i_mode) && ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; + if (ci->i_fx_ref) + used |= CEPH_CAP_FILE_EXCL; return used; } +#define FMODE_WAIT_BIAS 1000 + /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int i, bits = 0; - for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { - if (ci->i_nr_by_mode[i]) - bits |= 1 << i; + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; + + if (S_ISDIR(ci->vfs_inode.i_mode)) { + int want = 0; + + /* use used_cutoff here, to keep dir's wanted caps longer */ + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || + time_after(ci->i_last_rd, used_cutoff)) + want |= CEPH_CAP_ANY_SHARED; + + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || + time_after(ci->i_last_wr, used_cutoff)) { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + want |= CEPH_CAP_ANY_DIR_OPS; + } + + if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) + want |= CEPH_CAP_PIN; + + return want; + } else { + int bits = 0; + + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_rd, used_cutoff)) + bits |= 1 << RD_SHIFT; + } else if (time_after(ci->i_last_rd, idle_cutoff)) { + bits |= 1 << RD_SHIFT; + } + + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_wr, used_cutoff)) + bits |= 1 << WR_SHIFT; + } else if (time_after(ci->i_last_wr, idle_cutoff)) { + bits |= 1 << WR_SHIFT; + } + + /* check lazyio only when read/write is wanted */ + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && + ci->i_nr_by_mode[LAZY_SHIFT] > 0) + bits |= 1 << LAZY_SHIFT; + + return bits ? 
ceph_caps_for_mode(bits >> 1) : 0; } - if (bits == 0) - return 0; - return ceph_caps_for_mode(bits >> 1); +} + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + /* we want EXCL if holding caps of dir ops */ + if (w & CEPH_CAP_ANY_DIR_OPS) + w |= CEPH_CAP_FILE_EXCL; + } else { + /* we want EXCL if dirty data */ + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; + } + return w; } /* @@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) return mds_wanted; } -/* - * called under i_ceph_lock - */ -static int __ceph_is_single_caps(struct ceph_inode_info *ci) -{ - return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct cap_msg_args arg; int held, revoking; int wake = 0; - int delayed = 0; int ret; + /* Don't send anything if it's still being created. Return delayed */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + spin_unlock(&ci->i_ceph_lock); + dout("%s async create in flight for %p\n", __func__, inode); + return 1; + } + held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; @@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); - arg.session = cap->session; - - /* don't release wanted unless we've waited a bit. */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_min)) { - dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - want |= cap->mds_wanted; - retain |= cap->issued; - delayed = 1; - } - ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); - if (want & ~cap->mds_wanted) { - /* user space may open/close single file frequently. - * This avoids droping mds_wanted immediately after - * requesting new mds_wanted. - */ - __cap_set_timeouts(mdsc, ci); - } + ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ if (cap->implemented & ~cap->issued) { @@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; + arg.session = cap->session; arg.ino = ceph_vino(inode).ino; arg.cid = cap->cap_id; arg.follows = flushing ? 
ci->i_head_snapc->seq : 0; @@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.size = inode->i_size; ci->i_reported_size = arg.size; arg.max_size = ci->i_wanted_max_size; - ci->i_requested_max_size = arg.max_size; + if (cap == ci->i_auth_cap) + ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { old_blob = __ceph_build_xattrs_blob(ci); @@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ret = send_cap_msg(&arg); if (ret < 0) { - dout("error sending cap msg, must requeue %p\n", inode); - delayed = 1; + pr_err("error sending cap msg, ino (%llx.%llx) " + "flushing %s tid %llu, requeue\n", + ceph_vinop(inode), ceph_cap_string(flushing), + flush_tid); + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); } if (wake) wake_up_all(&ci->i_cap_wq); - return delayed; + return ret; } static inline int __send_flush_snap(struct inode *inode, @@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, int was = ci->i_dirty_caps; int dirty = 0; + lockdep_assert_held(&ci->i_ceph_lock); + if (!ci->i_auth_cap) { pr_warn("__mark_dirty_caps %p %llx mask %s, " "but no auth cap (session was closed?)\n", @@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - __cap_delay_requeue(mdsc, ci, true); + __cap_delay_requeue(mdsc, ci); return dirty; } @@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_cap_flush *cf = NULL; int flushing; + lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); @@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * - * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay - * cap release further. * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. @@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int delayed = 0, sent = 0; - bool no_delay = flags & CHECK_CAPS_NODELAY; bool queue_invalidate = false; bool tried_invalidate = false; - /* if we are unmounting, flush any unused caps immediately. */ - if (mdsc->stopping) - no_delay = true; - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - if (!(flags & CHECK_CAPS_AUTHONLY) || - (ci->i_auth_cap && __ceph_is_single_caps(ci))) - __cap_delay_cancel(mdsc, ci); - goto retry_locked; retry: spin_lock(&ci->i_ceph_lock); @@ -1866,10 +1901,11 @@ retry_locked: * revoking the shared cap on every create/unlink * operation. 
*/ - if (IS_RDONLY(inode)) + if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; - else - want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } else { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } retain |= want; } else { @@ -1885,14 +1921,13 @@ retry_locked: } dout("check_caps %p file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s%s\n", inode, + " issued %s revoking %s retain %s %s%s\n", inode, ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); /* @@ -1900,8 +1935,8 @@ retry_locked: * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ - if ((!no_delay || mdsc->stopping) && - !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && + S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| @@ -1973,28 +2008,17 @@ retry_locked: } /* want more caps from mds? */ - if (want & ~(cap->mds_wanted | cap->issued)) - goto ack; + if (want & ~cap->mds_wanted) { + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + if (!__cap_is_valid(cap)) + goto ack; + } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ - if (no_delay) - goto ack; - - /* delay? */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_max)) { - dout(" delaying issued %s -> %s, wanted %s -> %s\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - delayed++; - continue; - } - ack: if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); @@ -2055,18 +2079,20 @@ ack: } mds = cap->mds; /* remember mds, so we don't repeat */ - sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, - cap_used, want, retain, flushing, - flush_tid, oldest_flush_tid); + __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, + retain, flushing, flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. 
*/ } - /* Reschedule delayed caps release if we delayed anything */ - if (delayed) - __cap_delay_requeue(mdsc, ci, false); + /* periodically re-calculate caps wanted by open files */ + if (__ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list) && + (file_wanted & ~CEPH_CAP_PIN) && + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + __cap_delay_requeue(mdsc, ci); + } spin_unlock(&ci->i_ceph_lock); @@ -2095,7 +2121,6 @@ retry: retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; - int delayed; if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); @@ -2124,18 +2149,10 @@ retry_locked: &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - CEPH_CLIENT_CAPS_SYNC, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - (cap->issued | cap->implemented), - flushing, flush_tid, oldest_flush_tid); - - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci, true); - spin_unlock(&ci->i_ceph_lock); - } + __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __ceph_caps_used(ci), __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; + ret = ceph_wait_on_async_create(inode); + if (ret) + goto out; + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, if (cf->caps) { dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, cap, cf->tid, ceph_cap_string(cf->caps)); - ci->i_ceph_flags |= CEPH_I_NODELAY; - - ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? 
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); - if (ret) { - pr_err("kick_flushing_caps: error sending " - "cap flush, ino (%llx.%llx) " - "tid %llu flushing %s\n", - ceph_vinop(inode), cf->tid, - ceph_cap_string(cf->caps)); - } } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, @@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, } } -static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - struct inode *inode) - __releases(ci->i_ceph_lock) +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *cap; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_cap *cap = ci->i_auth_cap; + + lockdep_assert_held(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s\n", inode, + dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); - spin_unlock(&ci->i_ceph_lock); - } else { - spin_unlock(&ci->i_ceph_lock); } } @@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. - * - * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got, +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { + lockdep_assert_held(&ci->i_ceph_lock); + if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_EXCL) + ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); @@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); ci->i_wb_ref++; - dout("__take_cap_refs %p wb %d -> %d (?)\n", + dout("%s %p wb %d -> %d (?)\n", __func__, &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * - * Returns 0 if caps were not able to be acquired (yet), a 1 if they were, - * or a negative error code. - * - * FIXME: how does a 0 return differ from -EAGAIN? + * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, + * or a negative error code. There are 3 speical error codes: + * -EAGAIN: need to sleep but non-blocking is specified + * -EFBIG: ask caller to call check_max_size() and try again. + * -ESTALE: ask caller to call ceph_renew_caps() and try again. 
*/ enum { - NON_BLOCKING = 1, - CHECK_FILELOCK = 2, + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ + NON_BLOCKING = (1 << 8), + CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, @@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; - int file_wanted; bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, @@ -2557,15 +2568,6 @@ again: goto out_unlock; } - /* make sure file is actually open */ - file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) != need) { - dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", - ceph_cap_string(need), ceph_cap_string(file_wanted)); - ret = -EBADF; - goto out_unlock; - } - /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); @@ -2584,7 +2586,7 @@ again: dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) - ret = -EAGAIN; + ret = ci->i_auth_cap ? -EFBIG : -ESTALE; goto out_unlock; } /* @@ -2630,51 +2632,55 @@ again: } snap_rwsem_locked = true; } - *got = need | (have & want); - if ((need & CEPH_CAP_FILE_RD) && + if ((have & want) == want) + *got = need | want; + else + *got = need; + if (S_ISREG(inode->i_mode) && + (need & CEPH_CAP_FILE_RD) && !(*got & CEPH_CAP_FILE_CACHE)) ceph_disable_fscache_readpage(ci); - __take_cap_refs(ci, *got, true); + ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; - if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + int mds_wanted; + if (ci->i_auth_cap && + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { - dout("get_cap_refs %p needed %s but mds%d readonly\n", + dout("get_cap_refs %p need %s but mds%d readonly\n", inode, ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } - if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { - int mds_wanted; - if (READ_ONCE(mdsc->fsc->mount_state) == - CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - ret = -EIO; - goto out_unlock; - } - mds_wanted = __ceph_caps_mds_wanted(ci, false); - if (need & ~(mds_wanted & need)) { - dout("get_cap_refs %p caps were dropped" - " (session killed?)\n", inode); - ret = -ESTALE; - goto out_unlock; - } - if (!(file_wanted & ~mds_wanted)) - ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + ret = -EIO; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~mds_wanted) { + dout("get_cap_refs %p need %s > mds_wanted %s\n", + inode, ceph_cap_string(need), + ceph_cap_string(mds_wanted)); + ret = -ESTALE; + goto out_unlock; } - dout("get_cap_refs %p have %s needed %s\n", inode, + dout("get_cap_refs %p have %s need %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: + + __ceph_touch_fmode(ci, mdsc, flags); + spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); @@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } +static inline int get_used_fmode(int caps) +{ + int fmode = 0; + if (caps & 
CEPH_CAP_FILE_RD) + fmode |= CEPH_FILE_MODE_RD; + if (caps & CEPH_CAP_FILE_WR) + fmode |= CEPH_FILE_MODE_WR; + return fmode; +} + int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { - int ret; + int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(inode, need); - if (ret < 0) - return ret; + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_ANY_DIR_OPS)); + if (need) { + ret = ceph_pool_perm_check(inode, need); + if (ret < 0) + return ret; + } + + flags = get_used_fmode(need | want); + if (nonblock) + flags |= NON_BLOCKING; - ret = try_get_cap_refs(inode, need, want, 0, - (nonblock ? NON_BLOCKING : 0), got); - return ret == -EAGAIN ? 0 : ret; + ret = try_get_cap_refs(inode, need, want, 0, flags, got); + /* three special error codes */ + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) + ret = 0; + return ret; } /* @@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want, fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; - while (true) { - if (endoff > 0) - check_max_size(inode, endoff); + flags = get_used_fmode(need | want); - flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; + while (true) { + flags &= CEPH_FILE_MODE_MASK; + if (atomic_read(&fi->num_locks)) + flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); - if (ret == -EAGAIN) - continue; + WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; @@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want, list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); + /* make sure used fmode not timeout */ + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); @@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want, } if (ret < 0) { + if (ret == -EFBIG || ret == -ESTALE) { + int ret2 = ceph_wait_on_async_create(inode); + if (ret2 < 0) + return ret2; + } + if (ret == -EFBIG) { + check_max_size(inode, endoff); + continue; + } if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(inode); + ret = ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (S_ISREG(ci->vfs_inode.i_mode) && + ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = @@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want, break; } - if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + if (S_ISREG(ci->vfs_inode.i_mode) && + (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) ceph_fscache_revalidate_cookie(ci); *got = _got; @@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want, void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps, false); + ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } @@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct
ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; + if (had & CEPH_CAP_FILE_EXCL) + if (--ci->i_fx_ref == 0) + last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; @@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last && !flushsnaps) + if (last) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) ceph_flush_snaps(ci, NULL); @@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 0, NULL); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { @@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode, ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); - if (revoking & used & CEPH_CAP_FILE_BUFFER) + if (S_ISREG(inode->i_mode) && + (revoking & used & CEPH_CAP_FILE_BUFFER)) writeback = true; /* initiate writeback; will delay ack */ - else if (revoking == CEPH_CAP_FILE_CACHE && - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - queue_invalidate) + else if (queue_invalidate && + revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) ; /* do nothing yet, invalidation will be queued */ else if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ @@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode, if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (newcaps & ~extra_info->issued) wake = true; - kick_flushing_inode_caps(session->s_mdsc, session, inode); + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); up_read(&session->s_mdsc->snap_rwsem); } else { spin_unlock(&ci->i_ceph_lock); @@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode, wake_up_all(&ci->i_cap_wq); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, session); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); else mutex_unlock(&session->s_mutex); } @@ -3619,8 +3664,6 @@ retry: goto out_unlock; if (target < 0) { - if (cap->mds_wanted | cap->issued) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; __ceph_remove_cap(cap, false); goto out_unlock; } @@ -3668,7 +3711,7 @@ retry: /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? 
CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; - ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && @@ -3773,7 +3816,7 @@ retry: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; @@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct inode *inode; struct ceph_inode_info *ci; - int flags = CHECK_CAPS_NODELAY; dout("check_delayed_caps\n"); while (1) { @@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, flags, NULL); + ceph_check_caps(ci, 0, NULL); /* avoid calling iput_final() in tick thread */ ceph_async_iput(inode); } @@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } @@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) +void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode) +{ + unsigned long now = jiffies; + if (fmode & CEPH_FILE_MODE_RD) + ci->i_last_rd = now; + if (fmode & CEPH_FILE_MODE_WR) + ci->i_last_wr = now; + /* queue periodic check */ + if (fmode && + __ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list)) + __cap_delay_requeue(mdsc, ci); +} + +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { int i; int bits = (fmode << 1) | 1; + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) - ci->i_nr_by_mode[i]++; + ci->i_nr_by_mode[i] += count; } + spin_unlock(&ci->i_ceph_lock); } /* @@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) * we may need to release capabilities to the MDS (or schedule * their delayed release). 
*/ -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i, last = 0; + int i; int bits = (fmode << 1) | 1; spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { - BUG_ON(ci->i_nr_by_mode[i] == 0); - if (--ci->i_nr_by_mode[i] == 0) - last++; + BUG_ON(ci->i_nr_by_mode[i] < count); + ci->i_nr_by_mode[i] -= count; } } - dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", - &ci->vfs_inode, fmode, - ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], - ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); spin_unlock(&ci->i_ceph_lock); - - if (last && ci->i_vino.snap == CEPH_NOSNAP) - ceph_check_caps(ci, 0, NULL); } /* @@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode, if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) - wanted |= cap->mds_wanted; dout("encode_inode_release %p cap %p " "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fb7cabd98e7b..481ac97b4d25 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) return 0; } -CEPH_DEFINE_SHOW_FUNC(mdsmap_show) -CEPH_DEFINE_SHOW_FUNC(mdsc_show) -CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) +DEFINE_SHOW_ATTRIBUTE(mdsmap); +DEFINE_SHOW_ATTRIBUTE(mdsc); +DEFINE_SHOW_ATTRIBUTE(caps); +DEFINE_SHOW_ATTRIBUTE(mds_sessions); /* @@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) 0400, fsc->client->debugfs_dir, fsc, - &mdsmap_show_fops); + &mdsmap_fops); fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", 0400, fsc->client->debugfs_dir, fsc, - &mds_sessions_show_fops); + &mds_sessions_fops); fsc->debugfs_mdsc = debugfs_create_file("mdsc", 0400, fsc->client->debugfs_dir, fsc, - &mdsc_show_fops); + &mdsc_fops); fsc->debugfs_caps = debugfs_create_file("caps", 0400, fsc->client->debugfs_dir, fsc, - &caps_show_fops); + &caps_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d0cd0aba5843..d594c2627430 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 2; } - /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); + /* request Fx cap. if have Fx, we don't need to release Fs cap + * for later create/unlink. */ + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR); + /* can we use the dcache? 
*/ if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && @@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&ci->i_ceph_lock); - dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && @@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, return err; } +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + /* If op failed, mark everyone involved for errors */ + if (result) { + int pathlen; + u64 base; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + /* mark error on parent + clear complete */ + mapping_set_error(req->r_parent->i_mapping, result); + ceph_dir_clear_complete(req->r_parent); + + /* drop the dentry -- we don't know its status */ + if (!d_unhashed(req->r_dentry)) + d_drop(req->r_dentry); + + /* mark inode itself for an error (since metadata is bogus) */ + mapping_set_error(req->r_old_inode->i_mapping, result); + + pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } +out: + iput(req->r_old_inode); + ceph_mdsc_release_dir_caps(req); +} + +static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di; + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK; + + spin_lock(&ci->i_ceph_lock); + if ((__ceph_caps_issued(ci, NULL) & want) == want) { + ceph_take_cap_refs(ci, want, false); + got = want; + } + spin_unlock(&ci->i_ceph_lock); + + /* If we didn't get anything, return 0 */ + if (!got) + return 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + /* + * - We are holding Fx, which implies Fs caps. + * - Only support async unlink for primary linkage + */ + if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen || + !(di->flags & CEPH_DENTRY_PRIMARY_LINK)) + want = 0; + spin_unlock(&dentry->d_lock); + + /* Do we still want what we've got? 
*/ + if (want == got) + return got; + + ceph_put_cap_refs(ci, got); + return 0; +} + /* * rmdir and unlink are differ only by the metadata op code */ @@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int err = -EROFS; int op; @@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; +retry: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; - set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = ceph_drop_caps_for_unlink(inode); - err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) - d_delete(dentry); + + if (try_async && op == CEPH_MDS_OP_UNLINK && + (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + dout("async unlink on %lu/%.*s caps=%s", dir->i_ino, + dentry->d_name.len, dentry->d_name.name, + ceph_cap_string(req->r_dir_caps)); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_callback = ceph_async_unlink_cb; + req->r_old_inode = d_inode(dentry); + ihold(req->r_old_inode); + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + /* + * We have enough caps, so we assume that the unlink + * will succeed. Fix up the target inode and dcache. + */ + drop_nlink(inode); + d_delete(dentry); + } else if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } + } else { + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + } + ceph_mdsc_put_request(req); out: return err; @@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) spin_lock(&dentry->d_lock); di->time = jiffies; di->lease_shared_gen = 0; + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; __dentry_lease_unlist(di); spin_unlock(&dentry->d_lock); } @@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry) /* * Check if directory-wide content lease/cap is valid. 
*/ -static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, + struct ceph_mds_client *mdsc) { struct ceph_inode_info *ci = ceph_inode(dir); int valid; @@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) spin_lock(&ci->i_ceph_lock); valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); - shared_gen = atomic_read(&ci->i_shared_gen); + if (valid) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); + shared_gen = atomic_read(&ci->i_shared_gen); + } spin_unlock(&ci->i_ceph_lock); if (valid) { struct ceph_dentry_info *di; @@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) int valid = 0; struct dentry *parent; struct inode *dir, *inode; + struct ceph_mds_client *mdsc; if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); @@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, dentry, inode, ceph_dentry(dentry)->offset); + mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, @@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = dentry_lease_is_valid(dentry, flags); if (valid == -ECHILD) return valid; - if (valid || dir_lease_is_valid(dir, dentry)) { + if (valid || dir_lease_is_valid(dir, dentry, mdsc)) { if (inode) valid = ceph_is_any_caps(inode); else @@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } if (!valid) { - struct ceph_mds_client *mdsc = - ceph_sb_to_client(dir->i_sb)->mdsc; struct ceph_mds_request *req; int op, err; u32 mask; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index b6bfa94332c3..79dc06881e78 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb, req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err) { + ceph_mdsc_put_request(req); + return ERR_PTR(err); + } + inode = req->r_target_inode; if (inode) ihold(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5a478cd06e11..4a5ccbb7e808 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (isdir) { struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); - if (!dfi) { - ceph_put_fmode(ci, fmode); /* clean up */ + if (!dfi) return -ENOMEM; - } file->private_data = dfi; fi = &dfi->file_info; @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, dfi->readdir_cache_idx = -1; } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (!fi) { - ceph_put_fmode(ci, fmode); /* clean up */ + if (!fi) return -ENOMEM; - } file->private_data = fi; } + ceph_get_fmode(ci, fmode, 1); fi->fmode = fmode; + spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); fi->meta_err = errseq_sample(&ci->i_meta_err); @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFLNK: dout("init_file %p %p 0%o (symlink)\n", inode, file, inode->i_mode); - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ break; default: @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct 
file *file, int fmode) * we need to drop the open ref now, since we don't * have .release set to ceph_release. */ - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ BUG_ON(inode->i_fop->release == ceph_release); /* call the proper open fop */ @@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) /* * try renew caps after session gets killed. */ -int ceph_renew_caps(struct inode *inode) +int ceph_renew_caps(struct inode *inode, int fmode) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; spin_lock(&ci->i_ceph_lock); + __ceph_touch_fmode(ci, mdsc, fmode); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { @@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - req->r_fmode = -1; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&ci->i_ceph_lock); - __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file) dout("open %p fmode %d want %s issued %s using existing\n", inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); - __ceph_get_fmode(ci, fmode); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); /* adjust wanted? */ @@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file) return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { - __ceph_get_fmode(ci, fmode); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -430,6 +423,236 @@ out: return err; } +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ +static void +cache_file_layout(struct inode *dst, struct inode *src) +{ + struct ceph_inode_info *cdst = ceph_inode(dst); + struct ceph_inode_info *csrc = ceph_inode(src); + + spin_lock(&cdst->i_ceph_lock); + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && + !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { + memcpy(&cdst->i_cached_layout, &csrc->i_layout, + sizeof(cdst->i_cached_layout)); + rcu_assign_pointer(cdst->i_cached_layout.pool_ns, + ceph_try_get_string(csrc->i_layout.pool_ns)); + } + spin_unlock(&cdst->i_ceph_lock); +} + +/* + * Try to set up an async create. We need caps, a file layout, and inode number, + * and either a lease on the dentry or complete dir info. If any of those + * criteria are not satisfied, then return false and the caller can go + * synchronous. + */ +static int try_prep_async_create(struct inode *dir, struct dentry *dentry, + struct ceph_file_layout *lo, u64 *pino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; + u64 ino; + + spin_lock(&ci->i_ceph_lock); + /* No auth cap means no chance for Dc caps */ + if (!ci->i_auth_cap) + goto no_async; + + /* Any delegated inos? 
*/ + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) + goto no_async; + + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) + goto no_async; + + if ((__ceph_caps_issued(ci, NULL) & want) != want) + goto no_async; + + if (d_in_lookup(dentry)) { + if (!__ceph_dir_is_complete(ci)) + goto no_async; + spin_lock(&dentry->d_lock); + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&dentry->d_lock); + } else if (atomic_read(&ci->i_shared_gen) != + READ_ONCE(di->lease_shared_gen)) { + goto no_async; + } + + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); + if (!ino) + goto no_async; + + *pino = ino; + ceph_take_cap_refs(ci, want, false); + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); + rcu_assign_pointer(lo->pool_ns, + ceph_try_get_string(ci->i_cached_layout.pool_ns)); + got = want; +no_async: + spin_unlock(&ci->i_ceph_lock); + return got; +} + +static void restore_deleg_ino(struct inode *dir, u64 ino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_session *s = NULL; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + s = ceph_get_mds_session(ci->i_auth_cap->session); + spin_unlock(&ci->i_ceph_lock); + if (s) { + int err = ceph_restore_deleg_ino(s, ino); + if (err) + pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", + ino, err); + ceph_put_mds_session(s); + } +} + +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + mapping_set_error(req->r_parent->i_mapping, result); + + if (result) { + struct dentry *dentry = req->r_dentry; + int pathlen; + u64 base; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + ceph_dir_clear_complete(req->r_parent); + if (!d_unhashed(dentry)) + d_drop(dentry); + + /* FIXME: start returning I/O errors on all accesses? */ + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } + + if (req->r_target_inode) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + u64 ino = ceph_vino(req->r_target_inode).ino; + + if (req->r_deleg_ino != ino) + pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%llx target=0x%llx\n", + __func__, req->r_err, req->r_deleg_ino, ino); + mapping_set_error(req->r_target_inode->i_mapping, result); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(req->r_session, ci); + spin_unlock(&ci->i_ceph_lock); + } else { + pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, + req->r_deleg_ino); + } +out: + ceph_mdsc_release_dir_caps(req); +} + +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, + struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx, + struct ceph_file_layout *lo) +{ + int ret; + char xattr_buf[4]; + struct ceph_mds_reply_inode in = { }; + struct ceph_mds_reply_info_in iinfo = { .in = &in }; + struct ceph_inode_info *ci = ceph_inode(dir); + struct inode *inode; + struct timespec64 now; + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + + ktime_get_real_ts64(&now); + + inode = ceph_get_inode(dentry->d_sb, vino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo.inline_version = CEPH_INLINE_NONE; + iinfo.change_attr = 1; + ceph_encode_timespec64(&iinfo.btime, &now); + + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + + in.ino = cpu_to_le64(vino.ino); + in.snapid = cpu_to_le64(CEPH_NOSNAP); + in.version = cpu_to_le64(1); // ??? + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); + in.cap.cap_id = cpu_to_le64(1); + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); + in.cap.flags = CEPH_CAP_FLAG_AUTH; + in.ctime = in.mtime = in.atime = iinfo.btime; + in.mode = cpu_to_le32((u32)mode); + in.truncate_seq = cpu_to_le32(1); + in.truncate_size = cpu_to_le64(-1ULL); + in.xattr_version = cpu_to_le64(1); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? + dir->i_gid : current_fsgid())); + in.nlink = cpu_to_le32(1); + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); + + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, + req->r_fmode, NULL); + if (ret) { + dout("%s failed to fill inode: %d\n", __func__, ret); + ceph_dir_clear_complete(dir); + if (!d_unhashed(dentry)) + d_drop(dentry); + if (inode->i_state & I_NEW) + discard_new_inode(inode); + } else { + struct dentry *dn; + + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, + vino.ino, dir->i_ino, dentry->d_name.name); + ceph_dir_clear_ordered(dir); + ceph_init_inode_acls(inode, as_ctx); + if (inode->i_state & I_NEW) { + /* + * If it's not I_NEW, then someone created this before + * we got here. Assume the server is aware of it at + * that point and don't worry about setting + * CEPH_I_ASYNC_CREATE. + */ + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; + unlock_new_inode(inode); + } + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { + if (!d_unhashed(dentry)) + d_drop(dentry); + dn = d_splice_alias(inode, dentry); + WARN_ON_ONCE(dn && dn != dentry); + } + file->f_mode |= FMODE_CREATED; + ret = finish_open(file, dentry, ceph_open); + } + return ret; +} /* * Do a lookup + open with a single request. 
If we get a non-existent @@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_request *req; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int mask; int err; @@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, /* If it's not being looked up, it's negative */ return -ENOENT; } - +retry: /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { @@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_parent = dir; + if (flags & O_CREAT) { + struct ceph_file_layout lo; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; as_ctx.pagelist = NULL; } + if (try_async && + (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); + req->r_callback = ceph_async_create_cb; + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + err = ceph_finish_async_create(dir, dentry, + file, mode, req, + &as_ctx, &lo); + } else if (err == -EJUKEBOX) { + restore_deleg_ino(dir, req->r_deleg_ino); + ceph_mdsc_put_request(req); + try_async = false; + goto retry; + } + goto out_req; + } } - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; - if (ceph_security_xattr_wanted(dir)) - mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.open.mask = cpu_to_le32(mask); - - req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, @@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(d_inode(dentry), &as_ctx); + struct inode *newino = d_inode(dentry); + + cache_file_layout(dir, newino); + ceph_init_inode_acls(newino, &as_ctx); file->f_mode |= FMODE_CREATED; } err = finish_open(file, dentry, ceph_open); } out_req: - if (!req->r_err && req->r_target_inode) - ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); @@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p dir file %p\n", inode, file); WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); - ceph_put_fmode(ci, dfi->file_info.fmode); + ceph_put_fmode(ci, dfi->file_info.fmode, 1); if (dfi->last_readdir) ceph_mdsc_put_request(dfi->last_readdir); @@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p regular file %p\n", inode, file); WARN_ON(!list_empty(&fi->rw_contexts)); - ceph_put_fmode(ci, fi->fmode); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); } @@ -1567,7 +1815,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); + ceph_check_caps(ci, 0, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, return 0; } +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, + struct ceph_inode_info *dst_ci, u64 *dst_off, + struct ceph_fs_client *fsc, + size_t len, unsigned int flags) +{ + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + size_t bytes = 0; + u64 src_objnum, src_objoff, dst_objnum, dst_objoff; + u32 src_objlen, dst_objlen; + u32 object_size = src_ci->i_layout.object_size; + int ret; + + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + ret = ceph_osdc_copy_from(&fsc->client->osdc, + src_ci->i_vino.snap, 0, + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, + &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, + dst_ci->i_truncate_seq, + dst_ci->i_truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + if (ret) { + if (ret == -EOPNOTSUPP) { + fsc->have_copy_from2 = false; + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + } + dout("ceph_osdc_copy_from returned %d\n", ret); + if (!bytes) + bytes = ret; + goto out; + } + len -= object_size; + bytes += object_size; + *src_off += 
object_size; + *dst_off += object_size; + } + +out: + ceph_oloc_destroy(&src_oloc); + ceph_oloc_destroy(&dst_oloc); + return bytes; +} + static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) @@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); - struct ceph_object_locator src_oloc, dst_oloc; - struct ceph_object_id src_oid, dst_oid; - loff_t endoff = 0, size; - ssize_t ret = -EIO; + loff_t size; + ssize_t ret = -EIO, bytes; u64 src_objnum, dst_objnum, src_objoff, dst_objoff; - u32 src_objlen, dst_objlen, object_size; + u32 src_objlen, dst_objlen; int src_got = 0, dst_got = 0, err, dirty; - bool do_final_copy = false; if (src_inode->i_sb != dst_inode->i_sb) { struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); @@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ret < 0) goto out_caps; - size = i_size_read(dst_inode); - endoff = dst_off + len; - /* Drop dst file cached pages */ ret = invalidate_inode_pages2_range(dst_inode->i_mapping, dst_off >> PAGE_SHIFT, - endoff >> PAGE_SHIFT); + (dst_off + len) >> PAGE_SHIFT); if (ret < 0) { dout("Failed to invalidate inode pages (%zd)\n", ret); ret = 0; /* XXX */ } - src_oloc.pool = src_ci->i_layout.pool_id; - src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); - dst_oloc.pool = dst_ci->i_layout.pool_id; - dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); - ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, src_ci->i_layout.object_size, &src_objnum, &src_objoff, &src_objlen); @@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * starting at the src_off */ if (src_objoff) { + dout("Initial partial copy of %u bytes\n", src_objlen); + /* * we need to temporarily drop all caps as we'll be calling * {read,write}_iter, which will get caps again. 
@@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off, src_objlen, flags); - if (ret < 0) { - dout("do_splice_direct returned %d\n", err); + /* Abort on short copies or on error */ + if (ret < src_objlen) { + dout("Failed partial copy (%zd)\n", ret); goto out; } len -= ret; @@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (err < 0) goto out_caps; } - object_size = src_ci->i_layout.object_size; - while (len >= object_size) { - ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, - object_size, &src_objnum, - &src_objoff, &src_objlen); - ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, - object_size, &dst_objnum, - &dst_objoff, &dst_objlen); - ceph_oid_init(&src_oid); - ceph_oid_printf(&src_oid, "%llx.%08llx", - src_ci->i_vino.ino, src_objnum); - ceph_oid_init(&dst_oid); - ceph_oid_printf(&dst_oid, "%llx.%08llx", - dst_ci->i_vino.ino, dst_objnum); - /* Do an object remote copy */ - err = ceph_osdc_copy_from( - &src_fsc->client->osdc, - src_ci->i_vino.snap, 0, - &src_oid, &src_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, - &dst_oid, &dst_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, - dst_ci->i_truncate_seq, dst_ci->i_truncate_size, - CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); - if (err) { - if (err == -EOPNOTSUPP) { - src_fsc->have_copy_from2 = false; - pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); - } - dout("ceph_osdc_copy_from returned %d\n", err); - if (!ret) - ret = err; - goto out_caps; - } - len -= object_size; - src_off += object_size; - dst_off += object_size; - ret += object_size; - } - if (len) - /* We still need one final local copy */ - do_final_copy = true; + size = i_size_read(dst_inode); + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, + src_fsc, len, flags); + if (bytes <= 0) { + if (!ret) + ret = bytes; + goto out_caps; + } + dout("Copied %zu bytes out of %zu\n", bytes, len); + len -= bytes; + ret += bytes; file_update_time(dst_file); inode_inc_iversion_raw(dst_inode); - if (endoff > size) { - int caps_flags = 0; - + if (dst_off > size) { /* Let the MDS know about dst file size change */ - if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) - caps_flags |= CHECK_CAPS_NODELAY; - if (ceph_inode_set_size(dst_inode, endoff)) - caps_flags |= CHECK_CAPS_AUTHONLY; - if (caps_flags) - ceph_check_caps(dst_ci, caps_flags, NULL); + if (ceph_inode_set_size(dst_inode, dst_off) || + ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); @@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, out_caps: put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - if (do_final_copy) { - err = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); - if (err < 0) { - dout("do_splice_direct returned %d\n", err); - goto out; - } - len -= err; - ret += err; + /* + * Do the final manual copy if we still have some bytes left, unless + * there were errors in remote object copies (len >= object_size). 
+ */ + if (len && (len < src_ci->i_layout.object_size)) { + dout("Final partial copy of %zu bytes\n", len); + bytes = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, len, flags); + if (bytes > 0) + ret += bytes; + else + dout("Failed partial copy (%zd)\n", bytes); } out: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d01710a16a4a..7fef94fd1e55 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; + inode->i_mtime = parent->i_mtime; + inode->i_ctime = parent->i_ctime; + inode->i_atime = parent->i_atime; inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; + ci->i_btime = ceph_inode(parent)->i_btime; if (inode->i_state & I_NEW) unlock_new_inode(inode); @@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_max_files = 0; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_fragtree = RB_ROOT; @@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_prealloc_cap_flush = NULL; INIT_LIST_HEAD(&ci->i_cap_flush_list); init_waitqueue_head(&ci->i_cap_wq); - ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; @@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_ref = 0; ci->i_wr_ref = 0; ci->i_wb_ref = 0; + ci->i_fx_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); @@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); } static inline blkcnt_t calc_inode_blocks(u64 size) @@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if ((issued & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || - __ceph_caps_file_wanted(ci)) { + __ceph_is_file_opened(ci)) { ci->i_truncate_pending++; queue_trunc = 1; } @@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, * Populate an inode based on info from mds. May be called on new or * existing inodes. 
*/ -static int fill_inode(struct inode *inode, struct page *locked_page, - struct ceph_mds_reply_info_in *iinfo, - struct ceph_mds_reply_dirfrag *dirinfo, - struct ceph_mds_session *session, int cap_fmode, - struct ceph_cap_reservation *caps_reservation) +int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; @@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, bool new_version = false; bool fill_inline = false; - dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); @@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (iinfo->xattr_len > 4) { xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) - pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + pr_err("%s ENOMEM xattr blob %d bytes\n", __func__, iinfo->xattr_len); } @@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); if (symlen != i_size_read(inode)) { - pr_err("fill_inode %llx.%llx BAD symlink " - "size %lld\n", ceph_vinop(inode), + pr_err("%s %llx.%llx BAD symlink " + "size %lld\n", __func__, + ceph_vinop(inode), i_size_read(inode)); i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); @@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_dir_fops; break; default: - pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + pr_err("%s %llx.%llx BAD mode 0%o\n", __func__, ceph_vinop(inode), inode->i_mode); } @@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ceph_snap(inode) == CEPH_NOSNAP) { ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, info_caps, + info_caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), @@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, dout(" %p got snap_caps %s\n", inode, ceph_cap_string(info_caps)); ci->i_snap_caps |= info_caps; - if (cap_fmode >= 0) - __ceph_get_fmode(ci, cap_fmode); } - } else if (cap_fmode >= 0) { - pr_warn("mds issued no caps on %llx.%llx\n", - ceph_vinop(inode)); - __ceph_get_fmode(ci, cap_fmode); } if (iinfo->inline_version > 0 && @@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, fill_inline = true; } + if (cap_fmode >= 0) { + if (!info_caps) + pr_warn("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_touch_fmode(ci, mdsc, cap_fmode); + } + spin_unlock(&ci->i_ceph_lock); if (fill_inline) @@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, struct ceph_mds_session **old_lease_session) { struct ceph_dentry_info *di = ceph_dentry(dentry); + unsigned mask = le16_to_cpu(lease->mask); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; @@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != 
CEPH_NOSNAP) return; + if (mask & CEPH_LEASE_PRIMARY_LINK) + di->flags |= CEPH_DENTRY_PRIMARY_LINK; + else + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; + di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); - if (duration == 0) { + if (!(mask & CEPH_LEASE_VALID)) { __ceph_dentry_dir_lease_touch(di); return; } @@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct inode *dir = req->r_parent; if (dir) { - err = fill_inode(dir, NULL, - &rinfo->diri, rinfo->dirfrag, - session, -1, - &req->r_caps_reservation); + err = ceph_fill_inode(dir, NULL, &rinfo->diri, + rinfo->dirfrag, session, -1, + &req->r_caps_reservation); if (err < 0) goto done; } else { @@ -1307,13 +1321,14 @@ retry_lookup: goto done; } - err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, - session, + err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, + NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", + pr_err("ceph_fill_inode badness %p %llx.%llx\n", in, ceph_vinop(in)); if (in->i_state & I_NEW) discard_new_inode(in); @@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rde->inode, NULL, session, - -1, &req->r_caps_reservation); + rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); if (rc < 0) { - pr_err("fill_inode badness on %p got %d\n", in, rc); + pr_err("ceph_fill_inode badness on %p got %d\n", + in, rc); err = rc; if (in->i_state & I_NEW) { ihold(in); @@ -1707,10 +1723,10 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rde->inode, NULL, session, - -1, &req->r_caps_reservation); + ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); if (ret < 0) { - pr_err("fill_inode badness on %p\n", in); + pr_err("ceph_fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) { /* avoid calling iput_final() in mds * dispatch threads */ @@ -1972,7 +1988,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 0, NULL); wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index c90f03beb15d..6e061bf62ad4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; + __ceph_touch_fmode(ci, mdsc, fi->fmode); spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 544e9e85b120..d6b9166e71e4 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, return 0; } +static int try_unlock_file(struct file *file, struct file_lock *fl) +{ + int err; + unsigned int orig_flags = fl->fl_flags; + fl->fl_flags |= FL_EXISTS; + err = locks_lock_file_wait(file, fl); + fl->fl_flags 
= orig_flags; + if (err == -ENOENT) { + if (!(orig_flags & FL_EXISTS)) + err = 0; + return err; + } + return 1; +} + /** * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. @@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op == CEPH_MDS_OP_SETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { dout("mds locked, locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { @@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; + if (F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, lock_cmd, wait, fl); - if (!err) { + if (!err && F_UNLCK != fl->fl_type) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bbbbddf71326..486f91f9685b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -415,21 +415,121 @@ bad: return -EIO; } + +#if BITS_PER_LONG == 64 + +#define DELEGATED_INO_AVAILABLE xa_mk_value(1) + +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + dout("got %u sets of delegated inodes\n", sets); + while (sets--) { + u64 start, len, ino; + + ceph_decode_64_safe(p, end, start, bad); + ceph_decode_64_safe(p, end, len, bad); + while (len--) { + int err = xa_insert(&s->s_delegated_inos, ino = start++, + DELEGATED_INO_AVAILABLE, + GFP_KERNEL); + if (!err) { + dout("added delegated inode 0x%llx\n", + start - 1); + } else if (err == -EBUSY) { + pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + start - 1); + } else { + return err; + } + } + } + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + unsigned long ino; + void *val; + + xa_for_each(&s->s_delegated_inos, ino, val) { + val = xa_erase(&s->s_delegated_inos, ino); + if (val == DELEGATED_INO_AVAILABLE) + return ino; + } + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, + GFP_KERNEL); +} +#else /* BITS_PER_LONG == 64 */ +/* + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top + * and bottom words? 
+ */ +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + if (sets) + ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return 0; +} +#endif /* BITS_PER_LONG == 64 */ + /* * parse create results */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - u64 features) + u64 features, struct ceph_mds_session *s) { + int ret; + if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { - /* Malformed reply? */ if (*p == end) { + /* Malformed reply? */ info->has_create_ino = false; - } else { + } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { + u8 struct_v, struct_compat; + u32 len; + info->has_create_ino = true; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_64_safe(p, end, info->ino, bad); + ret = ceph_parse_deleg_inos(p, end, s); + if (ret) + return ret; + } else { + /* legacy */ ceph_decode_64_safe(p, end, info->ino, bad); + info->has_create_ino = true; } } else { if (*p != end) @@ -448,7 +548,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - u64 features) + u64 features, struct ceph_mds_session *s) { u32 op = le32_to_cpu(info->head->op); @@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end, else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) - return parse_reply_info_create(p, end, info, features); + return parse_reply_info_create(p, end, info, features, s); else return -EIO; } @@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end, /* * parse entire mds reply */ -static int parse_reply_info(struct ceph_msg *msg, +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, u64 features) { @@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, info, features); + err = parse_reply_info_extra(&p, p+len, info, features, s); if (err < 0) goto out_bad; } @@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); + xa_destroy(&s->s_delegated_inos); kfree(s); } } @@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); + xa_init(&s->s_delegated_inos); s->s_num_cap_releases = 0; s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; @@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + ceph_mdsc_release_dir_caps(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref) put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); 
WARN_ON_ONCE(!list_empty(&req->r_wait)); - kfree(req); + kmem_cache_free(ceph_mds_request_cachep, req); } DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) @@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc, mdsc->oldest_tid = req->r_tid; if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); req->r_unsafe_dir = dir; + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); } } @@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, erase_request(&mdsc->request_tree, req); - if (req->r_unsafe_dir && - test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); @@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - if (cap->mds_wanted | cap->issued) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; @@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, /* mds did not re-issue stale cap */ spin_lock(&ci->i_ceph_lock); cap->issued = cap->implemented = CEPH_CAP_PIN; - /* make sure mds knows what we want */ - if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; spin_unlock(&ci->i_ceph_lock); } } else if (ev == FORCE_RO) { @@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ - if (wanted == 0 && used == CEPH_CAP_FILE_CACHE && + if (S_ISREG(inode->i_mode) && + wanted == 0 && used == CEPH_CAP_FILE_CACHE && !(oissued & CEPH_CAP_FILE_CACHE)) { used = 0; oissued = 0; @@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { - struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct ceph_mds_request *req; + req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); @@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->op = cpu_to_le32(req->r_op); head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); - head->ino = 0; + head->ino = cpu_to_le64(req->r_deleg_ino); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, req->r_inode ? 
req->r_inode : d_inode(req->r_dentry), - mds, req->r_inode_drop, req->r_inode_unless, 0); + mds, req->r_inode_drop, req->r_inode_unless, + req->r_op == CEPH_MDS_OP_READDIR); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, @@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) + flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; - rhead->ino = 0; dout(" r_parent = %p\n", req->r_parent); return 0; @@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); - err = -EIO; + err = -ETIMEDOUT; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { @@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc, mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto finish; + } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; @@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc, err = -EACCES; goto out_session; } + /* + * We cannot queue async requests since the caps and delegated + * inodes are bound to the session. Just return -EJUKEBOX and + * let the caller retry a sync request in that case. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto out_session; + } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { __open_session(mdsc, session); @@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { - int err; + int err = 0; /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_parent) { - ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + struct ceph_inode_info *ci = ceph_inode(req->r_parent); + int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? 
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; + spin_lock(&ci->i_ceph_lock); + ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); ihold(req->r_parent); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + if (req->r_inode) { + err = ceph_wait_on_async_create(req->r_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + + if (!err && req->r_old_inode) { + err = ceph_wait_on_async_create(req->r_old_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + dout("submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); @@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, if (timeleft > 0) err = 0; else if (!timeleft) - err = -EIO; /* timed out */ + err = -ETIMEDOUT; /* timed out */ else err = timeleft; /* killed */ } @@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } else { set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); - if (req->r_unsafe_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_unsafe_dir); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_dir_item, - &ci->i_unsafe_dirops); - spin_unlock(&ci->i_unsafe_lock); - } } dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) - err = parse_reply_info(msg, rinfo, (u64)-1); + err = parse_reply_info(session, msg, rinfo, (u64)-1); else - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -3249,6 +3384,17 @@ bad: return; } +void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); + } +} + /* * called under session->mutex. */ @@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; if (req->r_attempts == 0) continue; /* only old requests */ - if (req->r_session && - req->r_session->s_mds == session->s_mds) - __send_request(mdsc, session, req, true); + if (!req->r_session) + continue; + if (req->r_session->s_mds != session->s_mds) + continue; + + ceph_mdsc_release_dir_caps(req); + + __send_request(mdsc, session, req, true); } mutex_unlock(&mdsc->mutex); } @@ -3362,7 +3513,7 @@ fail_msg: /* * Encode information about a cap for a reconnect with the MDS. 
*/ -static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, +static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { union { @@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = cap->session->s_cap_gen; + /* These are lost when the session goes away */ + if (S_ISDIR(inode->i_mode)) { + if (cap->issued & CEPH_CAP_DIR_CREATE) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } + cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; + } + if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (!reply) goto fail_nomsg; + xa_destroy(&session->s_delegated_inos); + mutex_lock(&session->s_mutex); session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; @@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.msg_version = 2; } /* trsaverse this session's caps */ - err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state); + err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 27a7446e10d3..4e5be79bf080 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -23,8 +23,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_RECLAIM_CLIENT, CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_DELEG_INO, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO, }; /* @@ -37,6 +38,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ + CEPHFS_FEATURE_DELEG_INO, \ \ CEPHFS_FEATURE_MAX, \ } @@ -201,6 +203,7 @@ struct ceph_mds_session { struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ + struct xarray s_delegated_inos; }; /* @@ -255,6 +258,7 @@ struct ceph_mds_request { #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ +#define CEPH_MDS_R_ASYNC (8) /* async request */ unsigned long r_req_flags; struct mutex r_fill_mutex; @@ -263,6 +267,7 @@ struct ceph_mds_request { int r_fmode; /* file mode, if expecting cap */ kuid_t r_uid; kgid_t r_gid; + int r_request_release_offset; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -280,12 +285,16 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ - int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; - struct page *r_locked_page; int r_err; + + struct page *r_locked_page; + int r_dir_caps; + int r_num_caps; + u32 r_readdir_offset; + unsigned long r_timeout; /* optional. 
jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, @@ -304,6 +313,7 @@ struct ceph_mds_request { int r_num_fwd; /* number of forward attempts */ int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + u64 r_deleg_ino; struct list_head r_wait; struct completion r_completion; @@ -315,10 +325,8 @@ struct ceph_mds_request { long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; - u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; - int r_num_caps; }; struct ceph_pool_perm { @@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); @@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps); + +static inline int ceph_wait_on_async_create(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, + TASK_INTERRUPTIBLE); +} + +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); #endif diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c7f150686a53..c9784eb1159a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -155,6 +155,7 @@ enum { Opt_acl, Opt_quotadf, Opt_copyfrom, + Opt_wsync, }; enum ceph_recover_session_mode { @@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), fsparam_u32 ("wsize", Opt_wsize), + fsparam_flag_no ("wsync", Opt_wsync), {} }; @@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, fc->sb_flags &= ~SB_POSIXACL; } break; + case Opt_wsync: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; + else + fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; + break; default: BUG(); } @@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + seq_puts(m, ",nowsync"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_dir_file_cachep; +struct kmem_cache *ceph_mds_request_cachep; static void ceph_inode_init_once(void *foo) { @@ -769,6 +781,10 @@ static int __init init_caches(void) if (!ceph_dir_file_cachep) goto bad_dir_file; + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD); + if (!ceph_mds_request_cachep) + goto bad_mds_req; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -776,6 +792,8 @@ static int __init init_caches(void) return 0; bad_fscache: + kmem_cache_destroy(ceph_mds_request_cachep); +bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); 
bad_dir_file: kmem_cache_destroy(ceph_file_cachep); @@ -804,6 +822,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); + kmem_cache_destroy(ceph_mds_request_cachep); ceph_fscache_unregister(); } @@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + ceph_set_mount_opt(fsc, ASYNC_DIROPS); + else + ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + sync_filesystem(fc->root->d_sb); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 037cdfb2ad4f..60aac3aee055 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -43,13 +43,16 @@ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ +#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ CEPH_MOUNT_OPT_NOCOPYFROM) #define ceph_set_mount_opt(fsc, opt) \ - (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt +#define ceph_clear_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) @@ -170,9 +173,9 @@ struct ceph_cap { struct list_head caps_item; }; -#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ -#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ -#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ +#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ +#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ struct ceph_cap_flush { u64 tid; @@ -284,6 +287,7 @@ struct ceph_dentry_info { #define CEPH_DENTRY_REFERENCED 1 #define CEPH_DENTRY_LEASE_LIST 2 #define CEPH_DENTRY_SHRINK_LIST 4 +#define CEPH_DENTRY_PRIMARY_LINK 8 struct ceph_inode_xattrs_info { /* @@ -315,13 +319,14 @@ struct ceph_inode_info { u64 i_inline_version; u32 i_time_warp_seq; - unsigned i_ceph_flags; + unsigned long i_ceph_flags; atomic64_t i_release_count; atomic64_t i_ordered_count; atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + struct ceph_file_layout i_cached_layout; // for async creates char *i_symlink; /* for dirs */ @@ -352,7 +357,6 @@ struct ceph_inode_info { struct ceph_cap_flush *i_prealloc_cap_flush; struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ - unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ struct ceph_cap_reservation i_cap_migration_resv; @@ -361,6 +365,8 @@ struct ceph_inode_info { dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + unsigned long i_last_rd; + unsigned long i_last_wr; int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; @@ -375,7 +381,7 @@ struct ceph_inode_info { /* held references to caps */ int i_pin_ref; - int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + 
int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; atomic_t i_filelock_ref; atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ @@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, * Ceph inode. */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ -#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ -#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ +#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ +#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ +#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) /* * Masks of ceph inode work. @@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); -extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); - -/* - * wanted, by virtue of open file modes AND cap refs (buffered/cached data) - */ -static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) { - int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (w & CEPH_CAP_FILE_BUFFER) - w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ - return w; + return ci->i_nr_by_mode[0]; } +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_wanted(struct ceph_inode_info *ci); /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); @@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) } /* inode.c */ +struct ceph_mds_reply_info_in; +struct ceph_mds_reply_dirfrag; + extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); @@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime); +extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, @@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct 
ceph_cap_reservation *ctx); extern void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, + unsigned issued, unsigned wanted, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); @@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); +extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, + bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode); /* addr.c */ extern const struct address_space_operations ceph_aops; @@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; -extern int ceph_renew_caps(struct inode *inode); +extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 716574aab3b6..ae421634aa42 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -342,7 +342,7 @@ static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { - int rc; + int rc = 0; struct key *sidkey; char *sidstr; const struct cred *saved_cred; @@ -450,11 +450,12 @@ out_revert_creds: * fails then we just fall back to using the mnt_uid/mnt_gid. 
*/ got_valid_id: + rc = 0; if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else fattr->cf_gid = fgid; - return 0; + return rc; } int diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fa77fe5258b0..94e3ed4850b5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1018,7 +1018,7 @@ struct file_system_type cifs_fs_type = { .name = "cifs", .mount = cifs_do_mount, .kill_sb = cifs_kill_sb, - /* .fs_flags */ + .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("cifs"); @@ -1027,7 +1027,7 @@ static struct file_system_type smb3_fs_type = { .name = "smb3", .mount = smb3_do_mount, .kill_sb = cifs_kill_sb, - /* .fs_flags */ + .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("smb3"); MODULE_ALIAS("smb3"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b87456bae1a1..c9e2e6bbca13 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.25" +#define CIFS_VERSION "2.26" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 79d842e7240c..593d826820c3 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1021,7 +1021,7 @@ typedef struct smb_com_writex_req { __le16 ByteCount; __u8 Pad; /* BB check for whether padded to DWORD boundary and optimum performance here */ - char Data[0]; + char Data[]; } __attribute__((packed)) WRITEX_REQ; typedef struct smb_com_write_req { @@ -1041,7 +1041,7 @@ typedef struct smb_com_write_req { __le16 ByteCount; __u8 Pad; /* BB check for whether padded to DWORD boundary and optimum performance here */ - char Data[0]; + char Data[]; } __attribute__((packed)) WRITE_REQ; typedef struct smb_com_write_rsp { @@ -1306,7 +1306,7 @@ typedef struct smb_com_ntransact_req { /* SetupCount words follow then */ __le16 ByteCount; __u8 Pad[3]; - __u8 Parms[0]; + __u8 Parms[]; } __attribute__((packed)) NTRANSACT_REQ; typedef struct smb_com_ntransact_rsp { @@ -1523,7 +1523,7 @@ struct file_notify_information { __le32 NextEntryOffset; __le32 Action; __le32 FileNameLength; - __u8 FileName[0]; + __u8 FileName[]; } __attribute__((packed)); /* For IO_REPARSE_TAG_SYMLINK */ @@ -1536,7 +1536,7 @@ struct reparse_symlink_data { __le16 PrintNameOffset; __le16 PrintNameLength; __le32 Flags; - char PathBuffer[0]; + char PathBuffer[]; } __attribute__((packed)); /* Flag above */ @@ -1553,7 +1553,7 @@ struct reparse_posix_data { __le16 ReparseDataLength; __u16 Reserved; __le64 InodeType; /* LNK, FIFO, CHR etc. 
*/ - char PathBuffer[0]; + char PathBuffer[]; } __attribute__((packed)); struct cifs_quota_data { @@ -1691,6 +1691,7 @@ struct smb_t2_rsp { #define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 #define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 #define SMB_FIND_FILE_UNIX 0x202 +#define SMB_FIND_FILE_POSIX_INFO 0x064 typedef struct smb_com_transaction2_qpi_req { struct smb_hdr hdr; /* wct = 14+ */ @@ -1761,7 +1762,7 @@ struct set_file_rename { __le32 overwrite; /* 1 = overwrite dest */ __u32 root_fid; /* zero */ __le32 target_name_len; - char target_name[0]; /* Must be unicode */ + char target_name[]; /* Must be unicode */ } __attribute__((packed)); struct smb_com_transaction2_sfi_req { @@ -2450,7 +2451,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ __le16 version; __le16 access_entry_count; /* access ACL - count of entries */ __le16 default_entry_count; /* default ACL - count of entries */ - struct cifs_posix_ace ace_array[0]; + struct cifs_posix_ace ace_array[]; /* followed by struct cifs_posix_ace default_ace_arraay[] */ } __attribute__((packed)); /* level 0x204 */ @@ -2756,7 +2757,7 @@ typedef struct file_xattr_info { /* BB do we need another field for flags? BB */ __u32 xattr_name_len; __u32 xattr_value_len; - char xattr_name[0]; + char xattr_name[]; /* followed by xattr_value[xattr_value_len], no pad */ } __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info level 0x205 */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e5cb681ec138..12a895e02db4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -602,6 +602,11 @@ int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov, int resp_buftype, struct cifs_search_info *srch_inf); +struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server); +void cifs_put_tcp_super(struct super_block *sb); +int update_super_prepath(struct cifs_tcon *tcon, const char *prefix, + size_t prefix_len); + #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6f6fb3606a5d..140efc1a9374 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -162,9 +162,18 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, for (it = dfs_cache_get_tgt_iterator(&tl); it; it = dfs_cache_get_next_tgt(&tl, it)) { - const char *tgt = dfs_cache_get_tgt_name(it); + const char *share, *prefix; + size_t share_len, prefix_len; - extract_unc_hostname(tgt, &dfs_host, &dfs_host_len); + rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix, + &prefix_len); + if (rc) { + cifs_dbg(VFS, "%s: failed to parse target share %d\n", + __func__, rc); + continue; + } + + extract_unc_hostname(share, &dfs_host, &dfs_host_len); if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { @@ -175,11 +184,13 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, continue; } - scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); - if (!rc) + if (!rc) { + rc = update_super_prepath(tcon, prefix, prefix_len); break; + } if (rc == -EREMOTE) break; } @@ -320,7 +331,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) atomic_inc(&tconInfoReconnectCount); /* tell server Unix caps we support */ - if (ses->capabilities & CAP_UNIX) + if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, NULL); /* @@ -1591,7 +1602,6 @@ cifs_readv_receive(struct 
TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { cifs_reconnect(server); - wake_up(&server->response_q); return -1; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4804d1df8c1c..95b3ab0ca8c0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -21,6 +21,7 @@ #include <linux/fs.h> #include <linux/net.h> #include <linux/string.h> +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/list.h> #include <linux/wait.h> @@ -57,7 +58,6 @@ #include "smb2proto.h" #include "smbdirect.h" #include "dns_resolve.h" -#include "cifsfs.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif @@ -389,54 +389,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) #endif #ifdef CONFIG_CIFS_DFS_UPCALL -struct super_cb_data { - struct TCP_Server_Info *server; - struct super_block *sb; -}; - /* These functions must be called with server->srv_mutex held */ - -static void super_cb(struct super_block *sb, void *arg) -{ - struct super_cb_data *d = arg; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - - if (d->sb) - return; - - cifs_sb = CIFS_SB(sb); - tcon = cifs_sb_master_tcon(cifs_sb); - if (tcon->ses->server == d->server) - d->sb = sb; -} - -static struct super_block *get_tcp_super(struct TCP_Server_Info *server) -{ - struct super_cb_data d = { - .server = server, - .sb = NULL, - }; - - iterate_supers_type(&cifs_fs_type, super_cb, &d); - - if (unlikely(!d.sb)) - return ERR_PTR(-ENOENT); - /* - * Grab an active reference in order to prevent automounts (DFS links) - * of expiring and then freeing up our cifs superblock pointer while - * we're doing failover. - */ - cifs_sb_active(d.sb); - return d.sb; -} - -static inline void put_tcp_super(struct super_block *sb) -{ - if (!IS_ERR_OR_NULL(sb)) - cifs_sb_deactive(sb); -} - static void reconn_inval_dfs_target(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, struct dfs_cache_tgt_list *tgt_list, @@ -508,7 +461,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->nr_targets = 1; #ifdef CONFIG_CIFS_DFS_UPCALL spin_unlock(&GlobalMid_Lock); - sb = get_tcp_super(server); + sb = cifs_get_tcp_super(server); if (IS_ERR(sb)) { rc = PTR_ERR(sb); cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n", @@ -535,8 +488,9 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_unlock(&GlobalMid_Lock); #ifdef CONFIG_CIFS_DFS_UPCALL dfs_cache_free_tgts(&tgt_list); - put_tcp_super(sb); + cifs_put_tcp_super(sb); #endif + wake_up(&server->response_q); return rc; } else server->tcpStatus = CifsNeedReconnect; @@ -666,11 +620,12 @@ cifs_reconnect(struct TCP_Server_Info *server) } - put_tcp_super(sb); + cifs_put_tcp_super(sb); #endif if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + wake_up(&server->response_q); return rc; } @@ -765,7 +720,6 @@ server_unresponsive(struct TCP_Server_Info *server) cifs_server_dbg(VFS, "has not responded in %lu seconds. 
Reconnecting...\n", (3 * server->echo_interval) / HZ); cifs_reconnect(server); - wake_up(&server->response_q); return true; } @@ -898,7 +852,6 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) */ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); cifs_reconnect(server); - wake_up(&server->response_q); break; default: cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); @@ -1070,7 +1023,6 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->vals->header_preamble_size) { cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); - wake_up(&server->response_q); return -ECONNABORTED; } @@ -1118,7 +1070,6 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { cifs_reconnect(server); - wake_up(&server->response_q); return -1; } @@ -1164,8 +1115,9 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; + unsigned int noreclaim_flag; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); length = atomic_inc_return(&tcpSesAllocCount); @@ -1212,7 +1164,6 @@ next_pdu: cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n", server->pdu_size); cifs_reconnect(server); - wake_up(&server->response_q); continue; } @@ -1320,6 +1271,7 @@ next_pdu: set_current_state(TASK_RUNNING); } + memalloc_noreclaim_restore(noreclaim_flag); module_put_and_exit(0); } @@ -1522,6 +1474,9 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } + cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 " + "is not recommended unless required for " + "access to very old servers\n"); vol->ops = &smb1_operations; vol->vals = &smb1_values; break; @@ -2517,11 +2472,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); if (got_version == false) - pr_warn("No dialect specified on mount. Default has changed to " - "a more secure dialect, SMB2.1 or later (e.g. SMB3), from CIFS " - "(SMB1). To use the less secure SMB1 dialect to access " - "old servers which do not support SMB3 (or SMB2.1) specify vers=1.0" - " on mount.\n"); + pr_warn_once("No dialect specified on mount. Default has changed" + " to a more secure dialect, SMB2.1 or later (e.g. " + "SMB3.1.1), from CIFS (SMB1). To use the less secure " + "SMB1 dialect to access old servers which do not " + "support SMB3.1.1 (or even SMB3 or SMB2.1) specify " + "vers=1.0 on mount.\n"); kfree(mountdata_copy); return 0; @@ -4999,6 +4955,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) * dentry revalidation to think the dentry are stale (ESTALE). */ cifs_autodisable_serverino(cifs_sb); + /* + * Force the use of prefix path to support failover on DFS paths that + * resolve to targets that have different prefix paths. 
+ */ + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + kfree(cifs_sb->prepath); + cifs_sb->prepath = vol->prepath; + vol->prepath = NULL; + out: free_xid(xid); cifs_try_adding_channels(ses); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 43c1b43a07ec..a67f88bf7ae1 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1260,6 +1260,44 @@ void dfs_cache_del_vol(const char *fullpath) kref_put(&vi->refcnt, vol_release); } +/** + * dfs_cache_get_tgt_share - parse a DFS target + * + * @it: DFS target iterator. + * @share: tree name. + * @share_len: length of tree name. + * @prefix: prefix path. + * @prefix_len: length of prefix path. + * + * Return zero if target was parsed correctly, otherwise non-zero. + */ +int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it, + const char **share, size_t *share_len, + const char **prefix, size_t *prefix_len) +{ + char *s, sep; + + if (!it || !share || !share_len || !prefix || !prefix_len) + return -EINVAL; + + sep = it->it_name[0]; + if (sep != '\\' && sep != '/') + return -EINVAL; + + s = strchr(it->it_name + 1, sep); + if (!s) + return -EINVAL; + + s = strchrnul(s + 1, sep); + + *share = it->it_name; + *share_len = s - it->it_name; + *prefix = *s ? s + 1 : s; + *prefix_len = &it->it_name[strlen(it->it_name)] - *prefix; + + return 0; +} + /* Get all tcons that are within a DFS namespace and can be refreshed */ static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) { diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 99ee44f8ad07..bf94d08cfb5a 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -49,6 +49,10 @@ extern int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server); extern void dfs_cache_del_vol(const char *fullpath); +extern int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it, + const char **share, size_t *share_len, + const char **prefix, size_t *prefix_len); + static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, struct dfs_cache_tgt_iterator *it) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8f9d849a0012..5920820bfbd0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3841,7 +3841,7 @@ again: if (rc == -ENODATA) rc = 0; - ctx->rc = (rc == 0) ? ctx->total_len : rc; + ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc; mutex_unlock(&ctx->aio_mutex); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b16f8d23e97b..8d01ec2dca66 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1835,6 +1835,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, CIFSSMBClose(xid, tcon, fid.netfid); } do_rename_exit: + if (rc == 0) + d_move(from_dentry, to_dentry); cifs_put_tlink(tlink); return rc; } @@ -2148,8 +2150,9 @@ int cifs_getattr(const struct path *path, struct kstat *stat, * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. 
*/ - if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && - inode->i_mapping->nrpages != 0) { + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) && + !CIFS_CACHE_READ(CIFS_I(inode)) && + inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); if (rc) { mapping_set_error(inode->i_mapping, rc); @@ -2157,9 +2160,20 @@ int cifs_getattr(const struct path *path, struct kstat *stat, } } - rc = cifs_revalidate_dentry_attr(dentry); - if (rc) - return rc; + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_FORCE_SYNC) + CIFS_I(inode)->time = 0; /* force revalidate */ + + /* + * If the caller doesn't require syncing, only sync if + * necessary (e.g. due to earlier truncate or setattr + * invalidating the cached metadata) + */ + if (((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) || + (CIFS_I(inode)->time == 0)) { + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + } generic_fillattr(inode, stat); stat->blksize = cifs_sb->bsize; @@ -2516,25 +2530,26 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) /* * Attempt to flush data before changing attributes. We need to do - * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the - * ownership or mode then we may also need to do this. Here, we take - * the safe way out and just do the flush on all setattr requests. If - * the flush returns error, store it to report later and continue. + * this for ATTR_SIZE and ATTR_MTIME. If the flush of the data + * returns error, store it to report later and continue. * * BB: This should be smarter. Why bother flushing pages that * will be truncated anyway? Also, should we error out here if - * the flush returns error? + * the flush returns error? Do we need to check for ATTR_MTIME_SET flag? 
*/ - rc = filemap_write_and_wait(inode->i_mapping); - if (is_interrupt_error(rc)) { - rc = -ERESTARTSYS; - goto cifs_setattr_exit; + if (attrs->ia_valid & (ATTR_MTIME | ATTR_SIZE | ATTR_CTIME)) { + rc = filemap_write_and_wait(inode->i_mapping); + if (is_interrupt_error(rc)) { + rc = -ERESTARTSYS; + goto cifs_setattr_exit; + } + mapping_set_error(inode->i_mapping, rc); } - mapping_set_error(inode->i_mapping, rc); rc = 0; - if (attrs->ia_valid & ATTR_MTIME) { + if ((attrs->ia_valid & ATTR_MTIME) && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile); if (!rc) { tcon = tlink_tcon(wfile->tlink); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 852aa00ec729..a25ef35b023e 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -416,7 +416,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, } rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL, - NULL); + NULL, NULL); if (rc) goto qmf_out_open_fail; @@ -470,7 +470,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.reconnect = false; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, - NULL); + NULL, NULL); if (rc) { kfree(utf16_path); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 40ca394fd5de..a456febd4109 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -31,6 +31,7 @@ #include "nterr.h" #include "cifs_unicode.h" #include "smb2pdu.h" +#include "cifsfs.h" extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; @@ -1022,3 +1023,82 @@ int copy_path_name(char *dst, const char *src) name_len++; return name_len; } + +struct super_cb_data { + struct TCP_Server_Info *server; + struct super_block *sb; +}; + +static void super_cb(struct super_block *sb, void *arg) +{ + struct super_cb_data *d = arg; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + + if (d->sb) + return; + + cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); + if (tcon->ses->server == d->server) + d->sb = sb; +} + +struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server) +{ + struct super_cb_data d = { + .server = server, + .sb = NULL, + }; + + iterate_supers_type(&cifs_fs_type, super_cb, &d); + + if (unlikely(!d.sb)) + return ERR_PTR(-ENOENT); + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. 
+ */ + cifs_sb_active(d.sb); + return d.sb; +} + +void cifs_put_tcp_super(struct super_block *sb) +{ + if (!IS_ERR_OR_NULL(sb)) + cifs_sb_deactive(sb); +} + +int update_super_prepath(struct cifs_tcon *tcon, const char *prefix, + size_t prefix_len) +{ + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + int rc = 0; + + sb = cifs_get_tcp_super(tcon->ses->server); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + cifs_sb = CIFS_SB(sb); + + kfree(cifs_sb->prepath); + + if (*prefix && prefix_len) { + cifs_sb->prepath = kstrndup(prefix, prefix_len, GFP_ATOMIC); + if (!cifs_sb->prepath) { + rc = -ENOMEM; + goto out; + } + + convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); + } else + cifs_sb->prepath = NULL; + + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + +out: + cifs_put_tcp_super(sb); + return rc; +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ba9dadf3be24..19e4a5d3b4ca 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -32,6 +32,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "cifsfs.h" +#include "smb2proto.h" /* * To be safe - for UCS to UTF-8 with strings loaded with the rare long @@ -217,6 +218,60 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } +/* Fill a cifs_fattr struct with info from SMB_FIND_FILE_POSIX_INFO. */ +static void +cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, + struct cifs_sb_info *cifs_sb) +{ + struct smb2_posix_info_parsed parsed; + + posix_info_parse(info, NULL, &parsed); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_uniqueid = le64_to_cpu(info->Inode); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->CreationTime); + + fattr->cf_nlink = le32_to_cpu(info->HardLinks); + fattr->cf_cifsattrs = le32_to_cpu(info->DosAttributes); + + /* + * Since we set the inode type below we need to mask off + * to avoid strange results if bits set above. + * XXX: why not make server&client use the type bits? + */ + fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT; + + cifs_dbg(VFS, "XXX dev %d, reparse %d, mode %o", + le32_to_cpu(info->DeviceId), + le32_to_cpu(info->ReparseTag), + le32_to_cpu(info->Mode)); + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode |= S_IFDIR; + fattr->cf_dtype = DT_DIR; + } else { + /* + * mark anything that is not a dir as regular + * file. 
special files should have the REPARSE + * attribute and will be marked as needing revaluation + */ + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + } + + if (reparse_file_needs_reval(fattr)) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + + /* TODO map SIDs */ + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; +} + static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) { const FILE_DIRECTORY_INFO *fi = info; @@ -359,6 +414,8 @@ ffirst_retry: /* if (cap_unix(tcon->ses) { */ if (tcon->unix_ext) cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + else if (tcon->posix_extensions) + cifsFile->srch_inf.info_level = SMB_FIND_FILE_POSIX_INFO; else if ((tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find) == 0) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; @@ -451,6 +508,23 @@ struct cifs_dirent { u64 ino; }; +static void cifs_fill_dirent_posix(struct cifs_dirent *de, + const struct smb2_posix_info *info) +{ + struct smb2_posix_info_parsed parsed; + + /* payload should have already been checked at this point */ + if (posix_info_parse(info, NULL, &parsed) < 0) { + cifs_dbg(VFS, "invalid POSIX info payload"); + return; + } + + de->name = parsed.name; + de->namelen = parsed.name_len; + de->resume_key = info->Ignored; + de->ino = le64_to_cpu(info->Inode); +} + static void cifs_fill_dirent_unix(struct cifs_dirent *de, const FILE_UNIX_INFO *info, bool is_unicode) { @@ -511,6 +585,9 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, memset(de, 0, sizeof(*de)); switch (level) { + case SMB_FIND_FILE_POSIX_INFO: + cifs_fill_dirent_posix(de, info); + break; case SMB_FIND_FILE_UNIX: cifs_fill_dirent_unix(de, info, is_unicode); break; @@ -786,6 +863,11 @@ static int cifs_filldir(char *find_entry, struct file *file, } switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_POSIX_INFO: + cifs_posix_to_fattr(&fattr, + (struct smb2_posix_info *)find_entry, + cifs_sb); + break; case SMB_FIND_FILE_UNIX: cifs_unix_basic_to_fattr(&fattr, &((FILE_UNIX_INFO *)find_entry)->basic, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index afe1f03aabe3..2fa3ba354cc9 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -62,7 +62,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, - NULL); + NULL, NULL); if (rc) goto out; @@ -152,7 +152,12 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, (li->offset + li->length)) continue; if (current->tgid != li->pid) - continue; + /* + * flock and OFD lock are associated with an open + * file description, not the process. + */ + if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK))) + continue; if (cinode->can_cache_brlcks) { /* * We can cache brlock requests - simply remove a lock diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cfe9b800ea8c..b36c46f48705 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -328,16 +328,6 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? 
volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) { - if (server->sign) - wsize = min_t(unsigned int, - wsize, server->smbd_conn->max_fragmented_send_size); - else - wsize = min_t(unsigned int, - wsize, server->smbd_conn->max_readwrite_size); - } -#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -356,8 +346,15 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { if (server->sign) + /* + * Account for SMB2 data transfer packet header and + * possible encryption header + */ wsize = min_t(unsigned int, - wsize, server->smbd_conn->max_fragmented_send_size); + wsize, + server->smbd_conn->max_fragmented_send_size - + SMB2_READWRITE_PDU_HEADER_SIZE - + sizeof(struct smb2_transform_hdr)); else wsize = min_t(unsigned int, wsize, server->smbd_conn->max_readwrite_size); @@ -378,16 +375,6 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) { - if (server->sign) - rsize = min_t(unsigned int, - rsize, server->smbd_conn->max_fragmented_recv_size); - else - rsize = min_t(unsigned int, - rsize, server->smbd_conn->max_readwrite_size); - } -#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); @@ -407,8 +394,15 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { if (server->sign) + /* + * Account for SMB2 data transfer packet header and + * possible encryption header + */ rsize = min_t(unsigned int, - rsize, server->smbd_conn->max_fragmented_recv_size); + rsize, + server->smbd_conn->max_fragmented_recv_size - + SMB2_READWRITE_PDU_HEADER_SIZE - + sizeof(struct smb2_transform_hdr)); else rsize = min_t(unsigned int, rsize, server->smbd_conn->max_readwrite_size); @@ -794,7 +788,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, NULL); + oparms.fid->lease_key, &oplock, + NULL, NULL); } else goto oshr_exit; @@ -838,7 +833,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (no_cached_open) rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, - NULL); + NULL, NULL); else rc = open_shroot(xid, tcon, cifs_sb, &fid); @@ -878,7 +873,8 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, + NULL, NULL); if (rc) return; @@ -913,7 +909,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, + NULL); if (rc) { kfree(utf16_path); return rc; @@ -2122,7 +2119,8 @@ smb3_notify(const unsigned int xid, struct file *pfile, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, 
&oparms, utf16_path, &oplock, NULL, NULL, NULL, + NULL); if (rc) goto notify_exit; @@ -2543,7 +2541,8 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, + NULL, NULL); if (rc) return rc; @@ -3028,7 +3027,8 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, + NULL); kfree(utf16_path); if (!rc) { rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid, @@ -3086,7 +3086,8 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, + NULL, NULL); kfree(utf16_path); if (!rc) { rc = SMB2_set_acl(xid, tlink_tcon(tlink), fid.persistent_fid, @@ -3248,6 +3249,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, * Extending the file */ if ((keep_size == false) && i_size_read(inode) < off + len) { + rc = inode_newsize_ok(inode, off + len); + if (rc) + goto out; + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) smb2_set_sparse(xid, tcon, cfile, inode, false); @@ -4151,7 +4156,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { cifs_reconnect(server); - wake_up(&server->response_q); return -1; } @@ -4515,14 +4519,12 @@ smb3_receive_transform(struct TCP_Server_Info *server, cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); cifs_reconnect(server); - wake_up(&server->response_q); return -ECONNABORTED; } if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_server_dbg(VFS, "Transform message is broken\n"); cifs_reconnect(server); - wake_up(&server->response_q); return -ECONNABORTED; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 28c0be5e69b7..47d3e382ecaa 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -193,9 +193,18 @@ static int __smb2_reconnect(const struct nls_table *nlsc, for (it = dfs_cache_get_tgt_iterator(&tl); it; it = dfs_cache_get_next_tgt(&tl, it)) { - const char *tgt = dfs_cache_get_tgt_name(it); + const char *share, *prefix; + size_t share_len, prefix_len; - extract_unc_hostname(tgt, &dfs_host, &dfs_host_len); + rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix, + &prefix_len); + if (rc) { + cifs_dbg(VFS, "%s: failed to parse target share %d\n", + __func__, rc); + continue; + } + + extract_unc_hostname(share, &dfs_host, &dfs_host_len); if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { @@ -206,11 +215,13 @@ static int __smb2_reconnect(const struct nls_table *nlsc, continue; } - scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); - if (!rc) + if (!rc) { + rc = update_super_prepath(tcon, prefix, prefix_len); break; + } if (rc == -EREMOTE) break; } @@ -378,7 +389,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } if (smb2_command != SMB2_INTERNAL_CMD) - queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); 
atomic_inc(&tconInfoReconnectCount); out: @@ -1940,20 +1951,46 @@ parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf) } static void -parse_posix_ctxt(struct create_context *cc, struct smb_posix_info *pposix_inf) +parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info, + struct create_posix_rsp *posix) { - /* struct smb_posix_info *ppinf = (struct smb_posix_info *)cc; */ + int sid_len; + u8 *beg = (u8 *)cc + le16_to_cpu(cc->DataOffset); + u8 *end = beg + le32_to_cpu(cc->DataLength); + u8 *sid; + + memset(posix, 0, sizeof(*posix)); + + posix->nlink = le32_to_cpu(*(__le32 *)(beg + 0)); + posix->reparse_tag = le32_to_cpu(*(__le32 *)(beg + 4)); + posix->mode = le32_to_cpu(*(__le32 *)(beg + 8)); - /* TODO: Need to add parsing for the context and return */ - printk_once(KERN_WARNING - "SMB3 3.11 POSIX response context not completed yet\n"); + sid = beg + 12; + sid_len = posix_info_sid_size(sid, end); + if (sid_len < 0) { + cifs_dbg(VFS, "bad owner sid in posix create response\n"); + return; + } + memcpy(&posix->owner, sid, sid_len); + + sid = sid + sid_len; + sid_len = posix_info_sid_size(sid, end); + if (sid_len < 0) { + cifs_dbg(VFS, "bad group sid in posix create response\n"); + return; + } + memcpy(&posix->group, sid, sid_len); + + cifs_dbg(FYI, "nlink=%d mode=%o reparse_tag=%x\n", + posix->nlink, posix->mode, posix->reparse_tag); } void smb2_parse_contexts(struct TCP_Server_Info *server, - struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key, __u8 *oplock, - struct smb2_file_all_info *buf) + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key, __u8 *oplock, + struct smb2_file_all_info *buf, + struct create_posix_rsp *posix) { char *data_offset; struct create_context *cc; @@ -1983,8 +2020,9 @@ smb2_parse_contexts(struct TCP_Server_Info *server, strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0) parse_query_id_ctxt(cc, buf); else if ((le16_to_cpu(cc->NameLength) == 16)) { - if (memcmp(name, smb3_create_tag_posix, 16) == 0) - parse_posix_ctxt(cc, NULL); + if (posix && + memcmp(name, smb3_create_tag_posix, 16) == 0) + parse_posix_ctxt(cc, buf, posix); } /* else { cifs_dbg(FYI, "Context not matched with len %d\n", @@ -2709,6 +2747,7 @@ SMB2_open_free(struct smb_rqst *rqst) int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, + struct create_posix_rsp *posix, struct kvec *err_iov, int *buftype) { struct smb_rqst rqst; @@ -2787,7 +2826,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, smb2_parse_contexts(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key, oplock, buf); + oparms->fid->lease_key, oplock, buf, posix); creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); @@ -3559,7 +3598,7 @@ SMB2_echo(struct TCP_Server_Info *server) if (server->tcpStatus == CifsNeedNegotiate) { /* No need to send echo on newly established connections */ - queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); return rc; } @@ -4286,8 +4325,104 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, return rc; } +int posix_info_sid_size(const void *beg, const void *end) +{ + size_t subauth; + int total; + + if (beg + 1 > end) + return -1; + + subauth = *(u8 *)(beg+1); + if (subauth < 1 || subauth > 15) + return -1; + + total = 1 + 1 + 6 + 4*subauth; + if (beg + total > end) + return -1; + + return total; +} + +int posix_info_parse(const 
void *beg, const void *end, + struct smb2_posix_info_parsed *out) + +{ + int total_len = 0; + int sid_len; + int name_len; + const void *owner_sid; + const void *group_sid; + const void *name; + + /* if no end bound given, assume payload to be correct */ + if (!end) { + const struct smb2_posix_info *p = beg; + + end = beg + le32_to_cpu(p->NextEntryOffset); + /* last element will have a 0 offset, pick a sensible bound */ + if (end == beg) + end += 0xFFFF; + } + + /* check base buf */ + if (beg + sizeof(struct smb2_posix_info) > end) + return -1; + total_len = sizeof(struct smb2_posix_info); + + /* check owner sid */ + owner_sid = beg + total_len; + sid_len = posix_info_sid_size(owner_sid, end); + if (sid_len < 0) + return -1; + total_len += sid_len; + + /* check group sid */ + group_sid = beg + total_len; + sid_len = posix_info_sid_size(group_sid, end); + if (sid_len < 0) + return -1; + total_len += sid_len; + + /* check name len */ + if (beg + total_len + 4 > end) + return -1; + name_len = le32_to_cpu(*(__le32 *)(beg + total_len)); + if (name_len < 1 || name_len > 0xFFFF) + return -1; + total_len += 4; + + /* check name */ + name = beg + total_len; + if (name + name_len > end) + return -1; + total_len += name_len; + + if (out) { + out->base = beg; + out->size = total_len; + out->name_len = name_len; + out->name = name; + memcpy(&out->owner, owner_sid, + posix_info_sid_size(owner_sid, end)); + memcpy(&out->group, group_sid, + posix_info_sid_size(group_sid, end)); + } + return total_len; +} + +static int posix_info_extra_size(const void *beg, const void *end) +{ + int len = posix_info_parse(beg, end, NULL); + + if (len < 0) + return -1; + return len - sizeof(struct smb2_posix_info); +} + static unsigned int -num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) +num_entries(int infotype, char *bufstart, char *end_of_buf, char **lastentry, + size_t size) { int len; unsigned int entrycount = 0; @@ -4311,8 +4446,13 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) entryptr = entryptr + next_offset; dir_info = (FILE_DIRECTORY_INFO *)entryptr; - len = le32_to_cpu(dir_info->FileNameLength); - if (entryptr + len < entryptr || + if (infotype == SMB_FIND_FILE_POSIX_INFO) + len = posix_info_extra_size(entryptr, end_of_buf); + else + len = le32_to_cpu(dir_info->FileNameLength); + + if (len < 0 || + entryptr + len < entryptr || entryptr + len > end_of_buf || entryptr + len + size > end_of_buf) { cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n", @@ -4362,6 +4502,9 @@ int SMB2_query_directory_init(const unsigned int xid, case SMB_FIND_FILE_ID_FULL_DIR_INFO: req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION; break; + case SMB_FIND_FILE_POSIX_INFO: + req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO; + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", info_level); @@ -4427,6 +4570,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, case SMB_FIND_FILE_ID_FULL_DIR_INFO: info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; + case SMB_FIND_FILE_POSIX_INFO: + /* note that posix payload are variable size */ + info_buf_size = sizeof(struct smb2_posix_info); + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); @@ -4436,8 +4583,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), rsp_iov, info_buf_size); - if (rc) + if (rc) { + cifs_tcon_dbg(VFS, "bad 
info payload"); return rc; + } srch_inf->unicode = true; @@ -4451,9 +4600,14 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, srch_inf->srch_entries_start = srch_inf->last_entry = (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset); end_of_smb = rsp_iov->iov_len + (char *)rsp; - srch_inf->entries_in_buffer = - num_entries(srch_inf->srch_entries_start, end_of_smb, - &srch_inf->last_entry, info_buf_size); + + srch_inf->entries_in_buffer = num_entries( + srch_inf->info_level, + srch_inf->srch_entries_start, + end_of_smb, + &srch_inf->last_entry, + info_buf_size); + srch_inf->index_of_last_entry += srch_inf->entries_in_buffer; cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n", srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index fa03df130f1a..10acf90f858d 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -91,6 +91,7 @@ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) +#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) /* * SMB2 Header Definition @@ -119,6 +120,9 @@ struct smb2_sync_hdr { __u8 Signature[16]; } __packed; +/* The total header size for SMB2 read and write */ +#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr)) + struct smb2_sync_pdu { struct smb2_sync_hdr sync_hdr; __le16 StructureSize2; /* size of wct area (varies, request specific) */ @@ -127,16 +131,33 @@ struct smb2_sync_pdu { #define SMB3_AES128CCM_NONCE 11 #define SMB3_AES128GCM_NONCE 12 +/* Transform flags (for 3.0 dialect this flag indicates CCM */ +#define TRANSFORM_FLAG_ENCRYPTED 0x0001 struct smb2_transform_hdr { __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; __u8 Nonce[16]; __le32 OriginalMessageSize; __u16 Reserved1; - __le16 Flags; /* EncryptionAlgorithm */ + __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */ __u64 SessionId; } __packed; +/* See MS-SMB2 2.2.42.1 */ +struct compression_playload_header { + __le16 AlgorithmId; + __le16 Reserved; + __le32 Length; +} __packed; + +/* See MS-SMB2 2.2.42.2 */ +struct compression_pattern_payload_v1 { + __le16 Pattern; + __le16 Reserved1; + __le16 Reserved2; + __le32 Repetitions; +} __packed; + /* * SMB2 flag definitions */ @@ -182,7 +203,7 @@ struct smb2_symlink_err_rsp { __le16 PrintNameOffset; __le16 PrintNameLength; __le32 Flags; - __u8 PathBuffer[0]; + __u8 PathBuffer[]; } __packed; /* SMB 3.1.1 and later dialects. 
See MS-SMB2 section 2.2.2.1 */ @@ -192,6 +213,10 @@ struct smb2_error_context_rsp { __u8 ErrorContextData; /* ErrorDataLength long array */ } __packed; +/* ErrorId values */ +#define SMB2_ERROR_ID_DEFAULT 0x00000000 +#define SMB2_ERROR_ID_SHARE_REDIRECT cpu_to_le32(0x72645253) /* "rdRS" */ + /* Defines for Type field below (see MS-SMB2 2.2.2.2.2.1) */ #define MOVE_DST_IPADDR_V4 cpu_to_le32(0x00000001) #define MOVE_DST_IPADDR_V6 cpu_to_le32(0x00000002) @@ -210,7 +235,7 @@ struct share_redirect_error_context_rsp { __le16 Flags; __le16 TargetType; __le32 IPAddrCount; - struct move_dst_ipaddr IpAddrMoveList[0]; + struct move_dst_ipaddr IpAddrMoveList[]; /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; @@ -307,11 +332,17 @@ struct smb2_encryption_neg_context { #define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) #define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) #define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) +/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ +#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) + +/* Compression Flags */ +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) struct smb2_compression_capabilities_context { __le16 ContextType; /* 3 */ __le16 DataLength; - __u32 Reserved; + __u32 Flags; __le16 CompressionAlgorithmCount; __u16 Padding; __u32 Reserved1; @@ -326,7 +357,7 @@ struct smb2_netname_neg_context { __le16 ContextType; /* 0x100 */ __le16 DataLength; __le32 Reserved; - __le16 NetName[0]; /* hostname of target converted to UCS-2 */ + __le16 NetName[]; /* hostname of target converted to UCS-2 */ } __packed; #define POSIX_CTXT_DATA_LEN 16 @@ -406,7 +437,7 @@ struct smb2_logoff_rsp { struct smb2_tree_connect_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ - __le16 Reserved; /* Flags in SMB3.1.1 */ + __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; __u8 Buffer[1]; /* variable length */ @@ -421,13 +452,13 @@ struct tree_connect_contexts { __le16 ContextType; __le16 DataLength; __le32 Reserved; - __u8 Data[0]; + __u8 Data[]; } __packed; /* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ struct smb3_blob_data { __le16 BlobSize; - __u8 BlobData[0]; + __u8 BlobData[]; } __packed; /* Valid values for Attr */ @@ -477,14 +508,14 @@ struct remoted_identity_tcon_context { __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */ __le16 UserClaims; /* offset to BLOB_DATA struct */ __le16 DeviceClaims; /* offset to BLOB_DATA struct */ - __u8 TicketInfo[0]; /* variable length buf - remoted identity data */ + __u8 TicketInfo[]; /* variable length buf - remoted identity data */ } __packed; struct smb2_tree_connect_req_extension { __le32 TreeConnectContextOffset; __le16 TreeConnectContextCount; __u8 Reserved[10]; - __u8 PathName[0]; /* variable sized array */ + __u8 PathName[]; /* variable sized array */ /* followed by array of TreeConnectContexts */ } __packed; @@ -633,7 +664,7 @@ struct smb2_tree_disconnect_rsp { | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) #define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) -/* Impersonation Levels */ +/* Impersonation Levels. 
See MS-WPO section 9.7 and MSDN-IMPERS */ #define IL_ANONYMOUS cpu_to_le32(0x00000000) #define IL_IDENTIFICATION cpu_to_le32(0x00000001) #define IL_IMPERSONATION cpu_to_le32(0x00000002) @@ -689,7 +720,7 @@ struct smb2_create_req { __le16 NameLength; __le32 CreateContextsOffset; __le32 CreateContextsLength; - __u8 Buffer[0]; + __u8 Buffer[]; } __packed; /* @@ -727,7 +758,7 @@ struct create_context { __le16 Reserved; __le16 DataOffset; __le32 DataLength; - __u8 Buffer[0]; + __u8 Buffer[]; } __packed; #define SMB2_LEASE_READ_CACHING_HE 0x01 @@ -739,7 +770,7 @@ struct create_context { #define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) #define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002) #define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004) #define SMB2_LEASE_KEY_SIZE 16 @@ -869,7 +900,7 @@ struct crt_sd_ctxt { struct resume_key_req { char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; __le32 ContextLength; /* MBZ */ - char Context[0]; /* ignored, Windows sets to 4 bytes of zero */ + char Context[]; /* ignored, Windows sets to 4 bytes of zero */ } __packed; /* this goes in the ioctl buffer when doing a copychunk request */ @@ -931,7 +962,7 @@ struct reparse_data_buffer { __le32 ReparseTag; __le16 ReparseDataLength; __u16 Reserved; - __u8 DataBuffer[0]; /* Variable Length */ + __u8 DataBuffer[]; /* Variable Length */ } __packed; struct reparse_guid_data_buffer { @@ -939,7 +970,7 @@ struct reparse_guid_data_buffer { __le16 ReparseDataLength; __u16 Reserved; __u8 ReparseGuid[16]; - __u8 DataBuffer[0]; /* Variable Length */ + __u8 DataBuffer[]; /* Variable Length */ } __packed; struct reparse_mount_point_data_buffer { @@ -950,7 +981,7 @@ struct reparse_mount_point_data_buffer { __le16 SubstituteNameLength; __le16 PrintNameOffset; __le16 PrintNameLength; - __u8 PathBuffer[0]; /* Variable Length */ + __u8 PathBuffer[]; /* Variable Length */ } __packed; #define SYMLINK_FLAG_RELATIVE 0x00000001 @@ -964,7 +995,7 @@ struct reparse_symlink_data_buffer { __le16 PrintNameOffset; __le16 PrintNameLength; __le32 Flags; - __u8 PathBuffer[0]; /* Variable Length */ + __u8 PathBuffer[]; /* Variable Length */ } __packed; /* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ @@ -1066,7 +1097,7 @@ struct smb2_ioctl_req { __le32 MaxOutputResponse; __le32 Flags; __u32 Reserved2; - __u8 Buffer[0]; + __u8 Buffer[]; } __packed; struct smb2_ioctl_rsp { @@ -1180,7 +1211,7 @@ struct smb2_write_req { __le64 Offset; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ - __le32 Channel; /* Reserved MBZ */ + __le32 Channel; /* MBZ unless SMB3.02 or later */ __le32 RemainingBytes; __le16 WriteChannelInfoOffset; __le16 WriteChannelInfoLength; @@ -1469,7 +1500,7 @@ struct smb3_fs_vol_info { __le32 VolumeLabelLength; /* includes trailing null */ __u8 SupportsObjects; /* True if eg like NTFS, supports objects */ __u8 Reserved; - __u8 VolumeLabel[0]; /* variable len */ + __u8 VolumeLabel[]; /* variable len */ } __packed; /* partial list of QUERY INFO levels */ @@ -1531,7 +1562,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */ __u8 Reserved[7]; __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ __le32 FileNameLength; - char FileName[0]; /* New name to be assigned */ + char FileName[]; /* New name to be assigned */ } __packed; /* level 10 Set */ struct smb2_file_link_info { /* encoding of request for level 11 */ @@ -1540,7 +1571,7 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ __u8 Reserved[7]; __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ __le32 FileNameLength; - char FileName[0]; /* Name to be assigned to new link */ + char FileName[]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ struct smb2_file_full_ea_info { /* encoding of response for level 15 */ @@ -1548,7 +1579,7 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __u8 flags; __u8 ea_name_length; __le16 ea_value_length; - char ea_data[0]; /* \0 terminated name plus value */ + char ea_data[]; /* \0 terminated name plus value */ } __packed; /* level 15 Set */ /* @@ -1604,11 +1635,56 @@ struct smb2_file_id_information { extern char smb2_padding[7]; /* equivalent of the contents of SMB3.1.1 POSIX open context response */ -struct smb_posix_info { - __le32 nlink; - __le32 reparse_tag; - __le32 mode; - kuid_t uid; - kuid_t gid; +struct create_posix_rsp { + u32 nlink; + u32 reparse_tag; + u32 mode; + struct cifs_sid owner; /* var-sized on the wire */ + struct cifs_sid group; /* var-sized on the wire */ +} __packed; + +/* + * SMB2-only POSIX info level + * + * See posix_info_sid_size(), posix_info_extra_size() and + * posix_info_parse() to help with the handling of this struct. + */ +struct smb2_posix_info { + __le32 NextEntryOffset; + __u32 Ignored; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 DosAttributes; + __le64 Inode; + __le32 DeviceId; + __le32 Zero; + /* beginning of POSIX Create Context Response */ + __le32 HardLinks; + __le32 ReparseTag; + __le32 Mode; + /* + * var sized owner SID + * var sized group SID + * le32 filenamelength + * u8 filename[] + */ +} __packed; + +/* + * Parsed version of the above struct. 
Allows direct access to the + * variable length fields + */ +struct smb2_posix_info_parsed { + const struct smb2_posix_info *base; + size_t size; + struct cifs_sid owner; + struct cifs_sid group; + int name_len; + const u8 *name; }; + #endif /* _SMB2PDU_H */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index de6388ef344f..4d1ff7b66fdc 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -139,6 +139,7 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, + struct create_posix_rsp *posix, struct kvec *err_iov, int *resp_buftype); extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, struct cifs_open_parms *oparms, @@ -252,7 +253,8 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, extern void smb2_parse_contexts(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, unsigned int *epoch, char *lease_key, - __u8 *oplock, struct smb2_file_all_info *buf); + __u8 *oplock, struct smb2_file_all_info *buf, + struct create_posix_rsp *posix); extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); @@ -272,4 +274,7 @@ extern int smb2_query_info_compound(const unsigned int xid, u32 class, u32 type, u32 output_len, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); +int posix_info_parse(const void *beg, const void *end, + struct smb2_posix_info_parsed *out); +int posix_info_sid_size(const void *beg, const void *end); #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 08b703b7a15e..20cc79e5c15d 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -602,7 +602,7 @@ int smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { unsigned int rc; - char server_response_sig[16]; + char server_response_sig[SMB2_SIGNATURE_SIZE]; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; @@ -638,9 +638,11 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (rc) return rc; - if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) + if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) { + dump_stack(); + cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n", shdr->Command, shdr->MessageId); return -EACCES; - else + } else return 0; } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5b1b97e9e0c9..8da43a500686 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -459,25 +459,6 @@ static void smbd_post_send_credits(struct work_struct *work) check_and_send_immediate(info); } -static void smbd_recv_done_work(struct work_struct *work) -{ - struct smbd_connection *info = - container_of(work, struct smbd_connection, recv_done_work); - - /* - * We may have new send credits granted from remote peer - * If any sender is blcoked on lack of credets, unblock it - */ - if (atomic_read(&info->send_credits)) - wake_up_interruptible(&info->wait_send_queue); - - /* - * Check if we need to send something to remote peer to - * grant more credits or respond to KEEP_ALIVE packet - */ - check_and_send_immediate(info); -} - /* Called from softirq, when recv is done */ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { @@ -546,8 +527,15 @@ static void recv_done(struct 
ib_cq *cq, struct ib_wc *wc) atomic_dec(&info->receive_credits); info->receive_credit_target = le16_to_cpu(data_transfer->credits_requested); - atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); + if (le16_to_cpu(data_transfer->credits_granted)) { + atomic_add(le16_to_cpu(data_transfer->credits_granted), + &info->send_credits); + /* + * We have new send credits granted from remote peer + * If any sender is waiting for credits, unblock it + */ + wake_up_interruptible(&info->wait_send_queue); + } log_incoming(INFO, "data flags %d data_offset %d " "data_length %d remaining_data_length %d\n", @@ -563,7 +551,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) info->keep_alive_requested = KEEP_ALIVE_PENDING; } - queue_work(info->workqueue, &info->recv_done_work); + /* + * Check if we need to send something to remote peer to + * grant more credits or respond to KEEP_ALIVE packet + */ + check_and_send_immediate(info); + return; default: @@ -1762,7 +1755,6 @@ static struct smbd_connection *_smbd_get_connection( atomic_set(&info->send_payload_pending, 0); INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); - INIT_WORK(&info->recv_done_work, smbd_recv_done_work); INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); info->new_credits_offered = 0; spin_lock_init(&info->lock_new_credits_offered); @@ -2097,8 +2089,7 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length + sizeof(struct smbd_data_transfer) > - info->max_fragmented_send_size) { + if (remaining_data_length > info->max_fragmented_send_size) { log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); rc = -EINVAL; diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index 6ff880a1e186..8ede915f2b24 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -67,7 +67,6 @@ struct smbd_connection { bool negotiate_done; struct work_struct disconnect_work; - struct work_struct recv_done_work; struct work_struct post_send_credits_work; spinlock_t lock_new_credits_offered; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index cb3ee916f527..c97570eb2c18 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -466,7 +466,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst, int flags) { struct kvec iov; - struct smb2_transform_hdr tr_hdr; + struct smb2_transform_hdr *tr_hdr; struct smb_rqst cur_rqst[MAX_COMPOUND]; int rc; @@ -476,28 +476,34 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (num_rqst > MAX_COMPOUND - 1) return -ENOMEM; - memset(&cur_rqst[0], 0, sizeof(cur_rqst)); - memset(&iov, 0, sizeof(iov)); - memset(&tr_hdr, 0, sizeof(tr_hdr)); - - iov.iov_base = &tr_hdr; - iov.iov_len = sizeof(tr_hdr); - cur_rqst[0].rq_iov = &iov; - cur_rqst[0].rq_nvec = 1; - if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform " "callback is missing\n"); return -EIO; } + tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS); + if (!tr_hdr) + return -ENOMEM; + + memset(&cur_rqst[0], 0, sizeof(cur_rqst)); + memset(&iov, 0, sizeof(iov)); + memset(tr_hdr, 0, sizeof(*tr_hdr)); + + iov.iov_base = tr_hdr; + iov.iov_len = sizeof(*tr_hdr); + cur_rqst[0].rq_iov = &iov; + cur_rqst[0].rq_nvec = 1; + rc = server->ops->init_transform_rq(server, num_rqst + 1, &cur_rqst[0], rqst); if (rc) - return rc; + goto out; rc = __smb_send_rqst(server, num_rqst + 1, 
&cur_rqst[0]); smb3_free_compound_rqst(num_rqst, &cur_rqst[1]); +out: + kfree(tr_hdr); return rc; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 9aae851409e5..dbced2937ec8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -76,6 +76,26 @@ static inline int fscrypt_context_size(const union fscrypt_context *ctx) return 0; } +/* Check whether an fscrypt_context has a recognized version number and size */ +static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, + int ctx_size) +{ + return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); +} + +/* Retrieve the context's nonce, assuming the context was already validated */ +static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) +{ + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + return ctx->v1.nonce; + case FSCRYPT_CONTEXT_V2: + return ctx->v2.nonce; + } + WARN_ON(1); + return NULL; +} + #undef fscrypt_policy union fscrypt_policy { u8 version; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 08c9f216a54d..302375e9f719 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -425,20 +425,8 @@ int fscrypt_get_encryption_info(struct inode *inode) goto out; } - switch (ctx.version) { - case FSCRYPT_CONTEXT_V1: - memcpy(crypt_info->ci_nonce, ctx.v1.nonce, - FS_KEY_DERIVATION_NONCE_SIZE); - break; - case FSCRYPT_CONTEXT_V2: - memcpy(crypt_info->ci_nonce, ctx.v2.nonce, - FS_KEY_DERIVATION_NONCE_SIZE); - break; - default: - WARN_ON(1); - res = -EINVAL; - goto out; - } + memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx), + FS_KEY_DERIVATION_NONCE_SIZE); if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { res = -EINVAL; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index cf2a9d26ef7d..10ccf945020c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -258,7 +258,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, { memset(policy_u, 0, sizeof(*policy_u)); - if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u)) + if (!fscrypt_context_is_valid(ctx_u, ctx_size)) return -EINVAL; switch (ctx_u->version) { @@ -481,6 +481,25 @@ int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex); +/* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */ +int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) +{ + struct inode *inode = file_inode(filp); + union fscrypt_context ctx; + int ret; + + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret < 0) + return ret; + if (!fscrypt_context_is_valid(&ctx, ret)) + return -EINVAL; + if (copy_to_user(arg, fscrypt_context_nonce(&ctx), + FS_KEY_DERIVATION_NONCE_SIZE)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce); + /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted * within its directory? 
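
[Editor's note: the fscrypt hunks above add FS_IOC_GET_ENCRYPTION_NONCE, which copies a file's 16-byte key-derivation nonce to userspace for testing. A minimal userspace sketch follows; it assumes a kernel carrying this patch and a <linux/fscrypt.h> that defines the ioctl, and treats the 16-byte nonce length as a fixed assumption.]

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fscrypt.h>	/* assumed to provide FS_IOC_GET_ENCRYPTION_NONCE */

int main(int argc, char **argv)
{
	unsigned char nonce[16];	/* FS_KEY_DERIVATION_NONCE_SIZE on the kernel side */
	int fd, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <encrypted-file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GET_ENCRYPTION_NONCE, nonce) != 0) {
		perror("FS_IOC_GET_ENCRYPTION_NONCE");
		close(fd);
		return 1;
	}
	for (i = 0; i < 16; i++)
		printf("%02x", nonce[i]);	/* print the nonce as hex */
	putchar('\n');
	close(fd);
	return 0;
}

[End of note; the diff resumes below.]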
@@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -static bool dax_range_is_aligned(struct block_device *bdev, - unsigned int offset, unsigned int length) +int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, + struct iomap *iomap) { - unsigned short sector_size = bdev_logical_block_size(bdev); + sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); + pgoff_t pgoff; + long rc, id; + void *kaddr; + bool page_aligned = false; - if (!IS_ALIGNED(offset, sector_size)) - return false; - if (!IS_ALIGNED(length, sector_size)) - return false; - return true; -} + if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && + IS_ALIGNED(size, PAGE_SIZE)) + page_aligned = true; -int __dax_zero_page_range(struct block_device *bdev, - struct dax_device *dax_dev, sector_t sector, - unsigned int offset, unsigned int size) -{ - if (dax_range_is_aligned(bdev, offset, size)) { - sector_t start_sector = sector + (offset >> 9); + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); + if (rc) + return rc; - return blkdev_issue_zeroout(bdev, start_sector, - size >> 9, GFP_NOFS, 0); - } else { - pgoff_t pgoff; - long rc, id; - void *kaddr; + id = dax_read_lock(); - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; + if (page_aligned) + rc = dax_zero_page_range(iomap->dax_dev, pgoff, + size >> PAGE_SHIFT); + else + rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); - if (rc < 0) { - dax_read_unlock(id); - return rc; - } + if (!page_aligned) { memset(kaddr + offset, 0, size); - dax_flush(dax_dev, kaddr + offset, size); - dax_read_unlock(id); + dax_flush(iomap->dax_dev, kaddr + offset, size); } + dax_read_unlock(id); return 0; } -EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index db987b5110a9..2d357680094c 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> +#include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> @@ -175,8 +176,13 @@ static int open_proxy_open(struct inode *inode, struct file *filp) if (r) goto out; - real_fops = fops_get(real_fops); - if (!real_fops) { + if (!fops_get(real_fops)) { +#ifdef MODULE + if (real_fops->owner && + real_fops->owner->state == MODULE_STATE_GOING) + goto out; +#endif + /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); @@ -305,8 +311,13 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (r) goto out; - real_fops = fops_get(real_fops); - if (!real_fops) { + if (!fops_get(real_fops)) { +#ifdef MODULE + if (real_fops->owner && + real_fops->owner->state == MODULE_STATE_GOING) + goto out; +#endif + /* Huh? Module did not cleanup after itself at exit? 
*/ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); @@ -1060,7 +1071,14 @@ static int debugfs_show_regset32(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; + if (regset->dev) + pm_runtime_get_sync(regset->dev); + debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); + + if (regset->dev) + pm_runtime_put(regset->dev); + return 0; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e742dfc66933..b7f2e971ecbc 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -501,26 +501,16 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. */ -struct dentry *debugfs_create_file_size(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops, - loff_t file_size) +void debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (de) d_inode(de)->i_size = file_size; - return de; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index fa4f6447ddad..12c66f5d92dd 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -252,7 +252,7 @@ static struct file_system_type efivarfs_type = { static __init int efivarfs_init(void) { - if (!efi_enabled(EFI_RUNTIME_SERVICES)) + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) return -ENODEV; if (!efivars_kobject()) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5779a15c2cd6..5d2d81940679 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -157,17 +157,27 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } } - ret = LZ4_decompress_safe_partial(src + inputmargin, out, - inlen, rq->outputsize, - rq->outputsize); - if (ret < 0) { - erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]", - inlen, inputmargin, rq->outputsize); + /* legacy format could compress extra data in a pcluster. 
*/ + if (rq->partial_decoding || !support_0padding) + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + inlen, rq->outputsize, + rq->outputsize); + else + ret = LZ4_decompress_safe(src + inputmargin, out, + inlen, rq->outputsize); + + if (ret != rq->outputsize) { + erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", + ret, inlen, inputmargin, rq->outputsize); + WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, 16, 1, src + inputmargin, inlen, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, 16, 1, out, rq->outputsize, true); + + if (ret >= 0) + memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index c4c6dcdc89ad..5eead7fdc7a6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -52,8 +52,8 @@ struct erofs_sb_info { struct list_head list; struct mutex umount_mutex; - /* the dedicated workstation for compression */ - struct radix_tree_root workstn_tree; + /* managed XArray arranged in physical block number */ + struct xarray managed_pslots; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -402,8 +402,8 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr) int erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index); -int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp); void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 057e6d7b5b7f..b514c67e5fc2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -425,7 +425,7 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~SB_POSIXACL; #ifdef CONFIG_EROFS_FS_ZIP - INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); + xa_init(&sbi->managed_pslots); #endif /* get the root inode */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index fddc5059c930..52d0be10f1aa 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -37,9 +37,6 @@ void *erofs_get_pcpubuf(unsigned int pagenr) /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) -#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) - static int erofs_workgroup_get(struct erofs_workgroup *grp) { int o; @@ -66,7 +63,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, repeat: rcu_read_lock(); - grp = radix_tree_lookup(&sbi->workstn_tree, index); + grp = xa_load(&sbi->managed_pslots, index); if (grp) { if (erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ @@ -80,43 +77,37 @@ repeat: return grp; } -int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp) { - struct erofs_sb_info *sbi; - int err; - - /* grp shouldn't be broken or used before */ - if (atomic_read(&grp->refcount) != 1) { - DBG_BUGON(1); - return -EINVAL; - } - - err = radix_tree_preload(GFP_NOFS); - if (err) - return err; - - sbi = EROFS_SB(sb); - xa_lock(&sbi->workstn_tree); + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_workgroup *pre; /* - 
* Bump up reference count before making this workgroup - * visible to other users in order to avoid potential UAF - * without serialized by workstn_lock. + * Bump up a reference count before making this visible + * to others for the XArray in order to avoid potential + * UAF without serialized by xa_lock. */ - __erofs_workgroup_get(grp); - - err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); - if (err) - /* - * it's safe to decrease since the workgroup isn't visible - * and refcount >= 2 (cannot be freezed). - */ - __erofs_workgroup_put(grp); + atomic_inc(&grp->refcount); - xa_unlock(&sbi->workstn_tree); - radix_tree_preload_end(); - return err; +repeat: + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, + NULL, grp, GFP_NOFS); + if (pre) { + if (xa_is_err(pre)) { + pre = ERR_PTR(xa_err(pre)); + } else if (erofs_workgroup_get(pre)) { + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); + goto repeat; + } + atomic_dec(&grp->refcount); + grp = pre; + } + xa_unlock(&sbi->managed_pslots); + return grp; } static void __erofs_workgroup_free(struct erofs_workgroup *grp) @@ -155,7 +146,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, /* * Note that all cached pages should be unattached - * before deleted from the radix tree. Otherwise some + * before deleted from the XArray. Otherwise some * cached pages could be still attached to the orphan * old workgroup when the new one is available in the tree. */ @@ -169,7 +160,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp); + DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp); /* * If managed cache is on, last refcount should indicate @@ -182,22 +173,11 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned long nr_shrink) { - pgoff_t first_index = 0; - void *batch[PAGEVEC_SIZE]; + struct erofs_workgroup *grp; unsigned int freed = 0; + unsigned long index; - int i, found; -repeat: - xa_lock(&sbi->workstn_tree); - - found = radix_tree_gang_lookup(&sbi->workstn_tree, - batch, first_index, PAGEVEC_SIZE); - - for (i = 0; i < found; ++i) { - struct erofs_workgroup *grp = batch[i]; - - first_index = grp->index + 1; - + xa_for_each(&sbi->managed_pslots, index, grp) { /* try to shrink each valid workgroup */ if (!erofs_try_to_release_workgroup(sbi, grp)) continue; @@ -206,10 +186,6 @@ repeat: if (!--nr_shrink) break; } - xa_unlock(&sbi->workstn_tree); - - if (i && nr_shrink) - goto repeat; return freed; } @@ -286,7 +262,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; - freed += erofs_shrink_workstation(sbi, nr); + freed += erofs_shrink_workstation(sbi, nr - freed); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 80e47f07d946..c4b6c9aa87ec 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -67,16 +67,6 @@ static void z_erofs_pcluster_init_once(void *ptr) pcl->compressed_pages[i] = NULL; } -static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) -{ - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - - atomic_set(&pcl->obj.refcount, 
1); - - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); -} - int __init z_erofs_init_zip_subsystem(void) { pcluster_cachep = kmem_cache_create("erofs_compress", @@ -341,26 +331,19 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { - struct erofs_workgroup *grp; - struct z_erofs_pcluster *pcl; + struct z_erofs_pcluster *pcl = clt->pcl; struct z_erofs_collection *cl; unsigned int length; - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); - if (!grp) - return -ENOENT; - - pcl = container_of(grp, struct z_erofs_pcluster, obj); + /* to avoid unexpected loop formed by corrupted images */ if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } cl = z_erofs_primarycollection(pcl); if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } @@ -368,7 +351,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } } else { @@ -391,7 +373,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) clt->tailpcl = NULL; - clt->pcl = pcl; clt->cl = cl; return 0; } @@ -402,6 +383,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, { struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; + struct erofs_workgroup *grp; int err; /* no available workgroup, let's allocate one */ @@ -409,7 +391,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, if (!pcl) return -ENOMEM; - z_erofs_pcluster_init_always(pcl); + atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | @@ -429,19 +411,29 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, clt->mode = COLLECT_PRIMARY_FOLLOWED; cl = z_erofs_primarycollection(pcl); + + /* must be cleaned before freeing to slab */ + DBG_BUGON(cl->nr_pages); + DBG_BUGON(cl->vcnt); + cl->pageofs = map->m_la & ~PAGE_MASK; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
*/ - mutex_trylock(&cl->lock); + DBG_BUGON(!mutex_trylock(&cl->lock)); - err = erofs_register_workgroup(inode->i_sb, &pcl->obj); - if (err) { - mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); - return -EAGAIN; + grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -450,12 +442,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, clt->pcl = pcl; clt->cl = cl; return 0; + +err_out: + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return err; } static int z_erofs_collector_begin(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_workgroup *grp; int ret; DBG_BUGON(clt->cl); @@ -469,21 +467,25 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, return -EINVAL; } -repeat: - ret = z_erofs_lookup_collection(clt, inode, map); - if (ret == -ENOENT) { + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); + if (grp) { + clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); + } else { ret = z_erofs_register_collection(clt, inode, map); - /* someone registered at the same time, give another try */ - if (ret == -EAGAIN) { - cond_resched(); - goto repeat; - } + if (!ret) + goto out; + if (ret != -EEXIST) + return ret; } - if (ret) + ret = z_erofs_lookup_collection(clt, inode, map); + if (ret) { + erofs_workgroup_put(&clt->pcl->obj); return ret; + } +out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eee3c92a9ebf..8c596641a72b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,13 +218,18 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - int visited; struct list_head visited_list_link; + int visited; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* tracks wakeup nests for lockdep validation */ + u8 nests; +#endif }; /* Wait structure used by the poll hooks */ @@ -545,30 +550,47 @@ out_unlock: */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static DEFINE_PER_CPU(int, wakeup_nest); - -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { + struct eventpoll *ep_src; unsigned long flags; - int subclass; + u8 nests = 0; - local_irq_save(flags); - preempt_disable(); - subclass = __this_cpu_read(wakeup_nest); - spin_lock_nested(&wq->lock, subclass + 1); - __this_cpu_inc(wakeup_nest); - wake_up_locked_poll(wq, POLLIN); - __this_cpu_dec(wakeup_nest); - spin_unlock(&wq->lock); - local_irq_restore(flags); - preempt_enable(); + /* + * To set the subclass or nesting level for spin_lock_irqsave_nested() + * it might be natural to create a per-cpu nest count. However, since + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can + * schedule() in the -rt kernel, the per-cpu variable are no longer + * protected. Thus, we are introducing a per eventpoll nest field. + * If we are not being call from ep_poll_callback(), epi is NULL and + * we are at the first level of nesting, 0. 
Otherwise, we are being + * called from ep_poll_callback() and if a previous wakeup source is + * not an epoll file itself, we are at depth 1 since the wakeup source + * is depth 0. If the wakeup source is a previous epoll file in the + * wakeup chain then we use its nests value and record ours as + * nests + 1. The previous epoll file nests value is stable since its + * already holding its own poll_wait.lock. + */ + if (epi) { + if ((is_file_epoll(epi->ffd.file))) { + ep_src = epi->ffd.file->private_data; + nests = ep_src->nests; + } else { + nests = 1; + } + } + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); + ep->nests = nests + 1; + wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + ep->nests = 0; + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { - wake_up_poll(wq, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN); } #endif @@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); /* * We need to lock this because we could be hit by @@ -1258,7 +1280,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, epi); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; @@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index db17be51b112..06b4c550af5d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -985,6 +985,32 @@ int kernel_read_file_from_path(const char *path, void **buf, loff_t *size, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); +int kernel_read_file_from_path_initns(const char *path, void **buf, + loff_t *size, loff_t max_size, + enum kernel_read_file_id id) +{ + struct file *file; + struct path root; + int ret; + + if (!path || !*path) + return -EINVAL; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + path_put(&root); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, buf, size, max_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); + int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size, enum kernel_read_file_id id) { @@ -1010,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) } EXPORT_SYMBOL(read_code); +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with the mutex + * exec_update_mutex locked. 
+ */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + if (ret) + return ret; + if (old_mm) { sync_mm_rss(old_mm); /* @@ -1031,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); + mutex_unlock(&tsk->signal->exec_update_mutex); return -EINTR; } } + task_lock(tsk); active_mm = tsk->active_mm; membarrier_exec_mmap(mm); @@ -1189,10 +1227,22 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(sig); - flush_itimer_signals(); -#endif + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + + +static int unshare_sighand(struct task_struct *me) +{ + struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1210,23 +1260,13 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - rcu_assign_pointer(tsk->sighand, newsighand); + rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } - - BUG_ON(!thread_group_leader(tsk)); return 0; - -killed: - /* protects against exit_notify() and __exit_signal() */ - read_lock(&tasklist_lock); - sig->group_exit_task = NULL; - sig->notify_count = 0; - read_unlock(&tasklist_lock); - return -EAGAIN; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) @@ -1260,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) */ int flush_old_exec(struct linux_binprm * bprm) { + struct task_struct *me = current; int retval; /* - * Make sure we have a private signal table and that - * we are unassociated from the previous thread group. + * Make this the only thread in the thread group. */ - retval = de_thread(current); + retval = de_thread(me); if (retval) goto out; @@ -1286,18 +1326,31 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After clearing bprm->mm (to mark that current is using the - * prepared mm now), we have nothing left of the original + * After setting bprm->called_exec_mmap (to mark that current is + * using the prepared mm now), we have nothing left of the original * process. If anything from here on returns an error, the check * in search_binary_handler() will SEGV current. */ + bprm->called_exec_mmap = 1; bprm->mm = NULL; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + + /* + * Make the signal table private. 
+ */ + retval = unshare_sighand(me); + if (retval) + goto out; + set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); - current->personality &= ~bprm->per_clear; + me->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is @@ -1305,7 +1358,7 @@ int flush_old_exec(struct linux_binprm * bprm) * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ - do_close_on_exec(current->files); + do_close_on_exec(me->files); return 0; out: @@ -1386,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; + WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1); flush_signal_handlers(current, 0); } EXPORT_SYMBOL(setup_new_exec); @@ -1424,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { + if (bprm->called_exec_mmap) + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1473,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1664,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && !bprm->mm) { + if (retval < 0 && bprm->called_exec_mmap) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig new file mode 100644 index 000000000000..2d3636dc5b8c --- /dev/null +++ b/fs/exfat/Kconfig @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config EXFAT_FS + tristate "exFAT filesystem support" + select NLS + help + This allows you to mount devices formatted with the exFAT file system. + exFAT is typically used on SD-Cards or USB sticks. + + To compile this as a module, choose M here: the module will be called + exfat. + +config EXFAT_DEFAULT_IOCHARSET + string "Default iocharset for exFAT" + default "utf8" + depends on EXFAT_FS + help + Set this to the default input/output character set to use for + converting between the encoding is used for user visible filename and + UTF-16 character that exfat filesystem use, and can be overridden with + the "iocharset" mount option for exFAT filesystems. diff --git a/fs/exfat/Makefile b/fs/exfat/Makefile new file mode 100644 index 000000000000..ed51926a4971 --- /dev/null +++ b/fs/exfat/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Makefile for the linux exFAT filesystem support. +# +obj-$(CONFIG_EXFAT_FS) += exfat.o + +exfat-y := inode.o namei.o dir.o super.o fatent.o cache.o nls.o misc.o \ + file.o balloc.o diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c new file mode 100644 index 000000000000..6a04cc02565a --- /dev/null +++ b/fs/exfat/balloc.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static const unsigned char free_bit[] = { + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/ + 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/ + 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/ + 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/ + 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/ +}; + +static const unsigned char used_bit[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/ + 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/ + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/ + 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/ + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/ + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/ + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/ + 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/ + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/ + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/ + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/ + 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/ + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/ +}; + +/* + * Allocation Bitmap Management Functions + */ +static int exfat_allocate_bitmap(struct super_block *sb, + struct exfat_dentry *ep) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + long long map_size; + unsigned int i, need_map_size; + sector_t sector; + + sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu); + map_size = le64_to_cpu(ep->dentry.bitmap.size); + need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE) + + 1; + if (need_map_size != map_size) { + exfat_msg(sb, KERN_ERR, + "bogus allocation bitmap size(need : %u, cur : %lld)", + need_map_size, map_size); + /* + * Only allowed when bogus allocation + * bitmap size is large + */ + if (need_map_size > map_size) + return -EIO; + } + sbi->map_sectors = ((need_map_size - 1) >> + (sb->s_blocksize_bits)) + 1; + sbi->vol_amap = kmalloc_array(sbi->map_sectors, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!sbi->vol_amap) + return -ENOMEM; + + sector = exfat_cluster_to_sector(sbi, sbi->map_clu); + for (i = 0; i < sbi->map_sectors; i++) { + sbi->vol_amap[i] = sb_bread(sb, sector + i); + if (!sbi->vol_amap[i]) { + /* release all buffers and free vol_amap */ + int j = 0; + + while (j < i) + brelse(sbi->vol_amap[j++]); + + kfree(sbi->vol_amap); + sbi->vol_amap = NULL; + return -EIO; + } + } + + sbi->pbr_bh = NULL; + return 0; +} + +int exfat_load_bitmap(struct super_block *sb) +{ + unsigned int i, type; + 
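
[Editor's note: the free_bit[] and used_bit[] tables in fs/exfat/balloc.c above encode, for every possible bitmap byte, the position of the lowest clear bit (used when searching for a free cluster) and the number of set bits (used when counting allocated clusters). The hedged sketch below derives the same two values at run time instead of from lookup tables, to make the tables' meaning explicit; it is not code from this patch.]

#include <stdio.h>

/* index of the lowest clear bit in b, matching free_bit[]; b must not be 0xFF */
static unsigned int lowest_clear_bit(unsigned char b)
{
	unsigned int i;

	for (i = 0; i < 8; i++)
		if (!(b & (1u << i)))
			return i;
	return 8;	/* no free bit; callers check for 0xFF before using the table */
}

/* number of set bits in b, matching used_bit[] */
static unsigned int popcount8(unsigned char b)
{
	unsigned int n = 0;

	while (b) {
		n += b & 1;
		b >>= 1;
	}
	return n;
}

int main(void)
{
	unsigned char b = 0x97;	/* 1001 0111: bits 0, 1, 2, 4, 7 set */

	printf("offset of first free cluster in this byte: %u\n", lowest_clear_bit(b));	/* 3 */
	printf("clusters in use in this byte: %u\n", popcount8(b));			/* 5 */
	return 0;
}

[End of note; the diff resumes below.]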
struct exfat_chain clu; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++) { + struct exfat_dentry *ep; + struct buffer_head *bh; + + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) + break; + if (type != TYPE_BITMAP) + continue; + if (ep->dentry.bitmap.flags == 0x0) { + int err; + + err = exfat_allocate_bitmap(sb, ep); + brelse(bh); + return err; + } + brelse(bh); + } + + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + + return -EINVAL; +} + +void exfat_free_bitmap(struct exfat_sb_info *sbi) +{ + int i; + + brelse(sbi->pbr_bh); + + for (i = 0; i < sbi->map_sectors; i++) + __brelse(sbi->vol_amap[i]); + + kfree(sbi->vol_amap); +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. + */ +int exfat_set_bitmap(struct inode *inode, unsigned int clu) +{ + int i, b; + unsigned int ent_idx; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + set_bit_le(b, sbi->vol_amap[i]->b_data); + exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode)); + return 0; +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. + */ +void exfat_clear_bitmap(struct inode *inode, unsigned int clu) +{ + int i, b; + unsigned int ent_idx; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *opts = &sbi->options; + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + clear_bit_le(b, sbi->vol_amap[i]->b_data); + exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode)); + + if (opts->discard) { + int ret_discard; + + ret_discard = sb_issue_discard(sb, + exfat_cluster_to_sector(sbi, clu + + EXFAT_RESERVED_CLUSTERS), + (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); + + if (ret_discard == -EOPNOTSUPP) { + exfat_msg(sb, KERN_ERR, + "discard not supported by device, disabling"); + opts->discard = 0; + } + } +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. 
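[Editor's illustration, not part of the patch: the bitmap addressing performed by exfat_set_bitmap()/exfat_clear_bitmap() through CLUSTER_TO_BITMAP_ENT and the BITMAP_OFFSET_* helpers (defined further down in exfat_fs.h). The stand-alone helper and the concrete numbers, which assume a 512-byte block size, are illustrative only.]

/* Locate the allocation-bitmap bit for cluster `clu`. */
static void bitmap_pos(unsigned int clu, unsigned int blocksize_bits,
		       unsigned int *sector_idx, unsigned int *bit_in_sector)
{
	unsigned int ent = clu - 2;			/* CLUSTER_TO_BITMAP_ENT */

	*sector_idx = (ent / 8) >> blocksize_bits;	/* which vol_amap[] bh */
	*bit_in_sector = ent & ((8U << blocksize_bits) - 1);
}

/*
 * Example, cluster 4711 with 512-byte blocks (blocksize_bits = 9):
 *   ent = 4709, sector_idx = 588 >> 9 = 1, bit_in_sector = 4709 & 4095 = 613,
 *   i.e. byte 76, bit 5 of vol_amap[1]->b_data.
 */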
+ */ +unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu) +{ + unsigned int i, map_i, map_b, ent_idx; + unsigned int clu_base, clu_free; + unsigned char k, clu_mask; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK)); + clu_mask = IGNORED_BITS_REMAINED(clu, clu_base); + + map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx); + + for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; + i += BITS_PER_BYTE) { + k = *(sbi->vol_amap[map_i]->b_data + map_b); + if (clu_mask > 0) { + k |= clu_mask; + clu_mask = 0; + } + if (k < 0xFF) { + clu_free = clu_base + free_bit[k]; + if (clu_free < sbi->num_clusters) + return clu_free; + } + clu_base += BITS_PER_BYTE; + + if (++map_b >= sb->s_blocksize || + clu_base >= sbi->num_clusters) { + if (++map_i >= sbi->map_sectors) { + clu_base = EXFAT_FIRST_CLUSTER; + map_i = 0; + } + map_b = 0; + } + } + + return EXFAT_EOF_CLUSTER; +} + +int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int count = 0; + unsigned int i, map_i = 0, map_b = 0; + unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi); + unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK; + unsigned char clu_bits; + const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011, + 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111}; + + total_clus &= ~last_mask; + for (i = 0; i < total_clus; i += BITS_PER_BYTE) { + clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); + count += used_bit[clu_bits]; + if (++map_b >= (unsigned int)sb->s_blocksize) { + map_i++; + map_b = 0; + } + } + + if (last_mask) { + clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); + clu_bits &= last_bit_mask[last_mask]; + count += used_bit[clu_bits]; + } + + *ret_count = count; + return 0; +} diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c new file mode 100644 index 000000000000..03d0824fc368 --- /dev/null +++ b/fs/exfat/cache.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * linux/fs/fat/cache.c + * + * Written 1992,1993 by Werner Almesberger + * + * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead + * of inode number. + * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <asm/unaligned.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +#define EXFAT_CACHE_VALID 0 +#define EXFAT_MAX_CACHE 16 + +struct exfat_cache { + struct list_head cache_list; + unsigned int nr_contig; /* number of contiguous clusters */ + unsigned int fcluster; /* cluster number in the file. */ + unsigned int dcluster; /* cluster number on disk. 
*/ +}; + +struct exfat_cache_id { + unsigned int id; + unsigned int nr_contig; + unsigned int fcluster; + unsigned int dcluster; +}; + +static struct kmem_cache *exfat_cachep; + +static void exfat_cache_init_once(void *c) +{ + struct exfat_cache *cache = (struct exfat_cache *)c; + + INIT_LIST_HEAD(&cache->cache_list); +} + +int exfat_cache_init(void) +{ + exfat_cachep = kmem_cache_create("exfat_cache", + sizeof(struct exfat_cache), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + exfat_cache_init_once); + if (!exfat_cachep) + return -ENOMEM; + return 0; +} + +void exfat_cache_shutdown(void) +{ + if (!exfat_cachep) + return; + kmem_cache_destroy(exfat_cachep); +} + +void exfat_cache_init_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = EXFAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); +} + +static inline struct exfat_cache *exfat_cache_alloc(void) +{ + return kmem_cache_alloc(exfat_cachep, GFP_NOFS); +} + +static inline void exfat_cache_free(struct exfat_cache *cache) +{ + WARN_ON(!list_empty(&cache->cache_list)); + kmem_cache_free(exfat_cachep, cache); +} + +static inline void exfat_cache_update_lru(struct inode *inode, + struct exfat_cache *cache) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + if (ei->cache_lru.next != &cache->cache_list) + list_move(&cache->cache_list, &ei->cache_lru); +} + +static unsigned int exfat_cache_lookup(struct inode *inode, + unsigned int fclus, struct exfat_cache_id *cid, + unsigned int *cached_fclus, unsigned int *cached_dclus) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + static struct exfat_cache nohit = { .fcluster = 0, }; + struct exfat_cache *hit = &nohit, *p; + unsigned int offset = EXFAT_EOF_CLUSTER; + + spin_lock(&ei->cache_lru_lock); + list_for_each_entry(p, &ei->cache_lru, cache_list) { + /* Find the cache of "fclus" or nearest cache. */ + if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { + hit = p; + if (hit->fcluster + hit->nr_contig < fclus) { + offset = hit->nr_contig; + } else { + offset = fclus - hit->fcluster; + break; + } + } + } + if (hit != &nohit) { + exfat_cache_update_lru(inode, hit); + + cid->id = ei->cache_valid_id; + cid->nr_contig = hit->nr_contig; + cid->fcluster = hit->fcluster; + cid->dcluster = hit->dcluster; + *cached_fclus = cid->fcluster + offset; + *cached_dclus = cid->dcluster + offset; + } + spin_unlock(&ei->cache_lru_lock); + + return offset; +} + +static struct exfat_cache *exfat_cache_merge(struct inode *inode, + struct exfat_cache_id *new) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *p; + + list_for_each_entry(p, &ei->cache_lru, cache_list) { + /* Find the same part as "new" in cluster-chain. 
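[Editor's illustration, not part of the patch: what exfat_cache_lookup() hands back. A cache entry records that file-relative cluster fcluster maps to disk cluster dcluster and that the following nr_contig clusters are contiguous, so a hit can be extrapolated without touching the FAT. The numbers below are hypothetical.]

/*
 * Cached entry: fcluster = 8, dcluster = 1000, nr_contig = 5
 * (file clusters 8..13 -> disk clusters 1000..1005)
 *
 * Lookup of file cluster 11:
 *   offset = 11 - 8 = 3, so *cached_fclus = 11, *cached_dclus = 1003
 *   and exfat_get_cluster() needs no FAT reads at all.
 *
 * Lookup of file cluster 20:
 *   the entry is still the nearest one at or below 20, but 8 + 5 < 20,
 *   so offset = nr_contig = 5: the walk resumes at file cluster 13 /
 *   disk cluster 1005 and steps through the FAT for the remainder.
 */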
*/ + if (p->fcluster == new->fcluster) { + if (new->nr_contig > p->nr_contig) + p->nr_contig = new->nr_contig; + return p; + } + } + return NULL; +} + +static void exfat_cache_add(struct inode *inode, + struct exfat_cache_id *new) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *cache, *tmp; + + if (new->fcluster == EXFAT_EOF_CLUSTER) /* dummy cache */ + return; + + spin_lock(&ei->cache_lru_lock); + if (new->id != EXFAT_CACHE_VALID && + new->id != ei->cache_valid_id) + goto unlock; /* this cache was invalidated */ + + cache = exfat_cache_merge(inode, new); + if (cache == NULL) { + if (ei->nr_caches < EXFAT_MAX_CACHE) { + ei->nr_caches++; + spin_unlock(&ei->cache_lru_lock); + + tmp = exfat_cache_alloc(); + if (!tmp) { + spin_lock(&ei->cache_lru_lock); + ei->nr_caches--; + spin_unlock(&ei->cache_lru_lock); + return; + } + + spin_lock(&ei->cache_lru_lock); + cache = exfat_cache_merge(inode, new); + if (cache != NULL) { + ei->nr_caches--; + exfat_cache_free(tmp); + goto out_update_lru; + } + cache = tmp; + } else { + struct list_head *p = ei->cache_lru.prev; + + cache = list_entry(p, + struct exfat_cache, cache_list); + } + cache->fcluster = new->fcluster; + cache->dcluster = new->dcluster; + cache->nr_contig = new->nr_contig; + } +out_update_lru: + exfat_cache_update_lru(inode, cache); +unlock: + spin_unlock(&ei->cache_lru_lock); +} + +/* + * Cache invalidation occurs rarely, thus the LRU chain is not updated. It + * fixes itself after a while. + */ +static void __exfat_cache_inval_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *cache; + + while (!list_empty(&ei->cache_lru)) { + cache = list_entry(ei->cache_lru.next, + struct exfat_cache, cache_list); + list_del_init(&cache->cache_list); + ei->nr_caches--; + exfat_cache_free(cache); + } + /* Update. The copy of caches before this id is discarded. 
*/ + ei->cache_valid_id++; + if (ei->cache_valid_id == EXFAT_CACHE_VALID) + ei->cache_valid_id++; +} + +void exfat_cache_inval_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + spin_lock(&ei->cache_lru_lock); + __exfat_cache_inval_inode(inode); + spin_unlock(&ei->cache_lru_lock); +} + +static inline int cache_contiguous(struct exfat_cache_id *cid, + unsigned int dclus) +{ + cid->nr_contig++; + return cid->dcluster + cid->nr_contig == dclus; +} + +static inline void cache_init(struct exfat_cache_id *cid, + unsigned int fclus, unsigned int dclus) +{ + cid->id = EXFAT_CACHE_VALID; + cid->fcluster = fclus; + cid->dcluster = dclus; + cid->nr_contig = 0; +} + +int exfat_get_cluster(struct inode *inode, unsigned int cluster, + unsigned int *fclus, unsigned int *dclus, + unsigned int *last_dclus, int allow_eof) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int limit = sbi->num_clusters; + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache_id cid; + unsigned int content; + + if (ei->start_clu == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "invalid access to exfat cache (entry 0x%08x)", + ei->start_clu); + return -EIO; + } + + *fclus = 0; + *dclus = ei->start_clu; + *last_dclus = *dclus; + + /* + * Don`t use exfat_cache if zero offset or non-cluster allocation + */ + if (cluster == 0 || *dclus == EXFAT_EOF_CLUSTER) + return 0; + + cache_init(&cid, EXFAT_EOF_CLUSTER, EXFAT_EOF_CLUSTER); + + if (exfat_cache_lookup(inode, cluster, &cid, fclus, dclus) == + EXFAT_EOF_CLUSTER) { + /* + * dummy, always not contiguous + * This is reinitialized by cache_init(), later. + */ + WARN_ON(cid.id != EXFAT_CACHE_VALID || + cid.fcluster != EXFAT_EOF_CLUSTER || + cid.dcluster != EXFAT_EOF_CLUSTER || + cid.nr_contig != 0); + } + + if (*fclus == cluster) + return 0; + + while (*fclus < cluster) { + /* prevent the infinite loop of cluster chain */ + if (*fclus > limit) { + exfat_fs_error(sb, + "detected the cluster chain loop (i_pos %u)", + (*fclus)); + return -EIO; + } + + if (exfat_ent_get(sb, *dclus, &content)) + return -EIO; + + *last_dclus = *dclus; + *dclus = content; + (*fclus)++; + + if (content == EXFAT_EOF_CLUSTER) { + if (!allow_eof) { + exfat_fs_error(sb, + "invalid cluster chain (i_pos %u, last_clus 0x%08x is EOF)", + *fclus, (*last_dclus)); + return -EIO; + } + + break; + } + + if (!cache_contiguous(&cid, *dclus)) + cache_init(&cid, *fclus, *dclus); + } + + exfat_cache_add(inode, &cid); + return 0; +} diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c new file mode 100644 index 000000000000..4b91afb0f051 --- /dev/null +++ b/fs/exfat/dir.c @@ -0,0 +1,1238 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
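[Editor's illustration, not part of the patch: a minimal sketch of how a caller — for instance the block-mapping path in inode.c, which is outside this hunk — might use exfat_get_cluster() to translate a file-relative cluster index into an on-disk cluster. The wrapper name and error choices are invented.]

/* Map file-relative cluster index `n` of `inode` to a disk cluster. */
static int example_map_cluster(struct inode *inode, unsigned int n,
			       unsigned int *disk_clu)
{
	unsigned int fclus, dclus, last_dclus;
	int err;

	err = exfat_get_cluster(inode, n, &fclus, &dclus, &last_dclus, 1);
	if (err)
		return err;		/* -EIO on a corrupted chain */

	if (dclus == EXFAT_EOF_CLUSTER)
		return -ENOENT;		/* chain ended before cluster n */

	*disk_clu = dclus;	/* sector = exfat_cluster_to_sector(sbi, dclus) */
	return 0;
}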
+ */ + +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_extract_uni_name(struct exfat_dentry *ep, + unsigned short *uniname) +{ + int i, len = 0; + + for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { + *uniname = le16_to_cpu(ep->dentry.name.unicode_0_14[i]); + if (*uniname == 0x0) + return len; + uniname++; + len++; + } + + *uniname = 0x0; + return len; + +} + +static void exfat_get_uniname_from_ext_entry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned short *uniname) +{ + int i; + struct exfat_dentry *ep; + struct exfat_entry_set_cache *es; + + es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES, &ep); + if (!es) + return; + + if (es->num_entries < 3) + goto free_es; + + ep += 2; + + /* + * First entry : file entry + * Second entry : stream-extension entry + * Third entry : first file-name entry + * So, the index of first file-name dentry should start from 2. + */ + for (i = 2; i < es->num_entries; i++, ep++) { + /* end of name entry */ + if (exfat_get_entry_type(ep) != TYPE_EXTEND) + goto free_es; + + exfat_extract_uni_name(ep, uniname); + uniname += EXFAT_FILE_NAME_LEN; + } + +free_es: + kfree(es); +} + +/* read a directory entry from the opened directory */ +static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) +{ + int i, dentries_per_clu, dentries_per_clu_bits = 0; + unsigned int type, clu_offset; + sector_t sector; + struct exfat_chain dir, clu; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned int dentry = ei->rwoffset & 0xFFFFFFFF; + struct buffer_head *bh; + + /* check if the given file ID is opened */ + if (ei->type != TYPE_DIR) + return -EPERM; + + if (ei->entry == -1) + exfat_chain_set(&dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + else + exfat_chain_set(&dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + + dentries_per_clu = sbi->dentries_per_clu; + dentries_per_clu_bits = ilog2(dentries_per_clu); + + clu_offset = dentry >> dentries_per_clu_bits; + exfat_chain_dup(&clu, &dir); + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + clu.dir += clu_offset; + clu.size -= clu_offset; + } else { + /* hint_information */ + if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && + ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { + clu_offset -= ei->hint_bmap.off; + clu.dir = ei->hint_bmap.clu; + } + + while (clu_offset > 0) { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + + clu_offset--; + } + } + + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + + for ( ; i < dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, §or); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) { + brelse(bh); + break; + } + + if (type != TYPE_FILE && type != TYPE_DIR) { + brelse(bh); + continue; + } + + dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); + exfat_get_entry_time(sbi, &dir_entry->crtime, + ep->dentry.file.create_tz, + ep->dentry.file.create_time, + ep->dentry.file.create_date, + ep->dentry.file.create_time_ms); + exfat_get_entry_time(sbi, &dir_entry->mtime, + ep->dentry.file.modify_tz, + ep->dentry.file.modify_time, + ep->dentry.file.modify_date, + ep->dentry.file.modify_time_ms); + exfat_get_entry_time(sbi, &dir_entry->atime, + 
ep->dentry.file.access_tz, + ep->dentry.file.access_time, + ep->dentry.file.access_date, + 0); + + *uni_name.name = 0x0; + exfat_get_uniname_from_ext_entry(sb, &dir, dentry, + uni_name.name); + exfat_utf16_to_nls(sb, &uni_name, + dir_entry->namebuf.lfn, + dir_entry->namebuf.lfnbuf_len); + brelse(bh); + + ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL); + if (!ep) + return -EIO; + dir_entry->size = + le64_to_cpu(ep->dentry.stream.valid_size); + brelse(bh); + + ei->hint_bmap.off = dentry >> dentries_per_clu_bits; + ei->hint_bmap.clu = clu.dir; + + ei->rwoffset = ++dentry; + return 0; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + } + + dir_entry->namebuf.lfn[0] = '\0'; + ei->rwoffset = dentry; + return 0; +} + +static void exfat_init_namebuf(struct exfat_dentry_namebuf *nb) +{ + nb->lfn = NULL; + nb->lfnbuf_len = 0; +} + +static int exfat_alloc_namebuf(struct exfat_dentry_namebuf *nb) +{ + nb->lfn = __getname(); + if (!nb->lfn) + return -ENOMEM; + nb->lfnbuf_len = MAX_VFSNAME_BUF_SIZE; + return 0; +} + +static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) +{ + if (!nb->lfn) + return; + + __putname(nb->lfn); + exfat_init_namebuf(nb); +} + +/* skip iterating emit_dots when dir is empty */ +#define ITER_POS_FILLED_DOTS (2) +static int exfat_iterate(struct file *filp, struct dir_context *ctx) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct inode *tmp; + struct exfat_dir_entry de; + struct exfat_dentry_namebuf *nb = &(de.namebuf); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned long inum; + loff_t cpos, i_pos; + int err = 0, fake_offset = 0; + + exfat_init_namebuf(nb); + mutex_lock(&EXFAT_SB(sb)->s_lock); + + cpos = ctx->pos; + if (!dir_emit_dots(filp, ctx)) + goto unlock; + + if (ctx->pos == ITER_POS_FILLED_DOTS) { + cpos = 0; + fake_offset = 1; + } + + if (cpos & (DENTRY_SIZE - 1)) { + err = -ENOENT; + goto unlock; + } + + /* name buffer should be allocated before use */ + err = exfat_alloc_namebuf(nb); + if (err) + goto unlock; +get_new: + ei->rwoffset = EXFAT_B_TO_DEN(cpos); + + if (cpos >= i_size_read(inode)) + goto end_of_dir; + + err = exfat_readdir(inode, &de); + if (err) { + /* + * At least we tried to read a sector. Move cpos to next sector + * position (should be aligned). + */ + if (err == -EIO) { + cpos += 1 << (sb->s_blocksize_bits); + cpos &= ~(sb->s_blocksize - 1); + } + + err = -EIO; + goto end_of_dir; + } + + cpos = EXFAT_DEN_TO_B(ei->rwoffset); + + if (!nb->lfn[0]) + goto end_of_dir; + + i_pos = ((loff_t)ei->start_clu << 32) | + ((ei->rwoffset - 1) & 0xffffffff); + tmp = exfat_iget(sb, i_pos); + if (tmp) { + inum = tmp->i_ino; + iput(tmp); + } else { + inum = iunique(sb, EXFAT_ROOT_INO); + } + + /* + * Before calling dir_emit(), sb_lock should be released. + * Because page fault can occur in dir_emit() when the size + * of buffer given from user is larger than one page size. + */ + mutex_unlock(&EXFAT_SB(sb)->s_lock); + if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, + (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG)) + goto out_unlocked; + mutex_lock(&EXFAT_SB(sb)->s_lock); + ctx->pos = cpos; + goto get_new; + +end_of_dir: + if (!cpos && fake_offset) + cpos = ITER_POS_FILLED_DOTS; + ctx->pos = cpos; +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); +out_unlocked: + /* + * To improve performance, free namebuf after unlock sb_lock. 
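[Editor's illustration, not part of the patch: the unit conversions behind exfat_iterate() and exfat_readdir(). ctx->pos/cpos are byte offsets into the directory, ei->rwoffset is a dentry index, and a dentry is DENTRY_SIZE = 32 bytes (DENTRY_SIZE_BITS = 5, both defined in exfat_raw.h further down). The offsets used are hypothetical.]

/*
 * cpos = 0x860 (byte offset)  ->  EXFAT_B_TO_DEN(cpos) = 0x860 >> 5 = 67
 * exfat_readdir() emits dentry 67 and advances rwoffset to 68,
 * so the next cpos is EXFAT_DEN_TO_B(68) = 68 << 5 = 0x880.
 *
 * The position handed to exfat_iget()/iunique() is
 *   i_pos = (start_clu of the directory << 32) | (rwoffset - 1)
 * i.e. the directory's first cluster in the high 32 bits and the index
 * of the dentry just emitted in the low 32 bits.
 */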
+ * If namebuf is not allocated, this function do nothing + */ + exfat_free_namebuf(nb); + return err; +} + +const struct file_operations exfat_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = exfat_iterate, + .fsync = generic_file_fsync, +}; + +int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) +{ + int ret; + + exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN); + + ret = exfat_alloc_cluster(inode, 1, clu); + if (ret) + return ret; + + return exfat_zeroed_cluster(inode, clu->dir); +} + +int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) +{ + int len; + + len = p_uniname->name_len; + if (len == 0) + return -EINVAL; + + /* 1 file entry + 1 stream entry + name entries */ + return ((len - 1) / EXFAT_FILE_NAME_LEN + 3); +} + +unsigned int exfat_get_entry_type(struct exfat_dentry *ep) +{ + if (ep->type == EXFAT_UNUSED) + return TYPE_UNUSED; + if (IS_EXFAT_DELETED(ep->type)) + return TYPE_DELETED; + if (ep->type == EXFAT_INVAL) + return TYPE_INVALID; + if (IS_EXFAT_CRITICAL_PRI(ep->type)) { + if (ep->type == EXFAT_BITMAP) + return TYPE_BITMAP; + if (ep->type == EXFAT_UPCASE) + return TYPE_UPCASE; + if (ep->type == EXFAT_VOLUME) + return TYPE_VOLUME; + if (ep->type == EXFAT_FILE) { + if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR) + return TYPE_DIR; + return TYPE_FILE; + } + return TYPE_CRITICAL_PRI; + } + if (IS_EXFAT_BENIGN_PRI(ep->type)) { + if (ep->type == EXFAT_GUID) + return TYPE_GUID; + if (ep->type == EXFAT_PADDING) + return TYPE_PADDING; + if (ep->type == EXFAT_ACLTAB) + return TYPE_ACLTAB; + return TYPE_BENIGN_PRI; + } + if (IS_EXFAT_CRITICAL_SEC(ep->type)) { + if (ep->type == EXFAT_STREAM) + return TYPE_STREAM; + if (ep->type == EXFAT_NAME) + return TYPE_EXTEND; + if (ep->type == EXFAT_ACL) + return TYPE_ACL; + return TYPE_CRITICAL_SEC; + } + return TYPE_BENIGN_SEC; +} + +static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) +{ + if (type == TYPE_UNUSED) { + ep->type = EXFAT_UNUSED; + } else if (type == TYPE_DELETED) { + ep->type &= EXFAT_DELETE; + } else if (type == TYPE_STREAM) { + ep->type = EXFAT_STREAM; + } else if (type == TYPE_EXTEND) { + ep->type = EXFAT_NAME; + } else if (type == TYPE_BITMAP) { + ep->type = EXFAT_BITMAP; + } else if (type == TYPE_UPCASE) { + ep->type = EXFAT_UPCASE; + } else if (type == TYPE_VOLUME) { + ep->type = EXFAT_VOLUME; + } else if (type == TYPE_DIR) { + ep->type = EXFAT_FILE; + ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR); + } else if (type == TYPE_FILE) { + ep->type = EXFAT_FILE; + ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE); + } +} + +static void exfat_init_stream_entry(struct exfat_dentry *ep, + unsigned char flags, unsigned int start_clu, + unsigned long long size) +{ + exfat_set_entry_type(ep, TYPE_STREAM); + ep->dentry.stream.flags = flags; + ep->dentry.stream.start_clu = cpu_to_le32(start_clu); + ep->dentry.stream.valid_size = cpu_to_le64(size); + ep->dentry.stream.size = cpu_to_le64(size); +} + +static void exfat_init_name_entry(struct exfat_dentry *ep, + unsigned short *uniname) +{ + int i; + + exfat_set_entry_type(ep, TYPE_EXTEND); + ep->dentry.name.flags = 0x0; + + for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { + ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); + if (*uniname == 0x0) + break; + uniname++; + } +} + +int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, unsigned int type, unsigned int start_clu, + unsigned long long size) +{ + struct super_block *sb = inode->i_sb; + 
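[Editor's illustration, not part of the patch: what exfat_calc_num_entries() above returns. Every file or directory needs one file entry, one stream-extension entry, and one name entry per 15 UTF-16 code units (EXFAT_FILE_NAME_LEN) of its name.]

/*
 * name_len   (len - 1) / 15 + 3   layout
 *      1             3            file + stream + 1 name entry
 *     15             3            file + stream + 1 name entry
 *     16             4            file + stream + 2 name entries
 *     30             4            file + stream + 2 name entries
 *    255            19            file + stream + 17 name entries
 */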
struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct timespec64 ts = current_time(inode); + sector_t sector; + struct exfat_dentry *ep; + struct buffer_head *bh; + + /* + * We cannot use exfat_get_dentry_set here because file ep is not + * initialized yet. + */ + ep = exfat_get_dentry(sb, p_dir, entry, &bh, §or); + if (!ep) + return -EIO; + + exfat_set_entry_type(ep, type); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.create_tz, + &ep->dentry.file.create_time, + &ep->dentry.file.create_date, + &ep->dentry.file.create_time_ms); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); + + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, §or); + if (!ep) + return -EIO; + + exfat_init_stream_entry(ep, + (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN, + start_clu, size); + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + + return 0; +} + +int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, + int entry) +{ + struct super_block *sb = inode->i_sb; + int ret = 0; + int i, num_entries; + sector_t sector; + unsigned short chksum; + struct exfat_dentry *ep, *fep; + struct buffer_head *fbh, *bh; + + fep = exfat_get_dentry(sb, p_dir, entry, &fbh, §or); + if (!fep) + return -EIO; + + num_entries = fep->dentry.file.num_ext + 1; + chksum = exfat_calc_chksum_2byte(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); + + for (i = 1; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL); + if (!ep) { + ret = -EIO; + goto release_fbh; + } + chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum, + CS_DEFAULT); + brelse(bh); + } + + fep->dentry.file.checksum = cpu_to_le16(chksum); + exfat_update_bh(sb, fbh, IS_DIRSYNC(inode)); +release_fbh: + brelse(fbh); + return ret; +} + +int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, int num_entries, struct exfat_uni_name *p_uniname) +{ + struct super_block *sb = inode->i_sb; + int i; + sector_t sector; + unsigned short *uniname = p_uniname->name; + struct exfat_dentry *ep; + struct buffer_head *bh; + int sync = IS_DIRSYNC(inode); + + ep = exfat_get_dentry(sb, p_dir, entry, &bh, §or); + if (!ep) + return -EIO; + + ep->dentry.file.num_ext = (unsigned char)(num_entries - 1); + exfat_update_bh(sb, bh, sync); + brelse(bh); + + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, §or); + if (!ep) + return -EIO; + + ep->dentry.stream.name_len = p_uniname->name_len; + ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash); + exfat_update_bh(sb, bh, sync); + brelse(bh); + + for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, §or); + if (!ep) + return -EIO; + + exfat_init_name_entry(ep, uniname); + exfat_update_bh(sb, bh, sync); + brelse(bh); + uniname += EXFAT_FILE_NAME_LEN; + } + + exfat_update_dir_chksum(inode, p_dir, entry); + return 0; +} + +int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, + int entry, int order, int num_entries) +{ + struct super_block *sb = inode->i_sb; + int i; + sector_t sector; + struct exfat_dentry *ep; + struct buffer_head *bh; + + for (i = order; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, §or); + if (!ep) + return -EIO; + + 
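[Editor's illustration, not part of the patch: exfat_update_dir_chksum() above (and the entry-set variant that follows) recompute the 16-bit checksum stored in the file entry. The body of exfat_calc_chksum_2byte() is in misc.c, outside this hunk; the sketch below follows the exFAT specification's EntrySetChecksum (rotate right by one and add, skipping the checksum field of the first entry), which is what the CS_DIR_ENTRY/CS_DEFAULT calls are expected to compute.]

/* exFAT EntrySetChecksum over (num_ext + 1) consecutive 32-byte dentries. */
static unsigned short entry_set_checksum(const unsigned char *entries,
					 int num_entries)
{
	unsigned short chksum = 0;
	int i, nbytes = num_entries * 32;	/* DENTRY_SIZE */

	for (i = 0; i < nbytes; i++) {
		if (i == 2 || i == 3)
			continue;	/* the SetChecksum field itself */
		chksum = ((chksum << 15) | (chksum >> 1)) + entries[i];
	}
	return chksum;
}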
exfat_set_entry_type(ep, TYPE_DELETED); + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + } + + return 0; +} + +int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, + struct exfat_entry_set_cache *es, int sync) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + sector_t sec = es->sector; + unsigned int off = es->offset; + int chksum_type = CS_DIR_ENTRY, i, num_entries = es->num_entries; + unsigned int buf_off = (off - es->offset); + unsigned int remaining_byte_in_sector, copy_entries, clu; + unsigned short chksum = 0; + + for (i = 0; i < num_entries; i++) { + chksum = exfat_calc_chksum_2byte(&es->entries[i], DENTRY_SIZE, + chksum, chksum_type); + chksum_type = CS_DEFAULT; + } + + es->entries[0].dentry.file.checksum = cpu_to_le16(chksum); + + while (num_entries) { + /* write per sector base */ + remaining_byte_in_sector = (1 << sb->s_blocksize_bits) - off; + copy_entries = min_t(int, + EXFAT_B_TO_DEN(remaining_byte_in_sector), + num_entries); + bh = sb_bread(sb, sec); + if (!bh) + goto err_out; + memcpy(bh->b_data + off, + (unsigned char *)&es->entries[0] + buf_off, + EXFAT_DEN_TO_B(copy_entries)); + exfat_update_bh(sb, bh, sync); + brelse(bh); + num_entries -= copy_entries; + + if (num_entries) { + /* get next sector */ + if (exfat_is_last_sector_in_cluster(sbi, sec)) { + clu = exfat_sector_to_cluster(sbi, sec); + if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) + clu++; + else if (exfat_get_next_cluster(sb, &clu)) + goto err_out; + sec = exfat_cluster_to_sector(sbi, clu); + } else { + sec++; + } + off = 0; + buf_off += EXFAT_DEN_TO_B(copy_entries); + } + } + + return 0; +err_out: + return -EIO; +} + +static int exfat_walk_fat_chain(struct super_block *sb, + struct exfat_chain *p_dir, unsigned int byte_offset, + unsigned int *clu) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int clu_offset; + unsigned int cur_clu; + + clu_offset = EXFAT_B_TO_CLU(byte_offset, sbi); + cur_clu = p_dir->dir; + + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) { + cur_clu += clu_offset; + } else { + while (clu_offset > 0) { + if (exfat_get_next_cluster(sb, &cur_clu)) + return -EIO; + if (cur_clu == EXFAT_EOF_CLUSTER) { + exfat_fs_error(sb, + "invalid dentry access beyond EOF (clu : %u, eidx : %d)", + p_dir->dir, + EXFAT_B_TO_DEN(byte_offset)); + return -EIO; + } + clu_offset--; + } + } + + *clu = cur_clu; + return 0; +} + +int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset) +{ + int ret; + unsigned int off, clu = 0; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + off = EXFAT_DEN_TO_B(entry); + + ret = exfat_walk_fat_chain(sb, p_dir, off, &clu); + if (ret) + return ret; + + /* byte offset in cluster */ + off = EXFAT_CLU_OFFSET(off, sbi); + + /* byte offset in sector */ + *offset = EXFAT_BLK_OFFSET(off, sb); + + /* sector offset in cluster */ + *sector = EXFAT_B_TO_BLK(off, sb); + *sector += exfat_cluster_to_sector(sbi, clu); + return 0; +} + +#define EXFAT_MAX_RA_SIZE (128*1024) +static int exfat_dir_readahead(struct super_block *sb, sector_t sec) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + unsigned int max_ra_count = EXFAT_MAX_RA_SIZE >> sb->s_blocksize_bits; + unsigned int page_ra_count = PAGE_SIZE >> sb->s_blocksize_bits; + unsigned int adj_ra_count = max(sbi->sect_per_clus, page_ra_count); + unsigned int ra_count = min(adj_ra_count, max_ra_count); + + /* Read-ahead is not required */ + if (sbi->sect_per_clus == 1) + return 0; + + if (sec < sbi->data_start_sector) { 
+ exfat_msg(sb, KERN_ERR, + "requested sector is invalid(sect:%llu, root:%llu)", + (unsigned long long)sec, sbi->data_start_sector); + return -EIO; + } + + /* Not sector aligned with ra_count, resize ra_count to page size */ + if ((sec - sbi->data_start_sector) & (ra_count - 1)) + ra_count = page_ra_count; + + bh = sb_find_get_block(sb, sec); + if (!bh || !buffer_uptodate(bh)) { + unsigned int i; + + for (i = 0; i < ra_count; i++) + sb_breadahead(sb, (sector_t)(sec + i)); + } + brelse(bh); + return 0; +} + +struct exfat_dentry *exfat_get_dentry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, struct buffer_head **bh, + sector_t *sector) +{ + unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); + int off; + sector_t sec; + + if (p_dir->dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry\n"); + return NULL; + } + + if (exfat_find_location(sb, p_dir, entry, &sec, &off)) + return NULL; + + if (p_dir->dir != EXFAT_FREE_CLUSTER && + !(entry & (dentries_per_page - 1))) + exfat_dir_readahead(sb, sec); + + *bh = sb_bread(sb, sec); + if (!*bh) + return NULL; + + if (sector) + *sector = sec; + return (struct exfat_dentry *)((*bh)->b_data + off); +} + +enum exfat_validate_dentry_mode { + ES_MODE_STARTED, + ES_MODE_GET_FILE_ENTRY, + ES_MODE_GET_STRM_ENTRY, + ES_MODE_GET_NAME_ENTRY, + ES_MODE_GET_CRITICAL_SEC_ENTRY, +}; + +static bool exfat_validate_entry(unsigned int type, + enum exfat_validate_dentry_mode *mode) +{ + if (type == TYPE_UNUSED || type == TYPE_DELETED) + return false; + + switch (*mode) { + case ES_MODE_STARTED: + if (type != TYPE_FILE && type != TYPE_DIR) + return false; + *mode = ES_MODE_GET_FILE_ENTRY; + return true; + case ES_MODE_GET_FILE_ENTRY: + if (type != TYPE_STREAM) + return false; + *mode = ES_MODE_GET_STRM_ENTRY; + return true; + case ES_MODE_GET_STRM_ENTRY: + if (type != TYPE_EXTEND) + return false; + *mode = ES_MODE_GET_NAME_ENTRY; + return true; + case ES_MODE_GET_NAME_ENTRY: + if (type == TYPE_STREAM) + return false; + if (type != TYPE_EXTEND) { + if (!(type & TYPE_CRITICAL_SEC)) + return false; + *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY; + } + return true; + case ES_MODE_GET_CRITICAL_SEC_ENTRY: + if (type == TYPE_EXTEND || type == TYPE_STREAM) + return false; + if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC) + return false; + return true; + default: + WARN_ON_ONCE(1); + return false; + } +} + +/* + * Returns a set of dentries for a file or dir. + * + * Note that this is a copy (dump) of dentries so that user should + * call write_entry_set() to apply changes made in this entry set + * to the real device. + * + * in: + * sb+p_dir+entry: indicates a file/dir + * type: specifies how many dentries should be included. + * out: + * file_ep: will point the first dentry(= file dentry) on success + * return: + * pointer of entry set on success, + * NULL on failure. 
+ */ +struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned int type, + struct exfat_dentry **file_ep) +{ + int ret; + unsigned int off, byte_offset, clu = 0; + unsigned int entry_type; + sector_t sec; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_entry_set_cache *es; + struct exfat_dentry *ep, *pos; + unsigned char num_entries; + enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; + struct buffer_head *bh; + + if (p_dir->dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "access to deleted dentry\n"); + return NULL; + } + + byte_offset = EXFAT_DEN_TO_B(entry); + ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); + if (ret) + return NULL; + + /* byte offset in cluster */ + byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); + + /* byte offset in sector */ + off = EXFAT_BLK_OFFSET(byte_offset, sb); + + /* sector offset in cluster */ + sec = EXFAT_B_TO_BLK(byte_offset, sb); + sec += exfat_cluster_to_sector(sbi, clu); + + bh = sb_bread(sb, sec); + if (!bh) + return NULL; + + ep = (struct exfat_dentry *)(bh->b_data + off); + entry_type = exfat_get_entry_type(ep); + + if (entry_type != TYPE_FILE && entry_type != TYPE_DIR) + goto release_bh; + + num_entries = type == ES_ALL_ENTRIES ? + ep->dentry.file.num_ext + 1 : type; + es = kmalloc(struct_size(es, entries, num_entries), GFP_KERNEL); + if (!es) + goto release_bh; + + es->num_entries = num_entries; + es->sector = sec; + es->offset = off; + es->alloc_flag = p_dir->flags; + + pos = &es->entries[0]; + + while (num_entries) { + if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) + goto free_es; + + /* copy dentry */ + memcpy(pos, ep, sizeof(struct exfat_dentry)); + + if (--num_entries == 0) + break; + + if (((off + DENTRY_SIZE) & (sb->s_blocksize - 1)) < + (off & (sb->s_blocksize - 1))) { + /* get the next sector */ + if (exfat_is_last_sector_in_cluster(sbi, sec)) { + if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) + clu++; + else if (exfat_get_next_cluster(sb, &clu)) + goto free_es; + sec = exfat_cluster_to_sector(sbi, clu); + } else { + sec++; + } + + brelse(bh); + bh = sb_bread(sb, sec); + if (!bh) + goto free_es; + off = 0; + ep = (struct exfat_dentry *)bh->b_data; + } else { + ep++; + off += DENTRY_SIZE; + } + pos++; + } + + if (file_ep) + *file_ep = &es->entries[0]; + brelse(bh); + return es; + +free_es: + kfree(es); +release_bh: + brelse(bh); + return NULL; +} + +enum { + DIRENT_STEP_FILE, + DIRENT_STEP_STRM, + DIRENT_STEP_NAME, + DIRENT_STEP_SECD, +}; + +/* + * return values: + * >= 0 : return dir entiry position with the name in dir + * -EEXIST : (root dir, ".") it is the root dir itself + * -ENOENT : entry with the name does not exist + * -EIO : I/O error + */ +int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int num_entries, unsigned int type) +{ + int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; + int order, step, name_len = 0; + int dentries_per_clu, num_empty = 0; + unsigned int entry_type; + unsigned short *uniname = NULL; + struct exfat_chain clu; + struct exfat_hint *hint_stat = &ei->hint_stat; + struct exfat_hint_femp candi_empty; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + dentries_per_clu = sbi->dentries_per_clu; + + exfat_chain_dup(&clu, p_dir); + + if (hint_stat->eidx) { + clu.dir = hint_stat->clu; + dentry = hint_stat->eidx; + end_eidx = dentry; + } + + candi_empty.eidx = EXFAT_HINT_NONE; +rewind: + order = 0; + step = 
DIRENT_STEP_FILE; + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + for (; i < dentries_per_clu; i++, dentry++) { + struct exfat_dentry *ep; + struct buffer_head *bh; + + if (rewind && dentry == end_eidx) + goto not_found; + + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + entry_type = exfat_get_entry_type(ep); + + if (entry_type == TYPE_UNUSED || + entry_type == TYPE_DELETED) { + step = DIRENT_STEP_FILE; + + num_empty++; + if (candi_empty.eidx == EXFAT_HINT_NONE && + num_empty == 1) { + exfat_chain_set(&candi_empty.cur, + clu.dir, clu.size, clu.flags); + } + + if (candi_empty.eidx == EXFAT_HINT_NONE && + num_empty >= num_entries) { + candi_empty.eidx = + dentry - (num_empty - 1); + WARN_ON(candi_empty.eidx < 0); + candi_empty.count = num_empty; + + if (ei->hint_femp.eidx == + EXFAT_HINT_NONE || + candi_empty.eidx <= + ei->hint_femp.eidx) { + memcpy(&ei->hint_femp, + &candi_empty, + sizeof(candi_empty)); + } + } + + brelse(bh); + if (entry_type == TYPE_UNUSED) + goto not_found; + continue; + } + + num_empty = 0; + candi_empty.eidx = EXFAT_HINT_NONE; + + if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { + step = DIRENT_STEP_FILE; + if (type == TYPE_ALL || type == entry_type) { + num_ext = ep->dentry.file.num_ext; + step = DIRENT_STEP_STRM; + } + brelse(bh); + continue; + } + + if (entry_type == TYPE_STREAM) { + unsigned short name_hash; + + if (step != DIRENT_STEP_STRM) { + step = DIRENT_STEP_FILE; + brelse(bh); + continue; + } + step = DIRENT_STEP_FILE; + name_hash = le16_to_cpu( + ep->dentry.stream.name_hash); + if (p_uniname->name_hash == name_hash && + p_uniname->name_len == + ep->dentry.stream.name_len) { + step = DIRENT_STEP_NAME; + order = 1; + name_len = 0; + } + brelse(bh); + continue; + } + + brelse(bh); + if (entry_type == TYPE_EXTEND) { + unsigned short entry_uniname[16], unichar; + + if (step != DIRENT_STEP_NAME) { + step = DIRENT_STEP_FILE; + continue; + } + + if (++order == 2) + uniname = p_uniname->name; + else + uniname += EXFAT_FILE_NAME_LEN; + + len = exfat_extract_uni_name(ep, entry_uniname); + name_len += len; + + unichar = *(uniname+len); + *(uniname+len) = 0x0; + + if (exfat_uniname_ncmp(sb, uniname, + entry_uniname, len)) { + step = DIRENT_STEP_FILE; + } else if (p_uniname->name_len == name_len) { + if (order == num_ext) + goto found; + step = DIRENT_STEP_SECD; + } + + *(uniname+len) = unichar; + continue; + } + + if (entry_type & + (TYPE_CRITICAL_SEC | TYPE_BENIGN_SEC)) { + if (step == DIRENT_STEP_SECD) { + if (++order == num_ext) + goto found; + continue; + } + } + step = DIRENT_STEP_FILE; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + } + +not_found: + /* + * We started at not 0 index,so we should try to find target + * from 0 index to the index we started at. 
+ */ + if (!rewind && end_eidx) { + rewind = 1; + dentry = 0; + clu.dir = p_dir->dir; + /* reset empty hint */ + num_empty = 0; + candi_empty.eidx = EXFAT_HINT_NONE; + goto rewind; + } + + /* initialized hint_stat */ + hint_stat->clu = p_dir->dir; + hint_stat->eidx = 0; + return -ENOENT; + +found: + /* next dentry we'll find is out of this cluster */ + if (!((dentry + 1) & (dentries_per_clu - 1))) { + int ret = 0; + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + ret = exfat_get_next_cluster(sb, &clu.dir); + } + + if (ret || clu.dir != EXFAT_EOF_CLUSTER) { + /* just initialized hint_stat */ + hint_stat->clu = p_dir->dir; + hint_stat->eidx = 0; + return (dentry - num_ext); + } + } + + hint_stat->clu = clu.dir; + hint_stat->eidx = dentry + 1; + return dentry - num_ext; +} + +int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, + int entry, struct exfat_dentry *ep) +{ + int i, count = 0; + unsigned int type; + struct exfat_dentry *ext_ep; + struct buffer_head *bh; + + for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) { + ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL); + if (!ext_ep) + return -EIO; + + type = exfat_get_entry_type(ext_ep); + brelse(bh); + if (type == TYPE_EXTEND || type == TYPE_STREAM) + count++; + else + break; + } + return count; +} + +int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) +{ + int i, count = 0; + int dentries_per_clu; + unsigned int entry_type; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + dentries_per_clu = sbi->dentries_per_clu; + + exfat_chain_dup(&clu, p_dir); + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < dentries_per_clu; i++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + entry_type = exfat_get_entry_type(ep); + brelse(bh); + + if (entry_type == TYPE_UNUSED) + return count; + if (entry_type != TYPE_DIR) + continue; + count++; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + } + + return count; +} diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h new file mode 100644 index 000000000000..67d4e46fb810 --- /dev/null +++ b/fs/exfat/exfat_fs.h @@ -0,0 +1,519 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#ifndef _EXFAT_FS_H +#define _EXFAT_FS_H + +#include <linux/fs.h> +#include <linux/ratelimit.h> +#include <linux/nls.h> + +#define EXFAT_SUPER_MAGIC 0x2011BAB0UL +#define EXFAT_ROOT_INO 1 + +#define EXFAT_SB_DIRTY 0 + +#define EXFAT_CLUSTERS_UNTRACKED (~0u) + +/* + * exfat error flags + */ +enum exfat_error_mode { + EXFAT_ERRORS_CONT, /* ignore error and continue */ + EXFAT_ERRORS_PANIC, /* panic on error */ + EXFAT_ERRORS_RO, /* remount r/o on error */ +}; + +/* + * exfat nls lossy flag + */ +enum { + NLS_NAME_NO_LOSSY, /* no lossy */ + NLS_NAME_LOSSY, /* just detected incorrect filename(s) */ + NLS_NAME_OVERLEN, /* the length is over than its limit */ +}; + +#define EXFAT_HASH_BITS 8 +#define EXFAT_HASH_SIZE (1UL << EXFAT_HASH_BITS) + +/* + * Type Definitions + */ +#define ES_2_ENTRIES 2 +#define ES_ALL_ENTRIES 0 + +#define DIR_DELETED 0xFFFF0321 + +/* type values */ +#define TYPE_UNUSED 0x0000 +#define TYPE_DELETED 0x0001 +#define TYPE_INVALID 0x0002 +#define TYPE_CRITICAL_PRI 0x0100 +#define TYPE_BITMAP 0x0101 +#define TYPE_UPCASE 0x0102 +#define TYPE_VOLUME 0x0103 +#define TYPE_DIR 0x0104 +#define TYPE_FILE 0x011F +#define TYPE_CRITICAL_SEC 0x0200 +#define TYPE_STREAM 0x0201 +#define TYPE_EXTEND 0x0202 +#define TYPE_ACL 0x0203 +#define TYPE_BENIGN_PRI 0x0400 +#define TYPE_GUID 0x0401 +#define TYPE_PADDING 0x0402 +#define TYPE_ACLTAB 0x0403 +#define TYPE_BENIGN_SEC 0x0800 +#define TYPE_ALL 0x0FFF + +#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ +#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ +#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) + +#define FAT_CACHE_SIZE 128 +#define FAT_CACHE_HASH_SIZE 64 +#define BUF_CACHE_SIZE 256 +#define BUF_CACHE_HASH_SIZE 64 + +#define EXFAT_HINT_NONE -1 +#define EXFAT_MIN_SUBDIR 2 + +/* + * helpers for cluster size to byte conversion. + */ +#define EXFAT_CLU_TO_B(b, sbi) ((b) << (sbi)->cluster_size_bits) +#define EXFAT_B_TO_CLU(b, sbi) ((b) >> (sbi)->cluster_size_bits) +#define EXFAT_B_TO_CLU_ROUND_UP(b, sbi) \ + (((b - 1) >> (sbi)->cluster_size_bits) + 1) +#define EXFAT_CLU_OFFSET(off, sbi) ((off) & ((sbi)->cluster_size - 1)) + +/* + * helpers for block size to byte conversion. + */ +#define EXFAT_BLK_TO_B(b, sb) ((b) << (sb)->s_blocksize_bits) +#define EXFAT_B_TO_BLK(b, sb) ((b) >> (sb)->s_blocksize_bits) +#define EXFAT_B_TO_BLK_ROUND_UP(b, sb) \ + (((b - 1) >> (sb)->s_blocksize_bits) + 1) +#define EXFAT_BLK_OFFSET(off, sb) ((off) & ((sb)->s_blocksize - 1)) + +/* + * helpers for block size to dentry size conversion. + */ +#define EXFAT_B_TO_DEN_IDX(b, sbi) \ + ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) +#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) +#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) + +/* + * helpers for fat entry. + */ +#define FAT_ENT_SIZE (4) +#define FAT_ENT_SIZE_BITS (2) +#define FAT_ENT_OFFSET_SECTOR(sb, loc) (EXFAT_SB(sb)->FAT1_start_sector + \ + (((u64)loc << FAT_ENT_SIZE_BITS) >> sb->s_blocksize_bits)) +#define FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc) \ + ((loc << FAT_ENT_SIZE_BITS) & (sb->s_blocksize - 1)) + +/* + * helpers for bitmap. 
+ */ +#define CLUSTER_TO_BITMAP_ENT(clu) ((clu) - EXFAT_RESERVED_CLUSTERS) +#define BITMAP_ENT_TO_CLUSTER(ent) ((ent) + EXFAT_RESERVED_CLUSTERS) +#define BITS_PER_SECTOR(sb) ((sb)->s_blocksize * BITS_PER_BYTE) +#define BITS_PER_SECTOR_MASK(sb) (BITS_PER_SECTOR(sb) - 1) +#define BITMAP_OFFSET_SECTOR_INDEX(sb, ent) \ + ((ent / BITS_PER_BYTE) >> (sb)->s_blocksize_bits) +#define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb)) +#define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \ + ((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1)) +#define BITS_PER_BYTE_MASK 0x7 +#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) + +struct exfat_dentry_namebuf { + char *lfn; + int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */ +}; + +/* unicode name structure */ +struct exfat_uni_name { + /* +3 for null and for converting */ + unsigned short name[MAX_NAME_LENGTH + 3]; + unsigned short name_hash; + unsigned char name_len; +}; + +/* directory structure */ +struct exfat_chain { + unsigned int dir; + unsigned int size; + unsigned char flags; +}; + +/* first empty entry hint information */ +struct exfat_hint_femp { + /* entry index of a directory */ + int eidx; + /* count of continuous empty entry */ + int count; + /* the cluster that first empty slot exists in */ + struct exfat_chain cur; +}; + +/* hint structure */ +struct exfat_hint { + unsigned int clu; + union { + unsigned int off; /* cluster offset */ + int eidx; /* entry index */ + }; +}; + +struct exfat_entry_set_cache { + /* sector number that contains file_entry */ + sector_t sector; + /* byte offset in the sector */ + unsigned int offset; + /* flag in stream entry. 01 for cluster chain, 03 for contig. */ + int alloc_flag; + unsigned int num_entries; + struct exfat_dentry entries[]; +}; + +struct exfat_dir_entry { + struct exfat_chain dir; + int entry; + unsigned int type; + unsigned int start_clu; + unsigned char flags; + unsigned short attr; + loff_t size; + unsigned int num_subdirs; + struct timespec64 atime; + struct timespec64 mtime; + struct timespec64 crtime; + struct exfat_dentry_namebuf namebuf; +}; + +/* + * exfat mount in-memory data + */ +struct exfat_mount_options { + kuid_t fs_uid; + kgid_t fs_gid; + unsigned short fs_fmask; + unsigned short fs_dmask; + /* permission for setting the [am]time */ + unsigned short allow_utime; + /* charset for filename input/display */ + char *iocharset; + /* on error: continue, panic, remount-ro */ + enum exfat_error_mode errors; + unsigned utf8:1, /* Use of UTF-8 character set */ + discard:1; /* Issue discard requests on deletions */ + int time_offset; /* Offset of timestamps from UTC (in minutes) */ +}; + +/* + * EXFAT file system superblock in-memory data + */ +struct exfat_sb_info { + unsigned long long num_sectors; /* num of sectors in volume */ + unsigned int num_clusters; /* num of clusters in volume */ + unsigned int cluster_size; /* cluster size in bytes */ + unsigned int cluster_size_bits; + unsigned int sect_per_clus; /* cluster size in sectors */ + unsigned int sect_per_clus_bits; + unsigned long long FAT1_start_sector; /* FAT1 start sector */ + unsigned long long FAT2_start_sector; /* FAT2 start sector */ + unsigned long long data_start_sector; /* data area start sector */ + unsigned int num_FAT_sectors; /* num of FAT sectors */ + unsigned int root_dir; /* root dir cluster */ + unsigned int dentries_per_clu; /* num of dentries per cluster */ + unsigned int vol_flag; /* volume dirty flag */ + struct buffer_head *pbr_bh; /* buffer_head of PBR sector */ + + 
unsigned int map_clu; /* allocation bitmap start cluster */ + unsigned int map_sectors; /* num of allocation bitmap sectors */ + struct buffer_head **vol_amap; /* allocation bitmap */ + + unsigned short *vol_utbl; /* upcase table */ + + unsigned int clu_srch_ptr; /* cluster search pointer */ + unsigned int used_clusters; /* number of used clusters */ + + unsigned long s_state; + struct mutex s_lock; /* superblock lock */ + struct exfat_mount_options options; + struct nls_table *nls_io; /* Charset used for input and display */ + struct ratelimit_state ratelimit; + + spinlock_t inode_hash_lock; + struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + + struct rcu_head rcu; +}; + +/* + * EXFAT file system inode in-memory data + */ +struct exfat_inode_info { + struct exfat_chain dir; + int entry; + unsigned int type; + unsigned short attr; + unsigned int start_clu; + unsigned char flags; + /* + * the copy of low 32bit of i_version to check + * the validation of hint_stat. + */ + unsigned int version; + /* file offset or dentry index for readdir */ + loff_t rwoffset; + + /* hint for cluster last accessed */ + struct exfat_hint hint_bmap; + /* hint for entry index we try to lookup next time */ + struct exfat_hint hint_stat; + /* hint for first empty entry */ + struct exfat_hint_femp hint_femp; + + spinlock_t cache_lru_lock; + struct list_head cache_lru; + int nr_caches; + /* for avoiding the race between alloc and free */ + unsigned int cache_valid_id; + + /* + * NOTE: i_size_ondisk is 64bits, so must hold ->inode_lock to access. + * physically allocated size. + */ + loff_t i_size_ondisk; + /* block-aligned i_size (used in cont_write_begin) */ + loff_t i_size_aligned; + /* on-disk position of directory entry or 0 */ + loff_t i_pos; + /* hash by i_location */ + struct hlist_node i_hash_fat; + /* protect bmap against truncate */ + struct rw_semaphore truncate_lock; + struct inode vfs_inode; + /* File creation time */ + struct timespec64 i_crtime; +}; + +static inline struct exfat_sb_info *EXFAT_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct exfat_inode_info *EXFAT_I(struct inode *inode) +{ + return container_of(inode, struct exfat_inode_info, vfs_inode); +} + +/* + * If ->i_mode can't hold 0222 (i.e. ATTR_RO), we use ->i_attrs to + * save ATTR_RO instead of ->i_mode. + * + * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only + * bit, it's just used as flag for app. + */ +static inline int exfat_mode_can_hold_ro(struct inode *inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + + if (S_ISDIR(inode->i_mode)) + return 0; + + if ((~sbi->options.fs_fmask) & 0222) + return 1; + return 0; +} + +/* Convert attribute bits and a mask to the UNIX mode. 
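[Editor's illustration, not part of the patch: how the fmask/dmask mount options combine with the exfat_make_mode()/exfat_make_attr() helpers defined just after this point. A worked example for a hypothetical mount with fmask=0022:]

/*
 * Regular file, ATTR_READONLY clear:
 *   exfat_make_mode(sbi, attr, 0777) = (0777 & ~0022) | S_IFREG  ->  0755
 *
 * Regular file, ATTR_READONLY set:
 *   0777 & ~0222 = 0555, then & ~0022  ->  0555 (write bits stripped)
 *
 * Directories: the ATTR_READONLY test is skipped for ATTR_SUBDIR entries,
 * and dmask is applied instead of fmask, with S_IFDIR in the result.
 */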
*/ +static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi, + unsigned short attr, mode_t mode) +{ + if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR)) + mode &= ~0222; + + if (attr & ATTR_SUBDIR) + return (mode & ~sbi->options.fs_dmask) | S_IFDIR; + + return (mode & ~sbi->options.fs_fmask) | S_IFREG; +} + +/* Return the FAT attribute byte for this inode */ +static inline unsigned short exfat_make_attr(struct inode *inode) +{ + unsigned short attr = EXFAT_I(inode)->attr; + + if (S_ISDIR(inode->i_mode)) + attr |= ATTR_SUBDIR; + if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222)) + attr |= ATTR_READONLY; + return attr; +} + +static inline void exfat_save_attr(struct inode *inode, unsigned short attr) +{ + if (exfat_mode_can_hold_ro(inode)) + EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY); + else + EXFAT_I(inode)->attr = attr & ATTR_RWMASK; +} + +static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi, + sector_t sec) +{ + return ((sec - sbi->data_start_sector + 1) & + ((1 << sbi->sect_per_clus_bits) - 1)) == 0; +} + +static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, + unsigned int clus) +{ + return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) + + sbi->data_start_sector; +} + +static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, + sector_t sec) +{ + return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + + EXFAT_RESERVED_CLUSTERS; +} + +/* super.c */ +int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag); + +/* fatent.c */ +#define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu) + +int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, + struct exfat_chain *p_chain); +int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain); +int exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content); +int exfat_ent_set(struct super_block *sb, unsigned int loc, + unsigned int content); +int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, + int entry, struct exfat_dentry *p_entry); +int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, + unsigned int len); +int exfat_zeroed_cluster(struct inode *dir, unsigned int clu); +int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, + unsigned int *ret_clu); +int exfat_count_num_clusters(struct super_block *sb, + struct exfat_chain *p_chain, unsigned int *ret_count); + +/* balloc.c */ +int exfat_load_bitmap(struct super_block *sb); +void exfat_free_bitmap(struct exfat_sb_info *sbi); +int exfat_set_bitmap(struct inode *inode, unsigned int clu); +void exfat_clear_bitmap(struct inode *inode, unsigned int clu); +unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); +int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); + +/* file.c */ +extern const struct file_operations exfat_file_operations; +int __exfat_truncate(struct inode *inode, loff_t new_size); +void exfat_truncate(struct inode *inode, loff_t size); +int exfat_setattr(struct dentry *dentry, struct iattr *attr); +int exfat_getattr(const struct path *path, struct kstat *stat, + unsigned int request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct dentry_operations exfat_dentry_ops; +extern const struct dentry_operations exfat_utf8_dentry_ops; + +/* cache.c */ +int exfat_cache_init(void); +void exfat_cache_shutdown(void); +void exfat_cache_init_inode(struct 
inode *inode); +void exfat_cache_inval_inode(struct inode *inode); +int exfat_get_cluster(struct inode *inode, unsigned int cluster, + unsigned int *fclus, unsigned int *dclus, + unsigned int *last_dclus, int allow_eof); + +/* dir.c */ +extern const struct inode_operations exfat_dir_inode_operations; +extern const struct file_operations exfat_dir_operations; +unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry); +int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, unsigned int type, unsigned int start_clu, + unsigned long long size); +int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, int num_entries, struct exfat_uni_name *p_uniname); +int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, + int entry, int order, int num_entries); +int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, + int entry); +int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, + struct exfat_entry_set_cache *es, int sync); +int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); +int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int num_entries, unsigned int type); +int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); +int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset); +struct exfat_dentry *exfat_get_dentry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, struct buffer_head **bh, + sector_t *sector); +struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned int type, + struct exfat_dentry **file_ep); +int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); + +/* inode.c */ +extern const struct inode_operations exfat_file_inode_operations; +void exfat_sync_inode(struct inode *inode); +struct inode *exfat_build_inode(struct super_block *sb, + struct exfat_dir_entry *info, loff_t i_pos); +void exfat_hash_inode(struct inode *inode, loff_t i_pos); +void exfat_unhash_inode(struct inode *inode); +struct inode *exfat_iget(struct super_block *sb, loff_t i_pos); +int exfat_write_inode(struct inode *inode, struct writeback_control *wbc); +void exfat_evict_inode(struct inode *inode); +int exfat_block_truncate_page(struct inode *inode, loff_t from); + +/* exfat/nls.c */ +unsigned short exfat_toupper(struct super_block *sb, unsigned short a); +int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, + unsigned short *b, unsigned int len); +int exfat_utf16_to_nls(struct super_block *sb, + struct exfat_uni_name *uniname, unsigned char *p_cstring, + int len); +int exfat_nls_to_utf16(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *uniname, int *p_lossy); +int exfat_create_upcase_table(struct super_block *sb); +void exfat_free_upcase_table(struct exfat_sb_info *sbi); +unsigned short exfat_high_surrogate(unicode_t u); +unsigned short exfat_low_surrogate(unicode_t u); + +/* exfat/misc.c */ +void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __printf(3, 4) __cold; +#define exfat_fs_error(sb, fmt, args...) \ + __exfat_fs_error(sb, 1, fmt, ## args) +#define exfat_fs_error_ratelimit(sb, fmt, args...) 
\ + __exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \ + fmt, ## args) +void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) + __printf(3, 4) __cold; +void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 tz, __le16 time, __le16 date, u8 time_ms); +void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 *tz, __le16 *time, __le16 *date, u8 *time_ms); +unsigned short exfat_calc_chksum_2byte(void *data, int len, + unsigned short chksum, int type); +void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync); +void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, + unsigned int size, unsigned char flags); +void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec); + +#endif /* !_EXFAT_FS_H */ diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h new file mode 100644 index 000000000000..2a841010e649 --- /dev/null +++ b/fs/exfat/exfat_raw.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#ifndef _EXFAT_RAW_H +#define _EXFAT_RAW_H + +#include <linux/types.h> + +#define PBR_SIGNATURE 0xAA55 + +#define EXFAT_MAX_FILE_LEN 255 + +#define VOL_CLEAN 0x0000 +#define VOL_DIRTY 0x0002 + +#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu +#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u +#define EXFAT_FREE_CLUSTER 0 +/* Cluster 0, 1 are reserved, the first cluster is 2 in the cluster heap. */ +#define EXFAT_RESERVED_CLUSTERS 2 +#define EXFAT_FIRST_CLUSTER 2 +#define EXFAT_DATA_CLUSTER_COUNT(sbi) \ + ((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS) + +/* AllocationPossible and NoFatChain field in GeneralSecondaryFlags Field */ +#define ALLOC_FAT_CHAIN 0x01 +#define ALLOC_NO_FAT_CHAIN 0x03 + +#define DENTRY_SIZE 32 /* directory entry size */ +#define DENTRY_SIZE_BITS 5 +/* exFAT allows 8388608(256MB) directory entries */ +#define MAX_EXFAT_DENTRIES 8388608 + +/* dentry types */ +#define EXFAT_UNUSED 0x00 /* end of directory */ +#define EXFAT_DELETE (~0x80) +#define IS_EXFAT_DELETED(x) ((x) < 0x80) /* deleted file (0x01~0x7F) */ +#define EXFAT_INVAL 0x80 /* invalid value */ +#define EXFAT_BITMAP 0x81 /* allocation bitmap */ +#define EXFAT_UPCASE 0x82 /* upcase table */ +#define EXFAT_VOLUME 0x83 /* volume label */ +#define EXFAT_FILE 0x85 /* file or dir */ +#define EXFAT_GUID 0xA0 +#define EXFAT_PADDING 0xA1 +#define EXFAT_ACLTAB 0xA2 +#define EXFAT_STREAM 0xC0 /* stream entry */ +#define EXFAT_NAME 0xC1 /* file name entry */ +#define EXFAT_ACL 0xC2 /* stream entry */ + +#define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0) +#define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0) +#define IS_EXFAT_CRITICAL_SEC(x) (x < 0xE0) + +/* checksum types */ +#define CS_DIR_ENTRY 0 +#define CS_PBR_SECTOR 1 +#define CS_DEFAULT 2 + +/* file attributes */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_SUBDIR 0x0010 +#define ATTR_ARCHIVE 0x0020 + +#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \ + ATTR_SUBDIR | ATTR_ARCHIVE) + +#define PBR64_JUMP_BOOT_LEN 3 +#define PBR64_OEM_NAME_LEN 8 +#define PBR64_RESERVED_LEN 53 + +#define EXFAT_FILE_NAME_LEN 15 + +/* EXFAT BIOS parameter block (64 bytes) */ +struct bpb64 { + __u8 jmp_boot[PBR64_JUMP_BOOT_LEN]; + __u8 oem_name[PBR64_OEM_NAME_LEN]; + __u8 res_zero[PBR64_RESERVED_LEN]; +} __packed; + +/* EXFAT EXTEND BIOS parameter block (56 bytes) */ +struct bsx64 { + __le64 vol_offset; + __le64 vol_length; + __le32 
fat_offset; + __le32 fat_length; + __le32 clu_offset; + __le32 clu_count; + __le32 root_cluster; + __le32 vol_serial; + __u8 fs_version[2]; + __le16 vol_flags; + __u8 sect_size_bits; + __u8 sect_per_clus_bits; + __u8 num_fats; + __u8 phy_drv_no; + __u8 perc_in_use; + __u8 reserved2[7]; +} __packed; + +/* EXFAT PBR[BPB+BSX] (120 bytes) */ +struct pbr64 { + struct bpb64 bpb; + struct bsx64 bsx; +} __packed; + +/* Common PBR[Partition Boot Record] (512 bytes) */ +struct pbr { + union { + __u8 raw[64]; + struct bpb64 f64; + } bpb; + union { + __u8 raw[56]; + struct bsx64 f64; + } bsx; + __u8 boot_code[390]; + __le16 signature; +} __packed; + +struct exfat_dentry { + __u8 type; + union { + struct { + __u8 num_ext; + __le16 checksum; + __le16 attr; + __le16 reserved1; + __le16 create_time; + __le16 create_date; + __le16 modify_time; + __le16 modify_date; + __le16 access_time; + __le16 access_date; + __u8 create_time_ms; + __u8 modify_time_ms; + __u8 create_tz; + __u8 modify_tz; + __u8 access_tz; + __u8 reserved2[7]; + } __packed file; /* file directory entry */ + struct { + __u8 flags; + __u8 reserved1; + __u8 name_len; + __le16 name_hash; + __le16 reserved2; + __le64 valid_size; + __le32 reserved3; + __le32 start_clu; + __le64 size; + } __packed stream; /* stream extension directory entry */ + struct { + __u8 flags; + __le16 unicode_0_14[EXFAT_FILE_NAME_LEN]; + } __packed name; /* file name directory entry */ + struct { + __u8 flags; + __u8 reserved[18]; + __le32 start_clu; + __le64 size; + } __packed bitmap; /* allocation bitmap directory entry */ + struct { + __u8 reserved1[3]; + __le32 checksum; + __u8 reserved2[12]; + __le32 start_clu; + __le64 size; + } __packed upcase; /* up-case table directory entry */ + } __packed dentry; +} __packed; + +#define EXFAT_TZ_VALID (1 << 7) + +/* Jan 1 GMT 00:00:00 1980 */ +#define EXFAT_MIN_TIMESTAMP_SECS 315532800LL +/* Dec 31 GMT 23:59:59 2107 */ +#define EXFAT_MAX_TIMESTAMP_SECS 4354819199LL + +#endif /* !_EXFAT_RAW_H */ diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c new file mode 100644 index 000000000000..a855b1769a96 --- /dev/null +++ b/fs/exfat/fatent.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/slab.h> +#include <asm/unaligned.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_mirror_bh(struct super_block *sb, sector_t sec, + struct buffer_head *bh) +{ + struct buffer_head *c_bh; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + sector_t sec2; + int err = 0; + + if (sbi->FAT2_start_sector != sbi->FAT1_start_sector) { + sec2 = sec - sbi->FAT1_start_sector + sbi->FAT2_start_sector; + c_bh = sb_getblk(sb, sec2); + if (!c_bh) + return -ENOMEM; + memcpy(c_bh->b_data, bh->b_data, sb->s_blocksize); + set_buffer_uptodate(c_bh); + mark_buffer_dirty(c_bh); + if (sb->s_flags & SB_SYNCHRONOUS) + err = sync_dirty_buffer(c_bh); + brelse(c_bh); + } + + return err; +} + +static int __exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content) +{ + unsigned int off; + sector_t sec; + struct buffer_head *bh; + + sec = FAT_ENT_OFFSET_SECTOR(sb, loc); + off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc); + + bh = sb_bread(sb, sec); + if (!bh) + return -EIO; + + *content = le32_to_cpu(*(__le32 *)(&bh->b_data[off])); + + /* remap reserved clusters to simplify code */ + if (*content > EXFAT_BAD_CLUSTER) + *content = EXFAT_EOF_CLUSTER; + + brelse(bh); + return 0; +} + +int exfat_ent_set(struct super_block *sb, unsigned int loc, + unsigned int content) +{ + unsigned int off; + sector_t sec; + __le32 *fat_entry; + struct buffer_head *bh; + + sec = FAT_ENT_OFFSET_SECTOR(sb, loc); + off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc); + + bh = sb_bread(sb, sec); + if (!bh) + return -EIO; + + fat_entry = (__le32 *)&(bh->b_data[off]); + *fat_entry = cpu_to_le32(content); + exfat_update_bh(sb, bh, sb->s_flags & SB_SYNCHRONOUS); + exfat_mirror_bh(sb, sec, bh); + brelse(bh); + return 0; +} + +static inline bool is_valid_cluster(struct exfat_sb_info *sbi, + unsigned int clus) +{ + if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus) + return false; + return true; +} + +int exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int err; + + if (!is_valid_cluster(sbi, loc)) { + exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", + loc); + return -EIO; + } + + err = __exfat_ent_get(sb, loc, content); + if (err) { + exfat_fs_error(sb, + "failed to access to FAT (entry 0x%08x, err:%d)", + loc, err); + return err; + } + + if (*content == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "invalid access to FAT free cluster (entry 0x%08x)", + loc); + return -EIO; + } + + if (*content == EXFAT_BAD_CLUSTER) { + exfat_fs_error(sb, + "invalid access to FAT bad cluster (entry 0x%08x)", + loc); + return -EIO; + } + + if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) { + exfat_fs_error(sb, + "invalid access to FAT (entry 0x%08x) bogus content (0x%08x)", + loc, *content); + return -EIO; + } + + return 0; +} + +int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, + unsigned int len) +{ + if (!len) + return 0; + + while (len > 1) { + if (exfat_ent_set(sb, chain, chain + 1)) + return -EIO; + chain++; + len--; + } + + if (exfat_ent_set(sb, chain, EXFAT_EOF_CLUSTER)) + return -EIO; + return 0; +} + +int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) +{ + unsigned int num_clusters = 0; + unsigned int clu; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + /* invalid cluster number */ + if (p_chain->dir == EXFAT_FREE_CLUSTER || + p_chain->dir == EXFAT_EOF_CLUSTER || + 
p_chain->dir < EXFAT_FIRST_CLUSTER) + return 0; + + /* no cluster to truncate */ + if (p_chain->size == 0) + return 0; + + /* check cluster validation */ + if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) { + exfat_msg(sb, KERN_ERR, "invalid start cluster (%u)", + p_chain->dir); + return -EIO; + } + + set_bit(EXFAT_SB_DIRTY, &sbi->s_state); + clu = p_chain->dir; + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + do { + exfat_clear_bitmap(inode, clu); + clu++; + + num_clusters++; + } while (num_clusters < p_chain->size); + } else { + do { + exfat_clear_bitmap(inode, clu); + + if (exfat_get_next_cluster(sb, &clu)) + goto dec_used_clus; + + num_clusters++; + } while (clu != EXFAT_EOF_CLUSTER); + } + +dec_used_clus: + sbi->used_clusters -= num_clusters; + return 0; +} + +int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, + unsigned int *ret_clu) +{ + unsigned int clu, next; + unsigned int count = 0; + + next = p_chain->dir; + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + *ret_clu = next + p_chain->size - 1; + return 0; + } + + do { + count++; + clu = next; + if (exfat_ent_get(sb, clu, &next)) + return -EIO; + } while (next != EXFAT_EOF_CLUSTER); + + if (p_chain->size != count) { + exfat_fs_error(sb, + "bogus directory size (clus : ondisk(%d) != counted(%d))", + p_chain->size, count); + return -EIO; + } + + *ret_clu = clu; + return 0; +} + +static inline int exfat_sync_bhs(struct buffer_head **bhs, int nr_bhs) +{ + int i, err = 0; + + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], 0); + + for (i = 0; i < nr_bhs; i++) { + wait_on_buffer(bhs[i]); + if (!err && !buffer_uptodate(bhs[i])) + err = -EIO; + } + return err; +} + +int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) +{ + struct super_block *sb = dir->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int nr_bhs = MAX_BUF_PER_PAGE; + sector_t blknr, last_blknr; + int err, i, n; + + blknr = exfat_cluster_to_sector(sbi, clu); + last_blknr = blknr + sbi->sect_per_clus; + + if (last_blknr > sbi->num_sectors && sbi->num_sectors > 0) { + exfat_fs_error_ratelimit(sb, + "%s: out of range(sect:%llu len:%u)", + __func__, (unsigned long long)blknr, + sbi->sect_per_clus); + return -EIO; + } + + /* Zeroing the unused blocks on this cluster */ + n = 0; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto release_bhs; + } + memset(bhs[n]->b_data, 0, sb->s_blocksize); + exfat_update_bh(sb, bhs[n], 0); + + n++; + blknr++; + + if (n == nr_bhs) { + if (IS_DIRSYNC(dir)) { + err = exfat_sync_bhs(bhs, n); + if (err) + goto release_bhs; + } + + for (i = 0; i < n; i++) + brelse(bhs[i]); + n = 0; + } + } + + if (IS_DIRSYNC(dir)) { + err = exfat_sync_bhs(bhs, n); + if (err) + goto release_bhs; + } + + for (i = 0; i < n; i++) + brelse(bhs[i]); + + return 0; + +release_bhs: + exfat_msg(sb, KERN_ERR, "failed zeroed sect %llu\n", + (unsigned long long)blknr); + for (i = 0; i < n; i++) + bforget(bhs[i]); + return err; +} + +int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, + struct exfat_chain *p_chain) +{ + int ret = -ENOSPC; + unsigned int num_clusters = 0, total_cnt; + unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + total_cnt = EXFAT_DATA_CLUSTER_COUNT(sbi); + + if (unlikely(total_cnt < sbi->used_clusters)) { + exfat_fs_error_ratelimit(sb, + "%s: invalid used clusters(t:%u,u:%u)\n", + 
__func__, total_cnt, sbi->used_clusters); + return -EIO; + } + + if (num_alloc > total_cnt - sbi->used_clusters) + return -ENOSPC; + + hint_clu = p_chain->dir; + /* find new cluster */ + if (hint_clu == EXFAT_EOF_CLUSTER) { + if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) { + exfat_msg(sb, KERN_ERR, + "sbi->clu_srch_ptr is invalid (%u)\n", + sbi->clu_srch_ptr); + sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; + } + + hint_clu = exfat_find_free_bitmap(sb, sbi->clu_srch_ptr); + if (hint_clu == EXFAT_EOF_CLUSTER) + return -ENOSPC; + } + + /* check cluster validation */ + if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) { + exfat_msg(sb, KERN_ERR, "hint_cluster is invalid (%u)\n", + hint_clu); + hint_clu = EXFAT_FIRST_CLUSTER; + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) + return -EIO; + p_chain->flags = ALLOC_FAT_CHAIN; + } + } + + set_bit(EXFAT_SB_DIRTY, &sbi->s_state); + + p_chain->dir = EXFAT_EOF_CLUSTER; + + while ((new_clu = exfat_find_free_bitmap(sb, hint_clu)) != + EXFAT_EOF_CLUSTER) { + if (new_clu != hint_clu && + p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) { + ret = -EIO; + goto free_cluster; + } + p_chain->flags = ALLOC_FAT_CHAIN; + } + + /* update allocation bitmap */ + if (exfat_set_bitmap(inode, new_clu)) { + ret = -EIO; + goto free_cluster; + } + + num_clusters++; + + /* update FAT table */ + if (p_chain->flags == ALLOC_FAT_CHAIN) { + if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) { + ret = -EIO; + goto free_cluster; + } + } + + if (p_chain->dir == EXFAT_EOF_CLUSTER) { + p_chain->dir = new_clu; + } else if (p_chain->flags == ALLOC_FAT_CHAIN) { + if (exfat_ent_set(sb, last_clu, new_clu)) { + ret = -EIO; + goto free_cluster; + } + } + last_clu = new_clu; + + if (--num_alloc == 0) { + sbi->clu_srch_ptr = hint_clu; + sbi->used_clusters += num_clusters; + + p_chain->size += num_clusters; + return 0; + } + + hint_clu = new_clu + 1; + if (hint_clu >= sbi->num_clusters) { + hint_clu = EXFAT_FIRST_CLUSTER; + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) { + ret = -EIO; + goto free_cluster; + } + p_chain->flags = ALLOC_FAT_CHAIN; + } + } + } +free_cluster: + if (num_clusters) + exfat_free_cluster(inode, p_chain); + return ret; +} + +int exfat_count_num_clusters(struct super_block *sb, + struct exfat_chain *p_chain, unsigned int *ret_count) +{ + unsigned int i, count; + unsigned int clu; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + if (!p_chain->dir || p_chain->dir == EXFAT_EOF_CLUSTER) { + *ret_count = 0; + return 0; + } + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + *ret_count = p_chain->size; + return 0; + } + + clu = p_chain->dir; + count = 0; + for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; i++) { + count++; + if (exfat_ent_get(sb, clu, &clu)) + return -EIO; + if (clu == EXFAT_EOF_CLUSTER) + break; + } + + *ret_count = count; + return 0; +} diff --git a/fs/exfat/file.c b/fs/exfat/file.c new file mode 100644 index 000000000000..483f683757aa --- /dev/null +++ b/fs/exfat/file.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = i_size_read(inode), count = size - i_size_read(inode); + int err, err2; + + err = generic_cont_expand_simple(inode, size); + if (err) + return err; + + inode->i_ctime = inode->i_mtime = current_time(inode); + mark_inode_dirty(inode); + + if (!IS_SYNC(inode)) + return 0; + + err = filemap_fdatawrite_range(mapping, start, start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (err) + return err; + + return filemap_fdatawait_range(mapping, start, start + count - 1); +} + +static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode) +{ + mode_t allow_utime = sbi->options.allow_utime; + + if (!uid_eq(current_fsuid(), inode->i_uid)) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return true; + } + + /* use a default check */ + return false; +} + +static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) +{ + mode_t i_mode, mask, perm; + + i_mode = inode->i_mode; + + mask = (S_ISREG(i_mode) || S_ISLNK(i_mode)) ? + sbi->options.fs_fmask : sbi->options.fs_dmask; + perm = *mode_ptr & ~(S_IFMT | mask); + + /* Of the r and x bits, all (subject to umask) must be present.*/ + if ((perm & 0555) != (i_mode & 0555)) + return -EPERM; + + if (exfat_mode_can_hold_ro(inode)) { + /* + * Of the w bits, either all (subject to umask) or none must + * be present. + */ + if ((perm & 0222) && ((perm & 0222) != (0222 & ~mask))) + return -EPERM; + } else { + /* + * If exfat_mode_can_hold_ro(inode) is false, can't change + * w bits. + */ + if ((perm & 0222) != (0222 & ~mask)) + return -EPERM; + } + + *mode_ptr &= S_IFMT | perm; + + return 0; +} + +/* resize the file length */ +int __exfat_truncate(struct inode *inode, loff_t new_size) +{ + unsigned int num_clusters_new, num_clusters_phys; + unsigned int last_clu = EXFAT_FREE_CLUSTER; + struct exfat_chain clu; + struct exfat_dentry *ep, *ep2; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_entry_set_cache *es = NULL; + int evict = (ei->dir.dir == DIR_DELETED) ? 
1 : 0; + + /* check if the given file ID is opened */ + if (ei->type != TYPE_FILE && ei->type != TYPE_DIR) + return -EPERM; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); + num_clusters_phys = + EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi); + + exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); + + if (new_size > 0) { + /* + * Truncate FAT chain num_clusters after the first cluster + * num_clusters = min(new, phys); + */ + unsigned int num_clusters = + min(num_clusters_new, num_clusters_phys); + + /* + * Follow FAT chain + * (defensive coding - works fine even with corrupted FAT table + */ + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + clu.dir += num_clusters; + clu.size -= num_clusters; + } else { + while (num_clusters > 0) { + last_clu = clu.dir; + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + + num_clusters--; + clu.size--; + } + } + } else { + ei->flags = ALLOC_NO_FAT_CHAIN; + ei->start_clu = EXFAT_EOF_CLUSTER; + } + + i_size_write(inode, new_size); + + if (ei->type == TYPE_FILE) + ei->attr |= ATTR_ARCHIVE; + + /* update the directory entry */ + if (!evict) { + struct timespec64 ts; + + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, + ES_ALL_ENTRIES, &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + ts = current_time(inode); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + ep->dentry.file.attr = cpu_to_le16(ei->attr); + + /* File size should be zero if there is no cluster allocated */ + if (ei->start_clu == EXFAT_EOF_CLUSTER) { + ep->dentry.stream.valid_size = 0; + ep->dentry.stream.size = 0; + } else { + ep->dentry.stream.valid_size = cpu_to_le64(new_size); + ep->dentry.stream.size = ep->dentry.stream.valid_size; + } + + if (new_size == 0) { + /* Any directory can not be truncated to zero */ + WARN_ON(ei->type != TYPE_FILE); + + ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; + ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; + } + + if (exfat_update_dir_chksum_with_entry_set(sb, es, + inode_needs_sync(inode))) + return -EIO; + kfree(es); + } + + /* cut off from the FAT chain */ + if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER && + last_clu != EXFAT_EOF_CLUSTER) { + if (exfat_ent_set(sb, last_clu, EXFAT_EOF_CLUSTER)) + return -EIO; + } + + /* invalidate cache and free the clusters */ + /* clear exfat cache */ + exfat_cache_inval_inode(inode); + + /* hint information */ + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->hint_bmap.clu = EXFAT_EOF_CLUSTER; + if (ei->rwoffset > new_size) + ei->rwoffset = new_size; + + /* hint_stat will be used if this is directory. 
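+ * Reset it to the first entry of the remaining cluster chain.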
*/ + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = ei->start_clu; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + + /* free the clusters */ + if (exfat_free_cluster(inode, &clu)) + return -EIO; + + exfat_set_vol_flags(sb, VOL_CLEAN); + + return 0; +} + +void exfat_truncate(struct inode *inode, loff_t size) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int blocksize = 1 << inode->i_blkbits; + loff_t aligned_size; + int err; + + mutex_lock(&sbi->s_lock); + if (EXFAT_I(inode)->start_clu == 0) { + /* + * Empty start_clu != ~0 (not allocated) + */ + exfat_fs_error(sb, "tried to truncate zeroed cluster."); + goto write_size; + } + + err = __exfat_truncate(inode, i_size_read(inode)); + if (err) + goto write_size; + + inode->i_ctime = inode->i_mtime = current_time(inode); + if (IS_DIRSYNC(inode)) + exfat_sync_inode(inode); + else + mark_inode_dirty(inode); + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & + ~(sbi->cluster_size - 1)) >> inode->i_blkbits; +write_size: + aligned_size = i_size_read(inode); + if (aligned_size & (blocksize - 1)) { + aligned_size |= (blocksize - 1); + aligned_size++; + } + + if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode)) + EXFAT_I(inode)->i_size_ondisk = aligned_size; + + if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode)) + EXFAT_I(inode)->i_size_aligned = aligned_size; + mutex_unlock(&sbi->s_lock); +} + +int exfat_getattr(const struct path *path, struct kstat *stat, + unsigned int request_mask, unsigned int query_flags) +{ + struct inode *inode = d_backing_inode(path->dentry); + struct exfat_inode_info *ei = EXFAT_I(inode); + + generic_fillattr(inode, stat); + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = ei->i_crtime.tv_sec; + stat->btime.tv_nsec = ei->i_crtime.tv_nsec; + stat->blksize = EXFAT_SB(inode->i_sb)->cluster_size; + return 0; +} + +int exfat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size > i_size_read(inode)) { + error = exfat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + return error; + attr->ia_valid &= ~ATTR_SIZE; + } + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) && + exfat_allow_set_time(sbi, inode)) { + attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET | + ATTR_TIMES_SET); + } + + error = setattr_prepare(dentry, attr); + attr->ia_valid = ia_valid; + if (error) + goto out; + + if (((attr->ia_valid & ATTR_UID) && + !uid_eq(attr->ia_uid, sbi->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + !gid_eq(attr->ia_gid, sbi->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | 0777)))) { + error = -EPERM; + goto out; + } + + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. 
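+ * Instead, if exfat_sanitize_mode() rejects the new mode below, ATTR_MODE + * is silently dropped from the request.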
+ */ + if (attr->ia_valid & ATTR_MODE) { + if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } + + if (attr->ia_valid & ATTR_SIZE) { + error = exfat_block_truncate_page(inode, attr->ia_size); + if (error) + goto out; + + down_write(&EXFAT_I(inode)->truncate_lock); + truncate_setsize(inode, attr->ia_size); + exfat_truncate(inode, attr->ia_size); + up_write(&EXFAT_I(inode)->truncate_lock); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + +out: + return error; +} + +const struct file_operations exfat_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations exfat_file_inode_operations = { + .setattr = exfat_setattr, + .getattr = exfat_getattr, +}; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c new file mode 100644 index 000000000000..06887492f54b --- /dev/null +++ b/fs/exfat/inode.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/init.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/time.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include <linux/random.h> +#include <linux/iversion.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int __exfat_write_inode(struct inode *inode, int sync) +{ + int ret = -EIO; + unsigned long long on_disk_size; + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + bool is_dir = (ei->type == TYPE_DIR) ? true : false; + + if (inode->i_ino == EXFAT_ROOT_INO) + return 0; + + /* + * If the indode is already unlinked, there is no need for updating it. 
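+ * DIR_DELETED means the on-disk dentry set is already gone, so there is + * nothing to write back.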
+ */ + if (ei->dir.dir == DIR_DELETED) + return 0; + + if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1) + return 0; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + /* get the directory entry of given file or directory */ + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES, + &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); + + /* set FILE_INFO structure using the acquired struct exfat_dentry */ + exfat_set_entry_time(sbi, &ei->i_crtime, + &ep->dentry.file.create_tz, + &ep->dentry.file.create_time, + &ep->dentry.file.create_date, + &ep->dentry.file.create_time_ms); + exfat_set_entry_time(sbi, &inode->i_mtime, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + exfat_set_entry_time(sbi, &inode->i_atime, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); + + /* File size should be zero if there is no cluster allocated */ + on_disk_size = i_size_read(inode); + + if (ei->start_clu == EXFAT_EOF_CLUSTER) + on_disk_size = 0; + + ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); + ep2->dentry.stream.size = ep2->dentry.stream.valid_size; + + ret = exfat_update_dir_chksum_with_entry_set(sb, es, sync); + kfree(es); + return ret; +} + +int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); + ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); + + return ret; +} + +void exfat_sync_inode(struct inode *inode) +{ + lockdep_assert_held(&EXFAT_SB(inode->i_sb)->s_lock); + __exfat_write_inode(inode, 1); +} + +/* + * Input: inode, (logical) clu_offset, target allocation area + * Output: errcode, cluster number + * *clu = (~0), if it's unable to allocate a new cluster + */ +static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, + unsigned int *clu, int create) +{ + int ret, modified = false; + unsigned int last_clu; + struct exfat_chain new_clu; + struct exfat_dentry *ep; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned int local_clu_offset = clu_offset; + unsigned int num_to_be_allocated = 0, num_clusters = 0; + + ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi); + + if (EXFAT_I(inode)->i_size_ondisk > 0) + num_clusters = + EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, + sbi); + + if (clu_offset >= num_clusters) + num_to_be_allocated = clu_offset - num_clusters + 1; + + if (!create && (num_to_be_allocated > 0)) { + *clu = EXFAT_EOF_CLUSTER; + return 0; + } + + *clu = last_clu = ei->start_clu; + + if (ei->flags == ALLOC_NO_FAT_CHAIN) { + if (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { + last_clu += clu_offset - 1; + + if (clu_offset == num_clusters) + *clu = EXFAT_EOF_CLUSTER; + else + *clu += clu_offset; + } + } else if (ei->type == TYPE_FILE) { + unsigned int fclus = 0; + int err = exfat_get_cluster(inode, clu_offset, + &fclus, clu, &last_clu, 1); + if (err) + return -EIO; + + clu_offset -= fclus; + } else { + /* hint information */ + if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && + ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { + clu_offset -= ei->hint_bmap.off; + /* hint_bmap.clu should be valid */ + WARN_ON(ei->hint_bmap.clu < 2); + *clu = 
ei->hint_bmap.clu; + } + + while (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { + last_clu = *clu; + if (exfat_get_next_cluster(sb, clu)) + return -EIO; + clu_offset--; + } + } + + if (*clu == EXFAT_EOF_CLUSTER) { + exfat_set_vol_flags(sb, VOL_DIRTY); + + new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? + EXFAT_EOF_CLUSTER : last_clu + 1; + new_clu.size = 0; + new_clu.flags = ei->flags; + + /* allocate a cluster */ + if (num_to_be_allocated < 1) { + /* Broken FAT (i_sze > allocated FAT) */ + exfat_fs_error(sb, "broken FAT chain."); + return -EIO; + } + + ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu); + if (ret) + return ret; + + if (new_clu.dir == EXFAT_EOF_CLUSTER || + new_clu.dir == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "bogus cluster new allocated (last_clu : %u, new_clu : %u)", + last_clu, new_clu.dir); + return -EIO; + } + + /* append to the FAT chain */ + if (last_clu == EXFAT_EOF_CLUSTER) { + if (new_clu.flags == ALLOC_FAT_CHAIN) + ei->flags = ALLOC_FAT_CHAIN; + ei->start_clu = new_clu.dir; + modified = true; + } else { + if (new_clu.flags != ei->flags) { + /* no-fat-chain bit is disabled, + * so fat-chain should be synced with + * alloc-bitmap + */ + exfat_chain_cont_cluster(sb, ei->start_clu, + num_clusters); + ei->flags = ALLOC_FAT_CHAIN; + modified = true; + } + if (new_clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, new_clu.dir)) + return -EIO; + } + + num_clusters += num_to_be_allocated; + *clu = new_clu.dir; + + if (ei->dir.dir != DIR_DELETED) { + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, + ES_ALL_ENTRIES, &ep); + if (!es) + return -EIO; + /* get stream entry */ + ep++; + + /* update directory entry */ + if (modified) { + if (ep->dentry.stream.flags != ei->flags) + ep->dentry.stream.flags = ei->flags; + + if (le32_to_cpu(ep->dentry.stream.start_clu) != + ei->start_clu) + ep->dentry.stream.start_clu = + cpu_to_le32(ei->start_clu); + + ep->dentry.stream.valid_size = + cpu_to_le64(i_size_read(inode)); + ep->dentry.stream.size = + ep->dentry.stream.valid_size; + } + + if (exfat_update_dir_chksum_with_entry_set(sb, es, + inode_needs_sync(inode))) + return -EIO; + kfree(es); + + } /* end of if != DIR_DELETED */ + + inode->i_blocks += + num_to_be_allocated << sbi->sect_per_clus_bits; + + /* + * Move *clu pointer along FAT chains (hole care) because the + * caller of this function expect *clu to be the last cluster. + * This only works when num_to_be_allocated >= 2, + * *clu = (the first cluster of the allocated chain) => + * (the last cluster of ...) + */ + if (ei->flags == ALLOC_NO_FAT_CHAIN) { + *clu += num_to_be_allocated - 1; + } else { + while (num_to_be_allocated > 1) { + if (exfat_get_next_cluster(sb, clu)) + return -EIO; + num_to_be_allocated--; + } + } + + } + + /* hint information */ + ei->hint_bmap.off = local_clu_offset; + ei->hint_bmap.clu = *clu; + + return 0; +} + +static int exfat_map_new_buffer(struct exfat_inode_info *ei, + struct buffer_head *bh, loff_t pos) +{ + if (buffer_delay(bh) && pos > ei->i_size_aligned) + return -EIO; + set_buffer_new(bh); + + /* + * Adjust i_size_aligned if i_size_ondisk is bigger than it. 
+ */ + if (ei->i_size_ondisk > ei->i_size_aligned) + ei->i_size_aligned = ei->i_size_ondisk; + return 0; +} + +static int exfat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err = 0; + unsigned long mapped_blocks = 0; + unsigned int cluster, sec_offset; + sector_t last_block; + sector_t phys = 0; + loff_t pos; + + mutex_lock(&sbi->s_lock); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + if (iblock >= last_block && !create) + goto done; + + /* Is this block already allocated? */ + err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits, + &cluster, create); + if (err) { + if (err != -ENOSPC) + exfat_fs_error_ratelimit(sb, + "failed to bmap (inode : %p iblock : %llu, err : %d)", + inode, (unsigned long long)iblock, err); + goto unlock_ret; + } + + if (cluster == EXFAT_EOF_CLUSTER) + goto done; + + /* sector offset in cluster */ + sec_offset = iblock & (sbi->sect_per_clus - 1); + + phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset; + mapped_blocks = sbi->sect_per_clus - sec_offset; + max_blocks = min(mapped_blocks, max_blocks); + + /* Treat newly added block / cluster */ + if (iblock < last_block) + create = 0; + + if (create || buffer_delay(bh_result)) { + pos = EXFAT_BLK_TO_B((iblock + 1), sb); + if (ei->i_size_ondisk < pos) + ei->i_size_ondisk = pos; + } + + if (create) { + err = exfat_map_new_buffer(ei, bh_result, pos); + if (err) { + exfat_fs_error(sb, + "requested for bmap out of range(pos : (%llu) > i_size_aligned(%llu)\n", + pos, ei->i_size_aligned); + goto unlock_ret; + } + } + + if (buffer_delay(bh_result)) + clear_buffer_delay(bh_result); + map_bh(bh_result, sb, phys); +done: + bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); +unlock_ret: + mutex_unlock(&sbi->s_lock); + return err; +} + +static int exfat_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, exfat_get_block); +} + +static int exfat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); +} + +static int exfat_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, exfat_get_block, wbc); +} + +static int exfat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, exfat_get_block); +} + +static void exfat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > i_size_read(inode)) { + truncate_pagecache(inode, i_size_read(inode)); + exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); + } +} + +static int exfat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + exfat_get_block, + &EXFAT_I(mapping->host)->i_size_ondisk); + + if (ret < 0) + exfat_write_failed(mapping, pos+len); + + return ret; +} + +static int exfat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + struct 
exfat_inode_info *ei = EXFAT_I(inode); + int err; + + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + + if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) { + exfat_fs_error(inode->i_sb, + "invalid size(size(%llu) > aligned(%llu)\n", + i_size_read(inode), EXFAT_I(inode)->i_size_aligned); + return -EIO; + } + + if (err < len) + exfat_write_failed(mapping, pos+len); + + if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { + inode->i_mtime = inode->i_ctime = current_time(inode); + ei->attr |= ATTR_ARCHIVE; + mark_inode_dirty(inode); + } + + return err; +} + +static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t size = iocb->ki_pos + iov_iter_count(iter); + int rw = iov_iter_rw(iter); + ssize_t ret; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), + * so we need to update the ->i_size_aligned to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->i_size_aligned + * + * Return 0, and fallback to normal buffered write. + */ + if (EXFAT_I(inode)->i_size_aligned < size) + return 0; + } + + /* + * Need to use the DIO_LOCKING for avoiding the race + * condition of exfat_get_block() and ->truncate(). + */ + ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); + if (ret < 0 && (rw & WRITE)) + exfat_write_failed(mapping, size); + return ret; +} + +static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block) +{ + sector_t blocknr; + + /* exfat_get_cluster() assumes the requested blocknr isn't truncated. */ + down_read(&EXFAT_I(mapping->host)->truncate_lock); + blocknr = generic_block_bmap(mapping, block, exfat_get_block); + up_read(&EXFAT_I(mapping->host)->truncate_lock); + return blocknr; +} + +/* + * exfat_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate to physically zeroout the tail end + * of that block so it doesn't yield old data if the file is later grown. 
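+ * (For example, with a 512-byte block size, truncating to offset 100 zeroes + * bytes 100-511 of the block containing the new EOF.)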
+ * Also, avoid causing failure from fsx for cases of "data past EOF" + */ +int exfat_block_truncate_page(struct inode *inode, loff_t from) +{ + return block_truncate_page(inode->i_mapping, from, exfat_get_block); +} + +static const struct address_space_operations exfat_aops = { + .readpage = exfat_readpage, + .readpages = exfat_readpages, + .writepage = exfat_writepage, + .writepages = exfat_writepages, + .write_begin = exfat_write_begin, + .write_end = exfat_write_end, + .direct_IO = exfat_direct_IO, + .bmap = exfat_aop_bmap +}; + +static inline unsigned long exfat_hash(loff_t i_pos) +{ + return hash_32(i_pos, EXFAT_HASH_BITS); +} + +void exfat_hash_inode(struct inode *inode, loff_t i_pos) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + EXFAT_I(inode)->i_pos = i_pos; + hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head); + spin_unlock(&sbi->inode_hash_lock); +} + +void exfat_unhash_inode(struct inode *inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + + spin_lock(&sbi->inode_hash_lock); + hlist_del_init(&EXFAT_I(inode)->i_hash_fat); + EXFAT_I(inode)->i_pos = 0; + spin_unlock(&sbi->inode_hash_lock); +} + +struct inode *exfat_iget(struct super_block *sb, loff_t i_pos) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *info; + struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); + struct inode *inode = NULL; + + spin_lock(&sbi->inode_hash_lock); + hlist_for_each_entry(info, head, i_hash_fat) { + WARN_ON(info->vfs_inode.i_sb != sb); + + if (i_pos != info->i_pos) + continue; + inode = igrab(&info->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->inode_hash_lock); + return inode; +} + +/* doesn't deal with root inode */ +static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t size = info->size; + + memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain)); + ei->entry = info->entry; + ei->attr = info->attr; + ei->start_clu = info->start_clu; + ei->flags = info->flags; + ei->type = info->type; + + ei->version = 0; + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = info->start_clu; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + ei->rwoffset = 0; + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->i_pos = 0; + + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode_inc_iversion(inode); + inode->i_generation = prandom_u32(); + + if (info->attr & ATTR_SUBDIR) { /* directory */ + inode->i_generation &= ~1; + inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); + inode->i_op = &exfat_dir_inode_operations; + inode->i_fop = &exfat_dir_operations; + set_nlink(inode, info->num_subdirs); + } else { /* regular file */ + inode->i_generation |= 1; + inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); + inode->i_op = &exfat_file_inode_operations; + inode->i_fop = &exfat_file_operations; + inode->i_mapping->a_ops = &exfat_aops; + inode->i_mapping->nrpages = 0; + } + + i_size_write(inode, size); + + /* ondisk and aligned size should be aligned with block size */ + if (size & (inode->i_sb->s_blocksize - 1)) { + size |= (inode->i_sb->s_blocksize - 1); + size++; + } + + ei->i_size_aligned = size; + ei->i_size_ondisk = size; + + exfat_save_attr(inode, info->attr); + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & + ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + 
inode->i_mtime = info->mtime; + inode->i_ctime = info->mtime; + ei->i_crtime = info->crtime; + inode->i_atime = info->atime; + + exfat_cache_init_inode(inode); + + return 0; +} + +struct inode *exfat_build_inode(struct super_block *sb, + struct exfat_dir_entry *info, loff_t i_pos) +{ + struct inode *inode; + int err; + + inode = exfat_iget(sb, i_pos); + if (inode) + goto out; + inode = new_inode(sb); + if (!inode) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + inode->i_ino = iunique(sb, EXFAT_ROOT_INO); + inode_set_iversion(inode, 1); + err = exfat_fill_inode(inode, info); + if (err) { + iput(inode); + inode = ERR_PTR(err); + goto out; + } + exfat_hash_inode(inode, i_pos); + insert_inode_hash(inode); +out: + return inode; +} + +void exfat_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + + if (!inode->i_nlink) { + i_size_write(inode, 0); + mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); + __exfat_truncate(inode, 0); + mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); + } + + invalidate_inode_buffers(inode); + clear_inode(inode); + exfat_cache_inval_inode(inode); + exfat_unhash_inode(inode); +} diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c new file mode 100644 index 000000000000..14a3300848f6 --- /dev/null +++ b/fs/exfat/misc.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Written 1992,1993 by Werner Almesberger + * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 + * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +/* + * exfat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. + */ +void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) +{ + struct exfat_mount_options *opts = &EXFAT_SB(sb)->options; + va_list args; + struct va_format vaf; + + if (report) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + exfat_msg(sb, KERN_ERR, "error, %pV\n", &vaf); + va_end(args); + } + + if (opts->errors == EXFAT_ERRORS_PANIC) { + panic("exFAT-fs (%s): fs panic from previous error\n", + sb->s_id); + } else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) { + sb->s_flags |= SB_RDONLY; + exfat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); + } +} + +/* + * exfat_msg() - print preformated EXFAT specific messages. + * All logs except what uses exfat_fs_error() should be written by exfat_msg() + */ +void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + /* level means KERN_ pacility level */ + printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +#define SECS_PER_MIN (60) +#define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN) + +static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off) +{ + if (tz_off <= 0x3F) + ts->tv_sec -= TIMEZONE_SEC(tz_off); + else /* 0x40 <= (tz_off & 0x7F) <=0x7F */ + ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off); +} + +/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */ +void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 tz, __le16 time, __le16 date, u8 time_ms) +{ + u16 t = le16_to_cpu(time); + u16 d = le16_to_cpu(date); + + ts->tv_sec = mktime64(1980 + (d >> 9), d >> 5 & 0x000F, d & 0x001F, + t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1); + + + /* time_ms field represent 0 ~ 199(1990 ms) */ + if (time_ms) { + ts->tv_sec += time_ms / 100; + ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC; + } + + if (tz & EXFAT_TZ_VALID) + /* Adjust timezone to UTC0. */ + exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID); + else + /* Convert from local time to UTC using time_offset. */ + ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN; +} + +/* Convert linear UNIX date to a EXFAT time/date pair. */ +void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 *tz, __le16 *time, __le16 *date, u8 *time_ms) +{ + struct tm tm; + u16 t, d; + + time64_to_tm(ts->tv_sec, 0, &tm); + t = (tm.tm_hour << 11) | (tm.tm_min << 5) | (tm.tm_sec >> 1); + d = ((tm.tm_year - 80) << 9) | ((tm.tm_mon + 1) << 5) | tm.tm_mday; + + *time = cpu_to_le16(t); + *date = cpu_to_le16(d); + + /* time_ms field represent 0 ~ 199(1990 ms) */ + if (time_ms) + *time_ms = (tm.tm_sec & 1) * 100 + + ts->tv_nsec / (10 * NSEC_PER_MSEC); + + /* + * Record 00h value for OffsetFromUtc field and 1 value for OffsetValid + * to indicate that local time and UTC are the same. + */ + *tz = EXFAT_TZ_VALID; +} + +unsigned short exfat_calc_chksum_2byte(void *data, int len, + unsigned short chksum, int type) +{ + int i; + unsigned char *c = (unsigned char *)data; + + for (i = 0; i < len; i++, c++) { + if (((i == 2) || (i == 3)) && (type == CS_DIR_ENTRY)) + continue; + chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + + (unsigned short)*c; + } + return chksum; +} + +void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync) +{ + set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + + if (sync) + sync_dirty_buffer(bh); +} + +void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, + unsigned int size, unsigned char flags) +{ + ec->dir = dir; + ec->size = size; + ec->flags = flags; +} + +void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec) +{ + return exfat_chain_set(dup, ec->dir, ec->size, ec->flags); +} diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c new file mode 100644 index 000000000000..a8681d91f569 --- /dev/null +++ b/fs/exfat/namei.c @@ -0,0 +1,1448 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/iversion.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/nls.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static inline unsigned long exfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void exfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + +/* + * If new entry was created in the parent, it could create the 8.3 alias (the + * shortname of logname). So, the parent may have the negative-dentry which + * matches the created 8.3 alias. + * + * If it happened, the negative dentry isn't actually negative anymore. So, + * drop it. + */ +static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + int ret; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, and + * will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (d_really_is_positive(dentry)) + return 1; + + /* + * Drop the negative dentry, in order to make sure to use the case + * sensitive name which is specified by user if this is for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + + spin_lock(&dentry->d_lock); + ret = inode_eq_iversion(d_inode(dentry->d_parent), + exfat_d_version(dentry)); + spin_unlock(&dentry->d_lock); + return ret; +} + +/* returns the length of a struct qstr, ignoring trailing dots */ +static unsigned int exfat_striptail_len(unsigned int len, const char *name) +{ + while (len && name[len - 1] == '.') + len--; + return len; +} + +/* + * Compute the hash for the exfat name corresponding to the dentry. If the name + * is invalid, we leave the hash code unchanged so that the existing dentry can + * be used. The exfat fs routines will return ENOENT or EINVAL as appropriate. 
+ */ +static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + struct super_block *sb = dentry->d_sb; + struct nls_table *t = EXFAT_SB(sb)->nls_io; + const unsigned char *name = qstr->name; + unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned long hash = init_name_hash(dentry); + int i, charlen; + wchar_t c; + + for (i = 0; i < len; i += charlen) { + charlen = t->char2uni(&name[i], len - i, &c); + if (charlen < 0) + return charlen; + hash = partial_name_hash(exfat_toupper(sb, c), hash); + } + + qstr->hash = end_name_hash(hash); + return 0; +} + +static int exfat_d_cmp(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct super_block *sb = dentry->d_sb; + struct nls_table *t = EXFAT_SB(sb)->nls_io; + unsigned int alen = exfat_striptail_len(name->len, name->name); + unsigned int blen = exfat_striptail_len(len, str); + wchar_t c1, c2; + int charlen, i; + + if (alen != blen) + return 1; + + for (i = 0; i < len; i += charlen) { + charlen = t->char2uni(&name->name[i], alen - i, &c1); + if (charlen < 0) + return 1; + if (charlen != t->char2uni(&str[i], blen - i, &c2)) + return 1; + + if (exfat_toupper(sb, c1) != exfat_toupper(sb, c2)) + return 1; + } + + return 0; +} + +const struct dentry_operations exfat_dentry_ops = { + .d_revalidate = exfat_d_revalidate, + .d_hash = exfat_d_hash, + .d_compare = exfat_d_cmp, +}; + +static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + struct super_block *sb = dentry->d_sb; + const unsigned char *name = qstr->name; + unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned long hash = init_name_hash(dentry); + int i, charlen; + unicode_t u; + + for (i = 0; i < len; i += charlen) { + charlen = utf8_to_utf32(&name[i], len - i, &u); + if (charlen < 0) + return charlen; + + /* + * Convert to UTF-16: code points above U+FFFF are encoded as + * surrogate pairs. + * exfat_toupper() works only for code points up to the U+FFFF. 
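+ * The standard UTF-16 split is: high = 0xD800 + ((u - 0x10000) >> 10), + * low = 0xDC00 + ((u - 0x10000) & 0x3FF).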
+ */ + if (u > 0xFFFF) { + hash = partial_name_hash(exfat_high_surrogate(u), hash); + hash = partial_name_hash(exfat_low_surrogate(u), hash); + } else { + hash = partial_name_hash(exfat_toupper(sb, u), hash); + } + } + + qstr->hash = end_name_hash(hash); + return 0; +} + +static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct super_block *sb = dentry->d_sb; + unsigned int alen = exfat_striptail_len(name->len, name->name); + unsigned int blen = exfat_striptail_len(len, str); + unicode_t u_a, u_b; + int charlen, i; + + if (alen != blen) + return 1; + + for (i = 0; i < alen; i += charlen) { + charlen = utf8_to_utf32(&name->name[i], alen - i, &u_a); + if (charlen < 0) + return 1; + if (charlen != utf8_to_utf32(&str[i], blen - i, &u_b)) + return 1; + + if (u_a <= 0xFFFF && u_b <= 0xFFFF) { + if (exfat_toupper(sb, u_a) != exfat_toupper(sb, u_b)) + return 1; + } else if (u_a > 0xFFFF && u_b > 0xFFFF) { + if (exfat_low_surrogate(u_a) != + exfat_low_surrogate(u_b) || + exfat_high_surrogate(u_a) != + exfat_high_surrogate(u_b)) + return 1; + } else { + return 1; + } + } + + return 0; +} + +const struct dentry_operations exfat_utf8_dentry_ops = { + .d_revalidate = exfat_d_revalidate, + .d_hash = exfat_utf8_d_hash, + .d_compare = exfat_utf8_d_cmp, +}; + +/* used only in search empty_slot() */ +#define CNT_UNUSED_NOHIT (-1) +#define CNT_UNUSED_HIT (-2) +/* search EMPTY CONTINUOUS "num_entries" entries */ +static int exfat_search_empty_slot(struct super_block *sb, + struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir, + int num_entries) +{ + int i, dentry, num_empty = 0; + int dentries_per_clu; + unsigned int type; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + dentries_per_clu = sbi->dentries_per_clu; + + if (hint_femp->eidx != EXFAT_HINT_NONE) { + dentry = hint_femp->eidx; + if (num_entries <= hint_femp->count) { + hint_femp->eidx = EXFAT_HINT_NONE; + return dentry; + } + + exfat_chain_dup(&clu, &hint_femp->cur); + } else { + exfat_chain_dup(&clu, p_dir); + dentry = 0; + } + + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + + for (; i < dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + type = exfat_get_entry_type(ep); + brelse(bh); + + if (type == TYPE_UNUSED || type == TYPE_DELETED) { + num_empty++; + if (hint_femp->eidx == EXFAT_HINT_NONE) { + hint_femp->eidx = dentry; + hint_femp->count = CNT_UNUSED_NOHIT; + exfat_chain_set(&hint_femp->cur, + clu.dir, clu.size, clu.flags); + } + + if (type == TYPE_UNUSED && + hint_femp->count != CNT_UNUSED_HIT) + hint_femp->count = CNT_UNUSED_HIT; + } else { + if (hint_femp->eidx != EXFAT_HINT_NONE && + hint_femp->count == CNT_UNUSED_HIT) { + /* unused empty group means + * an empty group which includes + * unused dentry + */ + exfat_fs_error(sb, + "found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)", + dentry, hint_femp->eidx, + p_dir->dir, clu.dir); + return -EIO; + } + + num_empty = 0; + hint_femp->eidx = EXFAT_HINT_NONE; + } + + if (num_empty >= num_entries) { + /* found and invalidate hint_femp */ + hint_femp->eidx = EXFAT_HINT_NONE; + return (dentry - (num_entries - 1)); + } + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + } + + 
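+ /* Reached the end of the cluster chain without finding num_entries + * consecutive free dentries. + */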
return -ENOSPC; +} + +static int exfat_check_max_dentries(struct inode *inode) +{ + if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) { + /* + * exFAT spec allows a dir to grow upto 8388608(256MB) + * dentries + */ + return -ENOSPC; + } + return 0; +} + +/* find empty directory entry. + * if there isn't any empty slot, expand cluster chain. + */ +static int exfat_find_empty_entry(struct inode *inode, + struct exfat_chain *p_dir, int num_entries) +{ + int dentry; + unsigned int ret, last_clu; + sector_t sector; + loff_t size = 0; + struct exfat_chain clu; + struct exfat_dentry *ep = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_hint_femp hint_femp; + + hint_femp.eidx = EXFAT_HINT_NONE; + + if (ei->hint_femp.eidx != EXFAT_HINT_NONE) { + memcpy(&hint_femp, &ei->hint_femp, + sizeof(struct exfat_hint_femp)); + ei->hint_femp.eidx = EXFAT_HINT_NONE; + } + + while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, + num_entries)) < 0) { + if (dentry == -EIO) + break; + + if (exfat_check_max_dentries(inode)) + return -ENOSPC; + + /* we trust p_dir->size regardless of FAT type */ + if (exfat_find_last_cluster(sb, p_dir, &last_clu)) + return -EIO; + + /* + * Allocate new cluster to this directory + */ + exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags); + + /* allocate a cluster */ + ret = exfat_alloc_cluster(inode, 1, &clu); + if (ret) + return ret; + + if (exfat_zeroed_cluster(inode, clu.dir)) + return -EIO; + + /* append to the FAT chain */ + if (clu.flags != p_dir->flags) { + /* no-fat-chain bit is disabled, + * so fat-chain should be synced with alloc-bitmap + */ + exfat_chain_cont_cluster(sb, p_dir->dir, p_dir->size); + p_dir->flags = ALLOC_FAT_CHAIN; + hint_femp.cur.flags = ALLOC_FAT_CHAIN; + } + + if (clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, clu.dir)) + return -EIO; + + if (hint_femp.eidx == EXFAT_HINT_NONE) { + /* the special case that new dentry + * should be allocated from the start of new cluster + */ + hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi); + hint_femp.count = sbi->dentries_per_clu; + + exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags); + } + hint_femp.cur.size++; + p_dir->size++; + size = EXFAT_CLU_TO_B(p_dir->size, sbi); + + /* update the directory entry */ + if (p_dir->dir != sbi->root_dir) { + struct buffer_head *bh; + + ep = exfat_get_dentry(sb, + &(ei->dir), ei->entry + 1, &bh, &sector); + if (!ep) + return -EIO; + + ep->dentry.stream.valid_size = cpu_to_le64(size); + ep->dentry.stream.size = ep->dentry.stream.valid_size; + ep->dentry.stream.flags = p_dir->flags; + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + if (exfat_update_dir_chksum(inode, &(ei->dir), + ei->entry)) + return -EIO; + } + + /* directory inode should be updated in here */ + i_size_write(inode, size); + EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size; + EXFAT_I(inode)->i_size_aligned += sbi->cluster_size; + EXFAT_I(inode)->flags = p_dir->flags; + inode->i_blocks += 1 << sbi->sect_per_clus_bits; + } + + return dentry; +} + +/* + * Name Resolution Functions : + * Zero if it was successful; otherwise nonzero. 
+ */ +static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int lookup) +{ + int namelen; + int lossy = NLS_NAME_NO_LOSSY; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + + /* strip all trailing periods */ + namelen = exfat_striptail_len(strlen(path), path); + if (!namelen) + return -ENOENT; + + if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) + return -ENAMETOOLONG; + + /* + * strip all leading spaces : + * "MS windows 7" supports leading spaces. + * So we should skip this preprocessing for compatibility. + */ + + /* file name conversion : + * If lookup case, we allow bad-name for compatibility. + */ + namelen = exfat_nls_to_utf16(sb, path, namelen, p_uniname, + &lossy); + if (namelen < 0) + return namelen; /* return error value */ + + if ((lossy && !lookup) || !namelen) + return -EINVAL; + + exfat_chain_set(p_dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + + return 0; +} + +static inline int exfat_resolve_path(struct inode *inode, + const unsigned char *path, struct exfat_chain *dir, + struct exfat_uni_name *uni) +{ + return __exfat_resolve_path(inode, path, dir, uni, 0); +} + +static inline int exfat_resolve_path_for_lookup(struct inode *inode, + const unsigned char *path, struct exfat_chain *dir, + struct exfat_uni_name *uni) +{ + return __exfat_resolve_path(inode, path, dir, uni, 1); +} + +static inline loff_t exfat_make_i_pos(struct exfat_dir_entry *info) +{ + return ((loff_t) info->dir.dir << 32) | (info->entry & 0xffffffff); +} + +static int exfat_add_entry(struct inode *inode, const char *path, + struct exfat_chain *p_dir, unsigned int type, + struct exfat_dir_entry *info) +{ + int ret, dentry, num_entries; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_uni_name uniname; + struct exfat_chain clu; + int clu_size = 0; + unsigned int start_clu = EXFAT_FREE_CLUSTER; + + ret = exfat_resolve_path(inode, path, p_dir, &uniname); + if (ret) + goto out; + + num_entries = exfat_calc_num_entries(&uniname); + if (num_entries < 0) { + ret = num_entries; + goto out; + } + + /* exfat_find_empty_entry must be called before alloc_cluster() */ + dentry = exfat_find_empty_entry(inode, p_dir, num_entries); + if (dentry < 0) { + ret = dentry; /* -EIO or -ENOSPC */ + goto out; + } + + if (type == TYPE_DIR) { + ret = exfat_alloc_new_dir(inode, &clu); + if (ret) + goto out; + start_clu = clu.dir; + clu_size = sbi->cluster_size; + } + + /* update the directory entry */ + /* fill the dos name directory entry information of the created file. + * the first cluster is not determined yet. 
(0) + */ + ret = exfat_init_dir_entry(inode, p_dir, dentry, type, + start_clu, clu_size); + if (ret) + goto out; + + ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname); + if (ret) + goto out; + + memcpy(&info->dir, p_dir, sizeof(struct exfat_chain)); + info->entry = dentry; + info->flags = ALLOC_NO_FAT_CHAIN; + info->type = type; + + if (type == TYPE_FILE) { + info->attr = ATTR_ARCHIVE; + info->start_clu = EXFAT_EOF_CLUSTER; + info->size = 0; + info->num_subdirs = 0; + } else { + int count; + struct exfat_chain cdir; + + info->attr = ATTR_SUBDIR; + info->start_clu = start_clu; + info->size = clu_size; + + exfat_chain_set(&cdir, info->start_clu, + EXFAT_B_TO_CLU(info->size, sbi), info->flags); + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return -EIO; + info->num_subdirs = count + EXFAT_MIN_SUBDIR; + } + memset(&info->crtime, 0, sizeof(info->crtime)); + memset(&info->mtime, 0, sizeof(info->mtime)); + memset(&info->atime, 0, sizeof(info->atime)); +out: + return ret; +} + +static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct exfat_chain cdir; + struct exfat_dir_entry info; + loff_t i_pos; + int err; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_set_vol_flags(sb, VOL_DIRTY); + err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE, + &info); + exfat_set_vol_flags(sb, VOL_CLEAN); + if (err) + goto unlock; + + inode_inc_iversion(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); + if (IS_DIRSYNC(dir)) + exfat_sync_inode(dir); + else + mark_inode_dirty(dir); + + i_pos = exfat_make_i_pos(&info); + inode = exfat_build_inode(sb, &info, i_pos); + if (IS_ERR(inode)) + goto unlock; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + EXFAT_I(inode)->i_crtime = current_time(inode); + /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ + + d_instantiate(dentry, inode); +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +/* lookup a file */ +static int exfat_find(struct inode *dir, struct qstr *qname, + struct exfat_dir_entry *info) +{ + int ret, dentry, num_entries, count; + struct exfat_chain cdir; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = dir->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(dir); + + if (qname->len == 0) + return -ENOENT; + + /* check the validity of directory name in the given pathname */ + ret = exfat_resolve_path_for_lookup(dir, qname->name, &cdir, &uni_name); + if (ret) + return ret; + + num_entries = exfat_calc_num_entries(&uni_name); + if (num_entries < 0) + return num_entries; + + /* check the validation of hint_stat and initialize it if required */ + if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { + ei->hint_stat.clu = cdir.dir; + ei->hint_stat.eidx = 0; + ei->version = (inode_peek_iversion_raw(dir) & 0xffffffff); + ei->hint_femp.eidx = EXFAT_HINT_NONE; + } + + /* search the file name for directories */ + dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, + num_entries, TYPE_ALL); + + if ((dentry < 0) && (dentry != -EEXIST)) + return dentry; /* -error value */ + + memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain)); + info->entry = dentry; + info->num_subdirs = 0; + + /* root directory itself */ + if (unlikely(dentry == -EEXIST)) { + int num_clu = 0; + + info->type = TYPE_DIR; + info->attr = ATTR_SUBDIR; + info->flags = ALLOC_FAT_CHAIN; + info->start_clu = sbi->root_dir; + memset(&info->crtime, 0, sizeof(info->crtime)); + memset(&info->mtime, 0, sizeof(info->mtime)); + memset(&info->atime, 0, sizeof(info->atime)); + + exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + if (exfat_count_num_clusters(sb, &cdir, &num_clu)) + return -EIO; + info->size = num_clu << sbi->cluster_size_bits; + + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return -EIO; + + info->num_subdirs = count; + } else { + es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES, &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + info->type = exfat_get_entry_type(ep); + info->attr = le16_to_cpu(ep->dentry.file.attr); + info->size = le64_to_cpu(ep2->dentry.stream.valid_size); + if ((info->type == TYPE_FILE) && (info->size == 0)) { + info->flags = ALLOC_NO_FAT_CHAIN; + info->start_clu = EXFAT_EOF_CLUSTER; + } else { + info->flags = ep2->dentry.stream.flags; + info->start_clu = + le32_to_cpu(ep2->dentry.stream.start_clu); + } + + if (ei->start_clu == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", + i_size_read(dir), ei->dir.dir, ei->entry); + return -EIO; + } + + exfat_get_entry_time(sbi, &info->crtime, + ep->dentry.file.create_tz, + ep->dentry.file.create_time, + ep->dentry.file.create_date, + ep->dentry.file.create_time_ms); + exfat_get_entry_time(sbi, &info->mtime, + ep->dentry.file.modify_tz, + ep->dentry.file.modify_time, + ep->dentry.file.modify_date, + ep->dentry.file.modify_time_ms); + exfat_get_entry_time(sbi, &info->atime, + ep->dentry.file.access_tz, + ep->dentry.file.access_time, + ep->dentry.file.access_date, + 0); + kfree(es); + + if (info->type == TYPE_DIR) { + exfat_chain_set(&cdir, info->start_clu, + EXFAT_B_TO_CLU(info->size, sbi), info->flags); + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return 
-EIO; + + info->num_subdirs = count + EXFAT_MIN_SUBDIR; + } + } + return 0; +} + +static int exfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + +static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct dentry *alias; + struct exfat_dir_entry info; + int err; + loff_t i_pos; + mode_t i_mode; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + err = exfat_find(dir, &dentry->d_name, &info); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto unlock; + } + + i_pos = exfat_make_i_pos(&info); + inode = exfat_build_inode(sb, &info, i_pos); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto unlock; + } + + i_mode = inode->i_mode; + alias = d_find_alias(inode); + + /* + * Checking "alias->d_parent == dentry->d_parent" to make sure + * FS is not corrupted (especially double linked dir). + */ + if (alias && alias->d_parent == dentry->d_parent && + !exfat_d_anon_disconn(alias)) { + + /* + * Unhashed alias is able to exist because of revalidate() + * called by lookup_fast. You can easily make this status + * by calling create and lookup concurrently + * In such case, we reuse an alias instead of new dentry + */ + if (d_unhashed(alias)) { + WARN_ON(alias->d_name.hash_len != + dentry->d_name.hash_len); + exfat_msg(sb, KERN_INFO, + "rehashed a dentry(%p) in read lookup", alias); + d_drop(dentry); + d_rehash(alias); + } else if (!S_ISDIR(i_mode)) { + /* + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. + * + * Switch to new one for reason of locality if possible. 
+ */
+ d_move(alias, dentry);
+ }
+ iput(inode);
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return alias;
+ }
+ dput(alias);
+out:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ if (!inode)
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+
+ return d_splice_alias(inode, dentry);
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return ERR_PTR(err);
+}
+
+/* remove an entry, BUT don't truncate */
+static int exfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct exfat_chain cdir;
+ struct exfat_dentry *ep;
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode = dentry->d_inode;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct buffer_head *bh;
+ sector_t sector;
+ int num_entries, entry, err = 0;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_chain_dup(&cdir, &ei->dir);
+ entry = ei->entry;
+ if (ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ if (!ep) {
+ err = -EIO;
+ goto unlock;
+ }
+ num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
+ if (num_entries < 0) {
+ err = -EIO;
+ brelse(bh);
+ goto unlock;
+ }
+ num_entries++;
+ brelse(bh);
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ /* update the directory entry */
+ if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
+ err = -EIO;
+ goto unlock;
+ }
+
+ /* This doesn't modify ei */
+ ei->dir.dir = DIR_DELETED;
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+
+ inode_inc_iversion(dir);
+ dir->i_mtime = dir->i_atime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+
+ clear_nlink(inode);
+ inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_unhash_inode(inode);
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode;
+ struct exfat_dir_entry info;
+ struct exfat_chain cdir;
+ loff_t i_pos;
+ int err;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
+ &info);
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+ if (err)
+ goto unlock;
+
+ inode_inc_iversion(dir);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+ inc_nlink(dir);
+
+ i_pos = exfat_make_i_pos(&info);
+ inode = exfat_build_inode(sb, &info, i_pos);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto unlock;
+ }
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ EXFAT_I(inode)->i_crtime = current_time(inode);
+ /* timestamp is already written, so mark_inode_dirty() is unneeded.
+ */
+
+ d_instantiate(dentry, inode);
+
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+static int exfat_check_dir_empty(struct super_block *sb,
+ struct exfat_chain *p_dir)
+{
+ int i, dentries_per_clu;
+ unsigned int type;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+
+ dentries_per_clu = sbi->dentries_per_clu;
+
+ exfat_chain_dup(&clu, p_dir);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < dentries_per_clu; i++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+ type = exfat_get_entry_type(ep);
+ brelse(bh);
+ if (type == TYPE_UNUSED)
+ return 0;
+
+ if (type != TYPE_FILE && type != TYPE_DIR)
+ continue;
+
+ return -ENOTEMPTY;
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct exfat_dentry *ep;
+ struct exfat_chain cdir, clu_to_free;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct buffer_head *bh;
+ sector_t sector;
+ int num_entries, entry, err;
+
+ mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
+
+ exfat_chain_dup(&cdir, &ei->dir);
+ entry = ei->entry;
+
+ if (ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_chain_set(&clu_to_free, ei->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags);
+
+ err = exfat_check_dir_empty(sb, &clu_to_free);
+ if (err) {
+ if (err == -EIO)
+ exfat_msg(sb, KERN_ERR,
+ "failed to exfat_check_dir_empty : err(%d)",
+ err);
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ if (!ep) {
+ err = -EIO;
+ goto unlock;
+ }
+
+ num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
+ if (num_entries < 0) {
+ err = -EIO;
+ brelse(bh);
+ goto unlock;
+ }
+ num_entries++;
+ brelse(bh);
+
+ err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
+ if (err) {
+ exfat_msg(sb, KERN_ERR,
+ "failed to exfat_remove_entries : err(%d)",
+ err);
+ goto unlock;
+ }
+ ei->dir.dir = DIR_DELETED;
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+
+ inode_inc_iversion(dir);
+ dir->i_mtime = dir->i_atime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+ drop_nlink(dir);
+
+ clear_nlink(inode);
+ inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_unhash_inode(inode);
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+unlock:
+ mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
+ return err;
+}
+
+static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
+ int oldentry, struct exfat_uni_name *p_uniname,
+ struct exfat_inode_info *ei)
+{
+ int ret, num_old_entries, num_new_entries;
+ sector_t sector_old, sector_new;
+ struct exfat_dentry *epold, *epnew;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *new_bh, *old_bh;
+ int sync = IS_DIRSYNC(inode);
+
+ epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, &sector_old);
+ if (!epold)
+ return -EIO;
+
+ num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold);
+ if (num_old_entries < 0)
+ return -EIO;
+ num_old_entries++;
+
+ num_new_entries = exfat_calc_num_entries(p_uniname);
+ if (num_new_entries < 0)
+ return num_new_entries;
+
+ if (num_old_entries < num_new_entries) {
+ int newentry;
+
+ newentry =
+ exfat_find_empty_entry(inode, p_dir, num_new_entries);
+ if (newentry < 0)
+ return newentry; /* -EIO or -ENOSPC */
+
+ epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh,
+ &sector_new);
+ if (!epnew)
+ return -EIO;
+
+ memcpy(epnew, epold, DENTRY_SIZE);
+ if (exfat_get_entry_type(epnew) == TYPE_FILE) {
+ epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, new_bh, sync);
+ brelse(old_bh);
+ brelse(new_bh);
+
+ epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh,
+ &sector_old);
+ epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh,
+ &sector_new);
+ if (!epold || !epnew)
+ return -EIO;
+
+ memcpy(epnew, epold, DENTRY_SIZE);
+ exfat_update_bh(sb, new_bh, sync);
+ brelse(old_bh);
+ brelse(new_bh);
+
+ ret = exfat_init_ext_entry(inode, p_dir, newentry,
+ num_new_entries, p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_dir, oldentry, 0,
+ num_old_entries);
+ ei->entry = newentry;
+ } else {
+ if (exfat_get_entry_type(epold) == TYPE_FILE) {
+ epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, old_bh, sync);
+ brelse(old_bh);
+ ret = exfat_init_ext_entry(inode, p_dir, oldentry,
+ num_new_entries, p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_dir, oldentry, num_new_entries,
+ num_old_entries);
+ }
+ return 0;
+}
+
+static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
+ int oldentry, struct exfat_chain *p_newdir,
+ struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
+{
+ int ret, newentry, num_new_entries, num_old_entries;
+ sector_t sector_mov, sector_new;
+ struct exfat_dentry *epmov, *epnew;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *mov_bh, *new_bh;
+
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, &sector_mov);
+ if (!epmov)
+ return -EIO;
+
+ /* check if the source and target directory is the same */
+ if (exfat_get_entry_type(epmov) == TYPE_DIR &&
+ le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir)
+ return -EINVAL;
+
+ num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
+ epmov);
+ if (num_old_entries < 0)
+ return -EIO;
+ num_old_entries++;
+
+ num_new_entries = exfat_calc_num_entries(p_uniname);
+ if (num_new_entries < 0)
+ return num_new_entries;
+
+ newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries);
+ if (newentry < 0)
+ return newentry; /* -EIO or -ENOSPC */
+
+ epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, &sector_new);
+ if (!epnew)
+ return -EIO;
+
+ memcpy(epnew, epmov, DENTRY_SIZE);
+ if (exfat_get_entry_type(epnew) == TYPE_FILE) {
+ epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ brelse(mov_bh);
+ brelse(new_bh);
+
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh,
+ &sector_mov);
+ epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh,
+ &sector_new);
+ if (!epmov || !epnew)
+ return -EIO;
+
+ memcpy(epnew, epmov, DENTRY_SIZE);
+ exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ brelse(mov_bh);
+ brelse(new_bh);
+
+ ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries,
+ p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries);
+
+ exfat_chain_set(&ei->dir, p_newdir->dir,
p_newdir->size, + p_newdir->flags); + + ei->entry = newentry; + return 0; +} + +static void exfat_update_parent_info(struct exfat_inode_info *ei, + struct inode *parent_inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb); + struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode); + loff_t parent_isize = i_size_read(parent_inode); + + /* + * the problem that struct exfat_inode_info caches wrong parent info. + * + * because of flag-mismatch of ei->dir, + * there is abnormal traversing cluster chain. + */ + if (unlikely(parent_ei->flags != ei->dir.flags || + parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) || + parent_ei->start_clu != ei->dir.dir)) { + exfat_chain_set(&ei->dir, parent_ei->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi), + parent_ei->flags); + } +} + +/* rename or move a old file into a new file */ +static int __exfat_rename(struct inode *old_parent_inode, + struct exfat_inode_info *ei, struct inode *new_parent_inode, + struct dentry *new_dentry) +{ + int ret; + int dentry; + struct exfat_chain olddir, newdir; + struct exfat_chain *p_dir = NULL; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep; + struct super_block *sb = old_parent_inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + const unsigned char *new_path = new_dentry->d_name.name; + struct inode *new_inode = new_dentry->d_inode; + int num_entries; + struct exfat_inode_info *new_ei = NULL; + unsigned int new_entry_type = TYPE_UNUSED; + int new_entry = 0; + struct buffer_head *old_bh, *new_bh = NULL; + + /* check the validity of pointer parameters */ + if (new_path == NULL || strlen(new_path) == 0) + return -EINVAL; + + if (ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, + "abnormal access to deleted source dentry"); + return -ENOENT; + } + + exfat_update_parent_info(ei, old_parent_inode); + + exfat_chain_dup(&olddir, &ei->dir); + dentry = ei->entry; + + ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL); + if (!ep) { + ret = -EIO; + goto out; + } + brelse(old_bh); + + /* check whether new dir is existing directory and empty */ + if (new_inode) { + ret = -EIO; + new_ei = EXFAT_I(new_inode); + + if (new_ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, + "abnormal access to deleted target dentry"); + goto out; + } + + exfat_update_parent_info(new_ei, new_parent_inode); + + p_dir = &(new_ei->dir); + new_entry = new_ei->entry; + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + if (!ep) + goto out; + + new_entry_type = exfat_get_entry_type(ep); + brelse(new_bh); + + /* if new_inode exists, update ei */ + if (new_entry_type == TYPE_DIR) { + struct exfat_chain new_clu; + + new_clu.dir = new_ei->start_clu; + new_clu.size = + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode), + sbi); + new_clu.flags = new_ei->flags; + + ret = exfat_check_dir_empty(sb, &new_clu); + if (ret) + goto out; + } + } + + /* check the validity of directory name in the given new pathname */ + ret = exfat_resolve_path(new_parent_inode, new_path, &newdir, + &uni_name); + if (ret) + goto out; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + if (olddir.dir == newdir.dir) + ret = exfat_rename_file(new_parent_inode, &olddir, dentry, + &uni_name, ei); + else + ret = exfat_move_file(new_parent_inode, &olddir, dentry, + &newdir, &uni_name, ei); + + if (!ret && new_inode) { + /* delete entries of new_dir */ + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + if (!ep) { + ret = -EIO; + goto del_out; + } + + num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep); + if 
(num_entries < 0) { + ret = -EIO; + goto del_out; + } + brelse(new_bh); + + if (exfat_remove_entries(new_inode, p_dir, new_entry, 0, + num_entries + 1)) { + ret = -EIO; + goto del_out; + } + + /* Free the clusters if new_inode is a dir(as if exfat_rmdir) */ + if (new_entry_type == TYPE_DIR) { + /* new_ei, new_clu_to_free */ + struct exfat_chain new_clu_to_free; + + exfat_chain_set(&new_clu_to_free, new_ei->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode), + sbi), new_ei->flags); + + if (exfat_free_cluster(new_inode, &new_clu_to_free)) { + /* just set I/O error only */ + ret = -EIO; + } + + i_size_write(new_inode, 0); + new_ei->start_clu = EXFAT_EOF_CLUSTER; + new_ei->flags = ALLOC_NO_FAT_CHAIN; + } +del_out: + /* Update new_inode ei + * Prevent syncing removed new_inode + * (new_ei is already initialized above code ("if (new_inode)") + */ + new_ei->dir.dir = DIR_DELETED; + } + exfat_set_vol_flags(sb, VOL_CLEAN); +out: + return ret; +} + +static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *old_inode, *new_inode; + struct super_block *sb = old_dir->i_sb; + loff_t i_pos; + int err; + + /* + * The VFS already checks for existence, so for local filesystems + * the RENAME_NOREPLACE implementation is equivalent to plain rename. + * Don't support any other flags + */ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + + err = __exfat_rename(old_dir, EXFAT_I(old_inode), new_dir, new_dentry); + if (err) + goto unlock; + + inode_inc_iversion(new_dir); + new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = + EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + if (IS_DIRSYNC(new_dir)) + exfat_sync_inode(new_dir); + else + mark_inode_dirty(new_dir); + + i_pos = ((loff_t)EXFAT_I(old_inode)->dir.dir << 32) | + (EXFAT_I(old_inode)->entry & 0xffffffff); + exfat_unhash_inode(old_inode); + exfat_hash_inode(old_inode, i_pos); + if (IS_DIRSYNC(new_dir)) + exfat_sync_inode(old_inode); + else + mark_inode_dirty(old_inode); + + if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) { + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + inode_inc_iversion(old_dir); + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + if (IS_DIRSYNC(old_dir)) + exfat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + exfat_unhash_inode(new_inode); + + /* skip drop_nlink if new_inode already has been dropped */ + if (new_inode->i_nlink) { + drop_nlink(new_inode); + if (S_ISDIR(new_inode->i_mode)) + drop_nlink(new_inode); + } else { + exfat_msg(sb, KERN_WARNING, + "abnormal access to an inode dropped"); + WARN_ON(new_inode->i_nlink == 0); + } + new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = + current_time(new_inode); + } + +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +const struct inode_operations exfat_dir_inode_operations = { + .create = exfat_create, + .lookup = exfat_lookup, + .unlink = exfat_unlink, + .mkdir = exfat_mkdir, + .rmdir = exfat_rmdir, + .rename = exfat_rename, + .setattr = exfat_setattr, + .getattr = exfat_getattr, +}; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c new file mode 100644 index 000000000000..6d1c3ae130ff --- /dev/null +++ b/fs/exfat/nls.c @@ -0,0 +1,831 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <asm/unaligned.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +/* Upcase tabel macro */ +#define EXFAT_NUM_UPCASE (2918) +#define UTBL_COUNT (0x10000) + +/* + * Upcase table in compressed format (7.2.5.1 Recommended Up-case Table + * in exfat specification, See: + * https://docs.microsoft.com/en-us/windows/win32/fileio/exfat-specification). + */ +static const unsigned short uni_def_upcase[EXFAT_NUM_UPCASE] = { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, + 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, + 0x0060, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, + 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00f7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, + 0x0100, 0x0100, 0x0102, 0x0102, 0x0104, 0x0104, 0x0106, 0x0106, + 0x0108, 0x0108, 0x010a, 0x010a, 0x010c, 0x010c, 0x010e, 0x010e, + 0x0110, 0x0110, 0x0112, 0x0112, 0x0114, 0x0114, 0x0116, 0x0116, + 0x0118, 0x0118, 0x011a, 0x011a, 0x011c, 0x011c, 0x011e, 0x011e, + 0x0120, 0x0120, 0x0122, 0x0122, 0x0124, 0x0124, 0x0126, 0x0126, + 0x0128, 0x0128, 0x012a, 0x012a, 0x012c, 0x012c, 0x012e, 0x012e, + 0x0130, 0x0131, 0x0132, 0x0132, 0x0134, 0x0134, 0x0136, 0x0136, + 0x0138, 0x0139, 0x0139, 0x013b, 0x013b, 0x013d, 0x013d, 0x013f, + 0x013f, 0x0141, 0x0141, 0x0143, 0x0143, 0x0145, 0x0145, 0x0147, + 0x0147, 0x0149, 0x014a, 0x014a, 0x014c, 0x014c, 0x014e, 0x014e, + 0x0150, 0x0150, 0x0152, 0x0152, 0x0154, 0x0154, 0x0156, 0x0156, + 0x0158, 0x0158, 0x015a, 0x015a, 0x015c, 0x015c, 0x015e, 0x015e, + 0x0160, 0x0160, 0x0162, 0x0162, 0x0164, 0x0164, 0x0166, 0x0166, + 0x0168, 0x0168, 0x016a, 0x016a, 0x016c, 0x016c, 0x016e, 
0x016e, + 0x0170, 0x0170, 0x0172, 0x0172, 0x0174, 0x0174, 0x0176, 0x0176, + 0x0178, 0x0179, 0x0179, 0x017b, 0x017b, 0x017d, 0x017d, 0x017f, + 0x0243, 0x0181, 0x0182, 0x0182, 0x0184, 0x0184, 0x0186, 0x0187, + 0x0187, 0x0189, 0x018a, 0x018b, 0x018b, 0x018d, 0x018e, 0x018f, + 0x0190, 0x0191, 0x0191, 0x0193, 0x0194, 0x01f6, 0x0196, 0x0197, + 0x0198, 0x0198, 0x023d, 0x019b, 0x019c, 0x019d, 0x0220, 0x019f, + 0x01a0, 0x01a0, 0x01a2, 0x01a2, 0x01a4, 0x01a4, 0x01a6, 0x01a7, + 0x01a7, 0x01a9, 0x01aa, 0x01ab, 0x01ac, 0x01ac, 0x01ae, 0x01af, + 0x01af, 0x01b1, 0x01b2, 0x01b3, 0x01b3, 0x01b5, 0x01b5, 0x01b7, + 0x01b8, 0x01b8, 0x01ba, 0x01bb, 0x01bc, 0x01bc, 0x01be, 0x01f7, + 0x01c0, 0x01c1, 0x01c2, 0x01c3, 0x01c4, 0x01c5, 0x01c4, 0x01c7, + 0x01c8, 0x01c7, 0x01ca, 0x01cb, 0x01ca, 0x01cd, 0x01cd, 0x01cf, + 0x01cf, 0x01d1, 0x01d1, 0x01d3, 0x01d3, 0x01d5, 0x01d5, 0x01d7, + 0x01d7, 0x01d9, 0x01d9, 0x01db, 0x01db, 0x018e, 0x01de, 0x01de, + 0x01e0, 0x01e0, 0x01e2, 0x01e2, 0x01e4, 0x01e4, 0x01e6, 0x01e6, + 0x01e8, 0x01e8, 0x01ea, 0x01ea, 0x01ec, 0x01ec, 0x01ee, 0x01ee, + 0x01f0, 0x01f1, 0x01f2, 0x01f1, 0x01f4, 0x01f4, 0x01f6, 0x01f7, + 0x01f8, 0x01f8, 0x01fa, 0x01fa, 0x01fc, 0x01fc, 0x01fe, 0x01fe, + 0x0200, 0x0200, 0x0202, 0x0202, 0x0204, 0x0204, 0x0206, 0x0206, + 0x0208, 0x0208, 0x020a, 0x020a, 0x020c, 0x020c, 0x020e, 0x020e, + 0x0210, 0x0210, 0x0212, 0x0212, 0x0214, 0x0214, 0x0216, 0x0216, + 0x0218, 0x0218, 0x021a, 0x021a, 0x021c, 0x021c, 0x021e, 0x021e, + 0x0220, 0x0221, 0x0222, 0x0222, 0x0224, 0x0224, 0x0226, 0x0226, + 0x0228, 0x0228, 0x022a, 0x022a, 0x022c, 0x022c, 0x022e, 0x022e, + 0x0230, 0x0230, 0x0232, 0x0232, 0x0234, 0x0235, 0x0236, 0x0237, + 0x0238, 0x0239, 0x2c65, 0x023b, 0x023b, 0x023d, 0x2c66, 0x023f, + 0x0240, 0x0241, 0x0241, 0x0243, 0x0244, 0x0245, 0x0246, 0x0246, + 0x0248, 0x0248, 0x024a, 0x024a, 0x024c, 0x024c, 0x024e, 0x024e, + 0x0250, 0x0251, 0x0252, 0x0181, 0x0186, 0x0255, 0x0189, 0x018a, + 0x0258, 0x018f, 0x025a, 0x0190, 0x025c, 0x025d, 0x025e, 0x025f, + 0x0193, 0x0261, 0x0262, 0x0194, 0x0264, 0x0265, 0x0266, 0x0267, + 0x0197, 0x0196, 0x026a, 0x2c62, 0x026c, 0x026d, 0x026e, 0x019c, + 0x0270, 0x0271, 0x019d, 0x0273, 0x0274, 0x019f, 0x0276, 0x0277, + 0x0278, 0x0279, 0x027a, 0x027b, 0x027c, 0x2c64, 0x027e, 0x027f, + 0x01a6, 0x0281, 0x0282, 0x01a9, 0x0284, 0x0285, 0x0286, 0x0287, + 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x028d, 0x028e, 0x028f, + 0x0290, 0x0291, 0x01b7, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, + 0x0298, 0x0299, 0x029a, 0x029b, 0x029c, 0x029d, 0x029e, 0x029f, + 0x02a0, 0x02a1, 0x02a2, 0x02a3, 0x02a4, 0x02a5, 0x02a6, 0x02a7, + 0x02a8, 0x02a9, 0x02aa, 0x02ab, 0x02ac, 0x02ad, 0x02ae, 0x02af, + 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x02b5, 0x02b6, 0x02b7, + 0x02b8, 0x02b9, 0x02ba, 0x02bb, 0x02bc, 0x02bd, 0x02be, 0x02bf, + 0x02c0, 0x02c1, 0x02c2, 0x02c3, 0x02c4, 0x02c5, 0x02c6, 0x02c7, + 0x02c8, 0x02c9, 0x02ca, 0x02cb, 0x02cc, 0x02cd, 0x02ce, 0x02cf, + 0x02d0, 0x02d1, 0x02d2, 0x02d3, 0x02d4, 0x02d5, 0x02d6, 0x02d7, + 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd, 0x02de, 0x02df, + 0x02e0, 0x02e1, 0x02e2, 0x02e3, 0x02e4, 0x02e5, 0x02e6, 0x02e7, + 0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef, + 0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7, + 0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff, + 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, 0x030d, 0x030e, 0x030f, + 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, + 0x0318, 0x0319, 0x031a, 0x031b, 0x031c, 
0x031d, 0x031e, 0x031f, + 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x0329, 0x032a, 0x032b, 0x032c, 0x032d, 0x032e, 0x032f, + 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + 0x0338, 0x0339, 0x033a, 0x033b, 0x033c, 0x033d, 0x033e, 0x033f, + 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x034f, + 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f, + 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f, + 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, + 0x0378, 0x0379, 0x037a, 0x03fd, 0x03fe, 0x03ff, 0x037e, 0x037f, + 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, + 0x0388, 0x0389, 0x038a, 0x038b, 0x038c, 0x038d, 0x038e, 0x038f, + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x03a2, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x0386, 0x0388, 0x0389, 0x038a, + 0x03b0, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x03a3, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x03cf, + 0x03d0, 0x03d1, 0x03d2, 0x03d3, 0x03d4, 0x03d5, 0x03d6, 0x03d7, + 0x03d8, 0x03d8, 0x03da, 0x03da, 0x03dc, 0x03dc, 0x03de, 0x03de, + 0x03e0, 0x03e0, 0x03e2, 0x03e2, 0x03e4, 0x03e4, 0x03e6, 0x03e6, + 0x03e8, 0x03e8, 0x03ea, 0x03ea, 0x03ec, 0x03ec, 0x03ee, 0x03ee, + 0x03f0, 0x03f1, 0x03f9, 0x03f3, 0x03f4, 0x03f5, 0x03f6, 0x03f7, + 0x03f7, 0x03f9, 0x03fa, 0x03fa, 0x03fc, 0x03fd, 0x03fe, 0x03ff, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0460, 0x0460, 0x0462, 0x0462, 0x0464, 0x0464, 0x0466, 0x0466, + 0x0468, 0x0468, 0x046a, 0x046a, 0x046c, 0x046c, 0x046e, 0x046e, + 0x0470, 0x0470, 0x0472, 0x0472, 0x0474, 0x0474, 0x0476, 0x0476, + 0x0478, 0x0478, 0x047a, 0x047a, 0x047c, 0x047c, 0x047e, 0x047e, + 0x0480, 0x0480, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048a, 0x048a, 0x048c, 0x048c, 0x048e, 0x048e, + 0x0490, 0x0490, 0x0492, 0x0492, 0x0494, 0x0494, 0x0496, 0x0496, + 0x0498, 0x0498, 0x049a, 0x049a, 0x049c, 0x049c, 0x049e, 0x049e, + 0x04a0, 0x04a0, 0x04a2, 0x04a2, 0x04a4, 0x04a4, 0x04a6, 0x04a6, + 0x04a8, 0x04a8, 0x04aa, 0x04aa, 0x04ac, 0x04ac, 0x04ae, 0x04ae, + 0x04b0, 0x04b0, 0x04b2, 0x04b2, 0x04b4, 0x04b4, 0x04b6, 0x04b6, + 0x04b8, 0x04b8, 0x04ba, 0x04ba, 0x04bc, 0x04bc, 0x04be, 0x04be, + 0x04c0, 0x04c1, 0x04c1, 0x04c3, 0x04c3, 0x04c5, 0x04c5, 0x04c7, + 0x04c7, 0x04c9, 0x04c9, 
0x04cb, 0x04cb, 0x04cd, 0x04cd, 0x04c0, + 0x04d0, 0x04d0, 0x04d2, 0x04d2, 0x04d4, 0x04d4, 0x04d6, 0x04d6, + 0x04d8, 0x04d8, 0x04da, 0x04da, 0x04dc, 0x04dc, 0x04de, 0x04de, + 0x04e0, 0x04e0, 0x04e2, 0x04e2, 0x04e4, 0x04e4, 0x04e6, 0x04e6, + 0x04e8, 0x04e8, 0x04ea, 0x04ea, 0x04ec, 0x04ec, 0x04ee, 0x04ee, + 0x04f0, 0x04f0, 0x04f2, 0x04f2, 0x04f4, 0x04f4, 0x04f6, 0x04f6, + 0x04f8, 0x04f8, 0x04fa, 0x04fa, 0x04fc, 0x04fc, 0x04fe, 0x04fe, + 0x0500, 0x0500, 0x0502, 0x0502, 0x0504, 0x0504, 0x0506, 0x0506, + 0x0508, 0x0508, 0x050a, 0x050a, 0x050c, 0x050c, 0x050e, 0x050e, + 0x0510, 0x0510, 0x0512, 0x0512, 0x0514, 0x0515, 0x0516, 0x0517, + 0x0518, 0x0519, 0x051a, 0x051b, 0x051c, 0x051d, 0x051e, 0x051f, + 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052a, 0x052b, 0x052c, 0x052d, 0x052e, 0x052f, + 0x0530, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0557, + 0x0558, 0x0559, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f, + 0x0560, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xffff, + 0x17f6, 0x2c63, 0x1d7e, 0x1d7f, 0x1d80, 0x1d81, 0x1d82, 0x1d83, + 0x1d84, 0x1d85, 0x1d86, 0x1d87, 0x1d88, 0x1d89, 0x1d8a, 0x1d8b, + 0x1d8c, 0x1d8d, 0x1d8e, 0x1d8f, 0x1d90, 0x1d91, 0x1d92, 0x1d93, + 0x1d94, 0x1d95, 0x1d96, 0x1d97, 0x1d98, 0x1d99, 0x1d9a, 0x1d9b, + 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, + 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab, + 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, + 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, + 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x1dc0, 0x1dc1, 0x1dc2, 0x1dc3, + 0x1dc4, 0x1dc5, 0x1dc6, 0x1dc7, 0x1dc8, 0x1dc9, 0x1dca, 0x1dcb, + 0x1dcc, 0x1dcd, 0x1dce, 0x1dcf, 0x1dd0, 0x1dd1, 0x1dd2, 0x1dd3, + 0x1dd4, 0x1dd5, 0x1dd6, 0x1dd7, 0x1dd8, 0x1dd9, 0x1dda, 0x1ddb, + 0x1ddc, 0x1ddd, 0x1dde, 0x1ddf, 0x1de0, 0x1de1, 0x1de2, 0x1de3, + 0x1de4, 0x1de5, 0x1de6, 0x1de7, 0x1de8, 0x1de9, 0x1dea, 0x1deb, + 0x1dec, 0x1ded, 0x1dee, 0x1def, 0x1df0, 0x1df1, 0x1df2, 0x1df3, + 0x1df4, 0x1df5, 0x1df6, 0x1df7, 0x1df8, 0x1df9, 0x1dfa, 0x1dfb, + 0x1dfc, 0x1dfd, 0x1dfe, 0x1dff, 0x1e00, 0x1e00, 0x1e02, 0x1e02, + 0x1e04, 0x1e04, 0x1e06, 0x1e06, 0x1e08, 0x1e08, 0x1e0a, 0x1e0a, + 0x1e0c, 0x1e0c, 0x1e0e, 0x1e0e, 0x1e10, 0x1e10, 0x1e12, 0x1e12, + 0x1e14, 0x1e14, 0x1e16, 0x1e16, 0x1e18, 0x1e18, 0x1e1a, 0x1e1a, + 0x1e1c, 0x1e1c, 0x1e1e, 0x1e1e, 0x1e20, 0x1e20, 0x1e22, 0x1e22, + 0x1e24, 0x1e24, 0x1e26, 0x1e26, 0x1e28, 0x1e28, 0x1e2a, 0x1e2a, + 0x1e2c, 0x1e2c, 0x1e2e, 0x1e2e, 0x1e30, 0x1e30, 0x1e32, 0x1e32, + 0x1e34, 0x1e34, 0x1e36, 0x1e36, 0x1e38, 0x1e38, 0x1e3a, 0x1e3a, + 0x1e3c, 0x1e3c, 0x1e3e, 0x1e3e, 0x1e40, 0x1e40, 0x1e42, 0x1e42, + 0x1e44, 0x1e44, 0x1e46, 0x1e46, 0x1e48, 0x1e48, 0x1e4a, 0x1e4a, + 0x1e4c, 0x1e4c, 0x1e4e, 0x1e4e, 0x1e50, 0x1e50, 0x1e52, 0x1e52, + 0x1e54, 0x1e54, 0x1e56, 0x1e56, 0x1e58, 0x1e58, 0x1e5a, 0x1e5a, + 0x1e5c, 0x1e5c, 0x1e5e, 0x1e5e, 0x1e60, 0x1e60, 0x1e62, 0x1e62, + 0x1e64, 0x1e64, 0x1e66, 0x1e66, 0x1e68, 0x1e68, 0x1e6a, 0x1e6a, + 0x1e6c, 
0x1e6c, 0x1e6e, 0x1e6e, 0x1e70, 0x1e70, 0x1e72, 0x1e72, + 0x1e74, 0x1e74, 0x1e76, 0x1e76, 0x1e78, 0x1e78, 0x1e7a, 0x1e7a, + 0x1e7c, 0x1e7c, 0x1e7e, 0x1e7e, 0x1e80, 0x1e80, 0x1e82, 0x1e82, + 0x1e84, 0x1e84, 0x1e86, 0x1e86, 0x1e88, 0x1e88, 0x1e8a, 0x1e8a, + 0x1e8c, 0x1e8c, 0x1e8e, 0x1e8e, 0x1e90, 0x1e90, 0x1e92, 0x1e92, + 0x1e94, 0x1e94, 0x1e96, 0x1e97, 0x1e98, 0x1e99, 0x1e9a, 0x1e9b, + 0x1e9c, 0x1e9d, 0x1e9e, 0x1e9f, 0x1ea0, 0x1ea0, 0x1ea2, 0x1ea2, + 0x1ea4, 0x1ea4, 0x1ea6, 0x1ea6, 0x1ea8, 0x1ea8, 0x1eaa, 0x1eaa, + 0x1eac, 0x1eac, 0x1eae, 0x1eae, 0x1eb0, 0x1eb0, 0x1eb2, 0x1eb2, + 0x1eb4, 0x1eb4, 0x1eb6, 0x1eb6, 0x1eb8, 0x1eb8, 0x1eba, 0x1eba, + 0x1ebc, 0x1ebc, 0x1ebe, 0x1ebe, 0x1ec0, 0x1ec0, 0x1ec2, 0x1ec2, + 0x1ec4, 0x1ec4, 0x1ec6, 0x1ec6, 0x1ec8, 0x1ec8, 0x1eca, 0x1eca, + 0x1ecc, 0x1ecc, 0x1ece, 0x1ece, 0x1ed0, 0x1ed0, 0x1ed2, 0x1ed2, + 0x1ed4, 0x1ed4, 0x1ed6, 0x1ed6, 0x1ed8, 0x1ed8, 0x1eda, 0x1eda, + 0x1edc, 0x1edc, 0x1ede, 0x1ede, 0x1ee0, 0x1ee0, 0x1ee2, 0x1ee2, + 0x1ee4, 0x1ee4, 0x1ee6, 0x1ee6, 0x1ee8, 0x1ee8, 0x1eea, 0x1eea, + 0x1eec, 0x1eec, 0x1eee, 0x1eee, 0x1ef0, 0x1ef0, 0x1ef2, 0x1ef2, + 0x1ef4, 0x1ef4, 0x1ef6, 0x1ef6, 0x1ef8, 0x1ef8, 0x1efa, 0x1efb, + 0x1efc, 0x1efd, 0x1efe, 0x1eff, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, + 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, + 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, + 0x1f1c, 0x1f1d, 0x1f16, 0x1f17, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, + 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, + 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, + 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, + 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, + 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, + 0x1f4c, 0x1f4d, 0x1f46, 0x1f47, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, + 0x1f4c, 0x1f4d, 0x1f4e, 0x1f4f, 0x1f50, 0x1f59, 0x1f52, 0x1f5b, + 0x1f54, 0x1f5d, 0x1f56, 0x1f5f, 0x1f58, 0x1f59, 0x1f5a, 0x1f5b, + 0x1f5c, 0x1f5d, 0x1f5e, 0x1f5f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, + 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, + 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, + 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, + 0x1ffa, 0x1ffb, 0x1f7e, 0x1f7f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, + 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, + 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, + 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, + 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, + 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, + 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fb8, 0x1fb9, 0x1fb2, 0x1fbc, + 0x1fb4, 0x1fb5, 0x1fb6, 0x1fb7, 0x1fb8, 0x1fb9, 0x1fba, 0x1fbb, + 0x1fbc, 0x1fbd, 0x1fbe, 0x1fbf, 0x1fc0, 0x1fc1, 0x1fc2, 0x1fc3, + 0x1fc4, 0x1fc5, 0x1fc6, 0x1fc7, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, + 0x1fc3, 0x1fcd, 0x1fce, 0x1fcf, 0x1fd8, 0x1fd9, 0x1fd2, 0x1fd3, + 0x1fd4, 0x1fd5, 0x1fd6, 0x1fd7, 0x1fd8, 0x1fd9, 0x1fda, 0x1fdb, + 0x1fdc, 0x1fdd, 0x1fde, 0x1fdf, 0x1fe8, 0x1fe9, 0x1fe2, 0x1fe3, + 0x1fe4, 0x1fec, 0x1fe6, 0x1fe7, 0x1fe8, 0x1fe9, 0x1fea, 0x1feb, + 0x1fec, 0x1fed, 0x1fee, 0x1fef, 0x1ff0, 0x1ff1, 0x1ff2, 0x1ff3, + 0x1ff4, 0x1ff5, 0x1ff6, 0x1ff7, 0x1ff8, 0x1ff9, 0x1ffa, 0x1ffb, + 0x1ff3, 0x1ffd, 0x1ffe, 0x1fff, 0x2000, 0x2001, 0x2002, 0x2003, + 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, 0x200b, + 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2013, + 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201a, 0x201b, 
+ 0x201c, 0x201d, 0x201e, 0x201f, 0x2020, 0x2021, 0x2022, 0x2023, + 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 0x202a, 0x202b, + 0x202c, 0x202d, 0x202e, 0x202f, 0x2030, 0x2031, 0x2032, 0x2033, + 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203a, 0x203b, + 0x203c, 0x203d, 0x203e, 0x203f, 0x2040, 0x2041, 0x2042, 0x2043, + 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204a, 0x204b, + 0x204c, 0x204d, 0x204e, 0x204f, 0x2050, 0x2051, 0x2052, 0x2053, + 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205a, 0x205b, + 0x205c, 0x205d, 0x205e, 0x205f, 0x2060, 0x2061, 0x2062, 0x2063, + 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b, + 0x206c, 0x206d, 0x206e, 0x206f, 0x2070, 0x2071, 0x2072, 0x2073, + 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207a, 0x207b, + 0x207c, 0x207d, 0x207e, 0x207f, 0x2080, 0x2081, 0x2082, 0x2083, + 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208a, 0x208b, + 0x208c, 0x208d, 0x208e, 0x208f, 0x2090, 0x2091, 0x2092, 0x2093, + 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, + 0x209c, 0x209d, 0x209e, 0x209f, 0x20a0, 0x20a1, 0x20a2, 0x20a3, + 0x20a4, 0x20a5, 0x20a6, 0x20a7, 0x20a8, 0x20a9, 0x20aa, 0x20ab, + 0x20ac, 0x20ad, 0x20ae, 0x20af, 0x20b0, 0x20b1, 0x20b2, 0x20b3, + 0x20b4, 0x20b5, 0x20b6, 0x20b7, 0x20b8, 0x20b9, 0x20ba, 0x20bb, + 0x20bc, 0x20bd, 0x20be, 0x20bf, 0x20c0, 0x20c1, 0x20c2, 0x20c3, + 0x20c4, 0x20c5, 0x20c6, 0x20c7, 0x20c8, 0x20c9, 0x20ca, 0x20cb, + 0x20cc, 0x20cd, 0x20ce, 0x20cf, 0x20d0, 0x20d1, 0x20d2, 0x20d3, + 0x20d4, 0x20d5, 0x20d6, 0x20d7, 0x20d8, 0x20d9, 0x20da, 0x20db, + 0x20dc, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x20e1, 0x20e2, 0x20e3, + 0x20e4, 0x20e5, 0x20e6, 0x20e7, 0x20e8, 0x20e9, 0x20ea, 0x20eb, + 0x20ec, 0x20ed, 0x20ee, 0x20ef, 0x20f0, 0x20f1, 0x20f2, 0x20f3, + 0x20f4, 0x20f5, 0x20f6, 0x20f7, 0x20f8, 0x20f9, 0x20fa, 0x20fb, + 0x20fc, 0x20fd, 0x20fe, 0x20ff, 0x2100, 0x2101, 0x2102, 0x2103, + 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210a, 0x210b, + 0x210c, 0x210d, 0x210e, 0x210f, 0x2110, 0x2111, 0x2112, 0x2113, + 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211a, 0x211b, + 0x211c, 0x211d, 0x211e, 0x211f, 0x2120, 0x2121, 0x2122, 0x2123, + 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212a, 0x212b, + 0x212c, 0x212d, 0x212e, 0x212f, 0x2130, 0x2131, 0x2132, 0x2133, + 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213a, 0x213b, + 0x213c, 0x213d, 0x213e, 0x213f, 0x2140, 0x2141, 0x2142, 0x2143, + 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214a, 0x214b, + 0x214c, 0x214d, 0x2132, 0x214f, 0x2150, 0x2151, 0x2152, 0x2153, + 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215a, 0x215b, + 0x215c, 0x215d, 0x215e, 0x215f, 0x2160, 0x2161, 0x2162, 0x2163, + 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, + 0x216c, 0x216d, 0x216e, 0x216f, 0x2160, 0x2161, 0x2162, 0x2163, + 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, + 0x216c, 0x216d, 0x216e, 0x216f, 0x2180, 0x2181, 0x2182, 0x2183, + 0x2183, 0xffff, 0x034b, 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, + 0x24bb, 0x24bc, 0x24bd, 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, + 0x24c3, 0x24c4, 0x24c5, 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, + 0x24cb, 0x24cc, 0x24cd, 0x24ce, 0x24cf, 0xffff, 0x0746, 0x2c00, + 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, 0x2c08, + 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, 0x2c10, + 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, 0x2c18, + 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, 0x2c20, + 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 
0x2c27, 0x2c28, + 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x2c5f, 0x2c60, + 0x2c60, 0x2c62, 0x2c63, 0x2c64, 0x2c65, 0x2c66, 0x2c67, 0x2c67, + 0x2c69, 0x2c69, 0x2c6b, 0x2c6b, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, + 0x2c71, 0x2c72, 0x2c73, 0x2c74, 0x2c75, 0x2c75, 0x2c77, 0x2c78, + 0x2c79, 0x2c7a, 0x2c7b, 0x2c7c, 0x2c7d, 0x2c7e, 0x2c7f, 0x2c80, + 0x2c80, 0x2c82, 0x2c82, 0x2c84, 0x2c84, 0x2c86, 0x2c86, 0x2c88, + 0x2c88, 0x2c8a, 0x2c8a, 0x2c8c, 0x2c8c, 0x2c8e, 0x2c8e, 0x2c90, + 0x2c90, 0x2c92, 0x2c92, 0x2c94, 0x2c94, 0x2c96, 0x2c96, 0x2c98, + 0x2c98, 0x2c9a, 0x2c9a, 0x2c9c, 0x2c9c, 0x2c9e, 0x2c9e, 0x2ca0, + 0x2ca0, 0x2ca2, 0x2ca2, 0x2ca4, 0x2ca4, 0x2ca6, 0x2ca6, 0x2ca8, + 0x2ca8, 0x2caa, 0x2caa, 0x2cac, 0x2cac, 0x2cae, 0x2cae, 0x2cb0, + 0x2cb0, 0x2cb2, 0x2cb2, 0x2cb4, 0x2cb4, 0x2cb6, 0x2cb6, 0x2cb8, + 0x2cb8, 0x2cba, 0x2cba, 0x2cbc, 0x2cbc, 0x2cbe, 0x2cbe, 0x2cc0, + 0x2cc0, 0x2cc2, 0x2cc2, 0x2cc4, 0x2cc4, 0x2cc6, 0x2cc6, 0x2cc8, + 0x2cc8, 0x2cca, 0x2cca, 0x2ccc, 0x2ccc, 0x2cce, 0x2cce, 0x2cd0, + 0x2cd0, 0x2cd2, 0x2cd2, 0x2cd4, 0x2cd4, 0x2cd6, 0x2cd6, 0x2cd8, + 0x2cd8, 0x2cda, 0x2cda, 0x2cdc, 0x2cdc, 0x2cde, 0x2cde, 0x2ce0, + 0x2ce0, 0x2ce2, 0x2ce2, 0x2ce4, 0x2ce5, 0x2ce6, 0x2ce7, 0x2ce8, + 0x2ce9, 0x2cea, 0x2ceb, 0x2cec, 0x2ced, 0x2cee, 0x2cef, 0x2cf0, + 0x2cf1, 0x2cf2, 0x2cf3, 0x2cf4, 0x2cf5, 0x2cf6, 0x2cf7, 0x2cf8, + 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfd, 0x2cfe, 0x2cff, 0x10a0, + 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, 0x10a8, + 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, 0x10b0, + 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, 0x10b8, + 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, 0x10c0, + 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0xffff, 0xd21b, 0xff21, + 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, + 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, + 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, + 0xff3a, 0xff5b, 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, + 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, + 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, 0xff70, 0xff71, + 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, + 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, 0xff80, 0xff81, + 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, + 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f, 0xff90, 0xff91, + 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, + 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1, + 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, + 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1, + 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, + 0xffba, 0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffbf, 0xffc0, 0xffc1, + 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffc8, 0xffc9, + 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd0, 0xffd1, + 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffd8, 0xffd9, + 0xffda, 0xffdb, 0xffdc, 0xffdd, 0xffde, 0xffdf, 0xffe0, 0xffe1, + 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe7, 0xffe8, 0xffe9, + 0xffea, 0xffeb, 0xffec, 0xffed, 0xffee, 0xffef, 0xfff0, 0xfff1, + 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, 0xfff9, + 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, +}; + +/* + * Allow full-width illegal characters : + * "MS windows 7" supports full-width-invalid-name-characters. + * So we should check half-width-invalid-name-characters(ASCII) only + * for compatibility. + * + * " * / : < > ? 
\ | + */ +static unsigned short bad_uni_chars[] = { + 0x0022, 0x002A, 0x002F, 0x003A, + 0x003C, 0x003E, 0x003F, 0x005C, 0x007C, + 0 +}; + +static int exfat_convert_char_to_ucs2(struct nls_table *nls, + const unsigned char *ch, int ch_len, unsigned short *ucs2, + int *lossy) +{ + int len; + + *ucs2 = 0x0; + + if (ch[0] < 0x80) { + *ucs2 = ch[0]; + return 1; + } + + len = nls->char2uni(ch, ch_len, ucs2); + if (len < 0) { + /* conversion failed */ + if (lossy != NULL) + *lossy |= NLS_NAME_LOSSY; + *ucs2 = '_'; + return 1; + } + return len; +} + +static int exfat_convert_ucs2_to_char(struct nls_table *nls, + unsigned short ucs2, unsigned char *ch, int *lossy) +{ + int len; + + ch[0] = 0x0; + + if (ucs2 < 0x0080) { + ch[0] = ucs2; + return 1; + } + + len = nls->uni2char(ucs2, ch, MAX_CHARSET_SIZE); + if (len < 0) { + /* conversion failed */ + if (lossy != NULL) + *lossy |= NLS_NAME_LOSSY; + ch[0] = '_'; + return 1; + } + return len; +} + +unsigned short exfat_toupper(struct super_block *sb, unsigned short a) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + return sbi->vol_utbl[a] ? sbi->vol_utbl[a] : a; +} + +static unsigned short *exfat_wstrchr(unsigned short *str, unsigned short wchar) +{ + while (*str) { + if (*(str++) == wchar) + return str; + } + return NULL; +} + +int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, + unsigned short *b, unsigned int len) +{ + int i; + + for (i = 0; i < len; i++, a++, b++) + if (exfat_toupper(sb, *a) != exfat_toupper(sb, *b)) + return 1; + return 0; +} + +static int exfat_utf16_to_utf8(struct super_block *sb, + struct exfat_uni_name *p_uniname, unsigned char *p_cstring, + int buflen) +{ + int len; + const unsigned short *uniname = p_uniname->name; + + /* always len >= 0 */ + len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, + p_cstring, buflen); + p_cstring[len] = '\0'; + return len; +} + +static int exfat_utf8_to_utf16(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *p_uniname, int *p_lossy) +{ + int i, unilen, lossy = NLS_NAME_NO_LOSSY; + unsigned short upname[MAX_NAME_LENGTH + 1]; + unsigned short *uniname = p_uniname->name; + + WARN_ON(!len); + + unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN, + (wchar_t *)uniname, MAX_NAME_LENGTH + 2); + if (unilen < 0) { + exfat_msg(sb, KERN_ERR, + "failed to %s (err : %d) nls len : %d", + __func__, unilen, len); + return unilen; + } + + if (unilen > MAX_NAME_LENGTH) { + exfat_msg(sb, KERN_ERR, + "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", + __func__, len, unilen, MAX_NAME_LENGTH); + return -ENAMETOOLONG; + } + + p_uniname->name_len = unilen & 0xFF; + + for (i = 0; i < unilen; i++) { + if (*uniname < 0x0020 || + exfat_wstrchr(bad_uni_chars, *uniname)) + lossy |= NLS_NAME_LOSSY; + + upname[i] = exfat_toupper(sb, *uniname); + uniname++; + } + + *uniname = '\0'; + p_uniname->name_len = unilen; + p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + CS_DEFAULT); + + if (p_lossy) + *p_lossy = lossy; + return unilen; +} + +#define PLANE_SIZE 0x00010000 +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +unsigned short exfat_high_surrogate(unicode_t u) +{ + return ((u - PLANE_SIZE) >> 10) + SURROGATE_PAIR; +} + +unsigned short exfat_low_surrogate(unicode_t u) +{ + return ((u - PLANE_SIZE) & SURROGATE_BITS) | SURROGATE_PAIR | + SURROGATE_LOW; +} + +static int __exfat_utf16_to_nls(struct super_block *sb, 
+ struct exfat_uni_name *p_uniname, unsigned char *p_cstring, + int buflen) +{ + int i, j, len, out_len = 0; + unsigned char buf[MAX_CHARSET_SIZE]; + const unsigned short *uniname = p_uniname->name; + struct nls_table *nls = EXFAT_SB(sb)->nls_io; + + i = 0; + while (i < MAX_NAME_LENGTH && out_len < (buflen - 1)) { + if (*uniname == '\0') + break; + if ((*uniname & SURROGATE_MASK) != SURROGATE_PAIR) { + len = exfat_convert_ucs2_to_char(nls, *uniname, buf, + NULL); + } else { + /* Process UTF-16 surrogate pair as one character */ + if (!(*uniname & SURROGATE_LOW) && + i+1 < MAX_NAME_LENGTH && + (*(uniname+1) & SURROGATE_MASK) == SURROGATE_PAIR && + (*(uniname+1) & SURROGATE_LOW)) { + uniname++; + i++; + } + + /* + * UTF-16 surrogate pair encodes code points above + * U+FFFF. Code points above U+FFFF are not supported + * by kernel NLS framework therefore use replacement + * character + */ + len = 1; + buf[0] = '_'; + } + + if (out_len + len >= buflen) + len = buflen - 1 - out_len; + out_len += len; + + if (len > 1) { + for (j = 0; j < len; j++) + *p_cstring++ = buf[j]; + } else { /* len == 1 */ + *p_cstring++ = *buf; + } + + uniname++; + i++; + } + + *p_cstring = '\0'; + return out_len; +} + +static int exfat_nls_to_ucs2(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *p_uniname, int *p_lossy) +{ + int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY; + unsigned short upname[MAX_NAME_LENGTH + 1]; + unsigned short *uniname = p_uniname->name; + struct nls_table *nls = EXFAT_SB(sb)->nls_io; + + WARN_ON(!len); + + while (unilen < MAX_NAME_LENGTH && i < len) { + i += exfat_convert_char_to_ucs2(nls, p_cstring + i, len - i, + uniname, &lossy); + + if (*uniname < 0x0020 || + exfat_wstrchr(bad_uni_chars, *uniname)) + lossy |= NLS_NAME_LOSSY; + + upname[unilen] = exfat_toupper(sb, *uniname); + uniname++; + unilen++; + } + + if (p_cstring[i] != '\0') + lossy |= NLS_NAME_OVERLEN; + + *uniname = '\0'; + p_uniname->name_len = unilen; + p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + CS_DEFAULT); + + if (p_lossy) + *p_lossy = lossy; + return unilen; +} + +int exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *uniname, + unsigned char *p_cstring, int buflen) +{ + if (EXFAT_SB(sb)->options.utf8) + return exfat_utf16_to_utf8(sb, uniname, p_cstring, + buflen); + return __exfat_utf16_to_nls(sb, uniname, p_cstring, buflen); +} + +int exfat_nls_to_utf16(struct super_block *sb, const unsigned char *p_cstring, + const int len, struct exfat_uni_name *uniname, int *p_lossy) +{ + if (EXFAT_SB(sb)->options.utf8) + return exfat_utf8_to_utf16(sb, p_cstring, len, + uniname, p_lossy); + return exfat_nls_to_ucs2(sb, p_cstring, len, uniname, p_lossy); +} + +static int exfat_load_upcase_table(struct super_block *sb, + sector_t sector, unsigned long long num_sectors, + unsigned int utbl_checksum) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int sect_size = sb->s_blocksize; + unsigned int i, index = 0, checksum = 0; + int ret; + unsigned char skip = false; + unsigned short *upcase_table; + + upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL); + if (!upcase_table) + return -ENOMEM; + + sbi->vol_utbl = upcase_table; + num_sectors += sector; + + while (sector < num_sectors) { + struct buffer_head *bh; + + bh = sb_bread(sb, sector); + if (!bh) { + exfat_msg(sb, KERN_ERR, + "failed to read sector(0x%llx)\n", + (unsigned long long)sector); + ret = -EIO; + goto free_table; + } + sector++; + for (i = 0; i < sect_size && 
index <= 0xFFFF; i += 2) { + unsigned short uni = get_unaligned_le16(bh->b_data + i); + + checksum = ((checksum & 1) ? 0x80000000 : 0) + + (checksum >> 1) + + *(((unsigned char *)bh->b_data) + i); + checksum = ((checksum & 1) ? 0x80000000 : 0) + + (checksum >> 1) + + *(((unsigned char *)bh->b_data) + (i + 1)); + + if (skip) { + index += uni; + skip = false; + } else if (uni == index) { + index++; + } else if (uni == 0xFFFF) { + skip = true; + } else { /* uni != index , uni != 0xFFFF */ + upcase_table[index] = uni; + index++; + } + } + brelse(bh); + } + + if (index >= 0xFFFF && utbl_checksum == checksum) + return 0; + + exfat_msg(sb, KERN_ERR, + "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)\n", + index, checksum, utbl_checksum); + ret = -EINVAL; +free_table: + exfat_free_upcase_table(sbi); + return ret; +} + +static int exfat_load_default_upcase_table(struct super_block *sb) +{ + int i, ret = -EIO; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned char skip = false; + unsigned short uni = 0, *upcase_table; + unsigned int index = 0; + + upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL); + if (!upcase_table) + return -ENOMEM; + + sbi->vol_utbl = upcase_table; + + for (i = 0; index <= 0xFFFF && i < EXFAT_NUM_UPCASE; i++) { + uni = uni_def_upcase[i]; + if (skip) { + index += uni; + skip = false; + } else if (uni == index) { + index++; + } else if (uni == 0xFFFF) { + skip = true; + } else { + upcase_table[index] = uni; + index++; + } + } + + if (index >= 0xFFFF) + return 0; + + /* FATAL error: default upcase table has error */ + exfat_free_upcase_table(sbi); + return ret; +} + +int exfat_create_upcase_table(struct super_block *sb) +{ + int i, ret; + unsigned int tbl_clu, type; + sector_t sector; + unsigned long long tbl_size, num_sectors; + unsigned char blksize_bits = sb->s_blocksize_bits; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + clu.dir = sbi->root_dir; + clu.flags = ALLOC_FAT_CHAIN; + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) { + brelse(bh); + break; + } + + if (type != TYPE_UPCASE) { + brelse(bh); + continue; + } + + tbl_clu = le32_to_cpu(ep->dentry.upcase.start_clu); + tbl_size = le64_to_cpu(ep->dentry.upcase.size); + + sector = exfat_cluster_to_sector(sbi, tbl_clu); + num_sectors = ((tbl_size - 1) >> blksize_bits) + 1; + ret = exfat_load_upcase_table(sb, sector, num_sectors, + le32_to_cpu(ep->dentry.upcase.checksum)); + + brelse(bh); + if (ret && ret != -EIO) + goto load_default; + + /* load successfully */ + return ret; + } + + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + +load_default: + /* load default upcase table */ + return exfat_load_default_upcase_table(sb); +} + +void exfat_free_upcase_table(struct exfat_sb_info *sbi) +{ + kfree(sbi->vol_utbl); +} diff --git a/fs/exfat/super.c b/fs/exfat/super.c new file mode 100644 index 000000000000..16ed202ef527 --- /dev/null +++ b/fs/exfat/super.c @@ -0,0 +1,722 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/mount.h> +#include <linux/cred.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include <linux/fs_struct.h> +#include <linux/iversion.h> +#include <linux/nls.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static char exfat_default_iocharset[] = CONFIG_EXFAT_DEFAULT_IOCHARSET; +static struct kmem_cache *exfat_inode_cachep; + +static void exfat_free_iocharset(struct exfat_sb_info *sbi) +{ + if (sbi->options.iocharset != exfat_default_iocharset) + kfree(sbi->options.iocharset); +} + +static void exfat_delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_iocharset(sbi); + exfat_free_upcase_table(sbi); + kfree(sbi); +} + +static void exfat_put_super(struct super_block *sb) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + mutex_lock(&sbi->s_lock); + if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) + sync_blockdev(sb->s_bdev); + exfat_set_vol_flags(sb, VOL_CLEAN); + exfat_free_bitmap(sbi); + mutex_unlock(&sbi->s_lock); + + call_rcu(&sbi->rcu, exfat_delayed_free); +} + +static int exfat_sync_fs(struct super_block *sb, int wait) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int err = 0; + + /* If there are some dirty buffers in the bdev inode */ + mutex_lock(&sbi->s_lock); + if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) { + sync_blockdev(sb->s_bdev); + if (exfat_set_vol_flags(sb, VOL_CLEAN)) + err = -EIO; + } + mutex_unlock(&sbi->s_lock); + return err; +} + +static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev); + + if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) { + mutex_lock(&sbi->s_lock); + if (exfat_count_used_clusters(sb, &sbi->used_clusters)) { + mutex_unlock(&sbi->s_lock); + return -EIO; + } + mutex_unlock(&sbi->s_lock); + } + + buf->f_type = sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ + buf->f_bfree = buf->f_blocks - sbi->used_clusters; + buf->f_bavail = buf->f_bfree; + buf->f_fsid.val[0] = (unsigned int)id; + buf->f_fsid.val[1] = (unsigned int)(id >> 32); + /* Unicode utf16 255 characters */ + buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE; + return 0; +} + +int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct pbr64 *bpb; + bool sync = 0; + + /* flags are not changed */ + if (sbi->vol_flag == new_flag) + return 0; + + sbi->vol_flag = new_flag; + + /* skip updating volume dirty flag, + * if this volume has been mounted with read-only + */ + if (sb_rdonly(sb)) + return 0; + + if (!sbi->pbr_bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { + exfat_msg(sb, KERN_ERR, "failed to read boot sector"); + return -ENOMEM; + } + } + + bpb = (struct pbr64 *)sbi->pbr_bh->b_data; + bpb->bsx.vol_flags = cpu_to_le16(new_flag); + + if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh)) + sync = true; + else + sync = false; + + set_buffer_uptodate(sbi->pbr_bh); + mark_buffer_dirty(sbi->pbr_bh); + + if (sync) + sync_dirty_buffer(sbi->pbr_bh); + return 0; +} + +static int exfat_show_options(struct seq_file *m, struct dentry *root) 
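/*
 * A note on exfat_set_vol_flags() above: raising VOL_DIRTY forces a
 * synchronous write of the boot sector (sync_dirty_buffer()) when the
 * buffer is not already dirty, presumably so the dirty mark reaches the
 * media before the metadata update it protects, whereas VOL_CLEAN is
 * only marked dirty and left to normal writeback.  exfat_put_super()
 * and exfat_sync_fs() clear the flag after sync_blockdev(), and
 * __exfat_fill_super() below warns at mount time when a previous mount
 * left VOL_DIRTY set.
 */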
+{ + struct super_block *sb = root->d_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *opts = &sbi->options; + + /* Show partition info */ + if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->fs_uid)); + if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->fs_gid)); + seq_printf(m, ",fmask=%04o,dmask=%04o", opts->fs_fmask, opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); + if (opts->utf8) + seq_puts(m, ",iocharset=utf8"); + else if (sbi->nls_io) + seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + seq_printf(m, ",bps=%ld", sb->s_blocksize); + if (opts->errors == EXFAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == EXFAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); + if (opts->time_offset) + seq_printf(m, ",time_offset=%d", opts->time_offset); + return 0; +} + +static struct inode *exfat_alloc_inode(struct super_block *sb) +{ + struct exfat_inode_info *ei; + + ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + init_rwsem(&ei->truncate_lock); + return &ei->vfs_inode; +} + +static void exfat_free_inode(struct inode *inode) +{ + kmem_cache_free(exfat_inode_cachep, EXFAT_I(inode)); +} + +static const struct super_operations exfat_sops = { + .alloc_inode = exfat_alloc_inode, + .free_inode = exfat_free_inode, + .write_inode = exfat_write_inode, + .evict_inode = exfat_evict_inode, + .put_super = exfat_put_super, + .sync_fs = exfat_sync_fs, + .statfs = exfat_statfs, + .show_options = exfat_show_options, +}; + +enum { + Opt_uid, + Opt_gid, + Opt_umask, + Opt_dmask, + Opt_fmask, + Opt_allow_utime, + Opt_charset, + Opt_errors, + Opt_discard, + Opt_time_offset, +}; + +static const struct constant_table exfat_param_enums[] = { + { "continue", EXFAT_ERRORS_CONT }, + { "panic", EXFAT_ERRORS_PANIC }, + { "remount-ro", EXFAT_ERRORS_RO }, + {} +}; + +static const struct fs_parameter_spec exfat_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_u32oct("allow_utime", Opt_allow_utime), + fsparam_string("iocharset", Opt_charset), + fsparam_enum("errors", Opt_errors, exfat_param_enums), + fsparam_flag("discard", Opt_discard), + fsparam_s32("time_offset", Opt_time_offset), + {} +}; + +static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct exfat_sb_info *sbi = fc->s_fs_info; + struct exfat_mount_options *opts = &sbi->options; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, exfat_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + break; + case Opt_umask: + opts->fs_fmask = result.uint_32; + opts->fs_dmask = result.uint_32; + break; + case Opt_dmask: + opts->fs_dmask = result.uint_32; + break; + case Opt_fmask: + opts->fs_fmask = result.uint_32; + break; + case Opt_allow_utime: + opts->allow_utime = result.uint_32 & 0022; + break; + case Opt_charset: + exfat_free_iocharset(sbi); + opts->iocharset = kstrdup(param->string, GFP_KERNEL); + if (!opts->iocharset) + return 
-ENOMEM; + break; + case Opt_errors: + opts->errors = result.uint_32; + break; + case Opt_discard: + opts->discard = 1; + break; + case Opt_time_offset: + /* + * Make the limit 24 just in case someone invents something + * unusual. + */ + if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60) + return -EINVAL; + opts->time_offset = result.int_32; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void exfat_hash_init(struct super_block *sb) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int i; + + spin_lock_init(&sbi->inode_hash_lock); + for (i = 0; i < EXFAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); +} + +static int exfat_read_root(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_chain cdir; + int num_subdirs, num_clu = 0; + + exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + ei->entry = -1; + ei->start_clu = sbi->root_dir; + ei->flags = ALLOC_FAT_CHAIN; + ei->type = TYPE_DIR; + ei->version = 0; + ei->rwoffset = 0; + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = sbi->root_dir; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + + exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + if (exfat_count_num_clusters(sb, &cdir, &num_clu)) + return -EIO; + i_size_write(inode, num_clu << sbi->cluster_size_bits); + + num_subdirs = exfat_count_dir_entries(sb, &cdir); + if (num_subdirs < 0) + return -EIO; + set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR); + + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode_inc_iversion(inode); + inode->i_generation = 0; + inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777); + inode->i_op = &exfat_dir_inode_operations; + inode->i_fop = &exfat_dir_operations; + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) + & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; + EXFAT_I(inode)->i_size_aligned = i_size_read(inode); + EXFAT_I(inode)->i_size_ondisk = i_size_read(inode); + + exfat_save_attr(inode, ATTR_SUBDIR); + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + current_time(inode); + exfat_cache_init_inode(inode); + return 0; +} + +static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, + struct buffer_head **prev_bh) +{ + struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data; + unsigned short logical_sect = 0; + + logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits; + + if (!is_power_of_2(logical_sect) || + logical_sect < 512 || logical_sect > 4096) { + exfat_msg(sb, KERN_ERR, "bogus logical sector size %u", + logical_sect); + return NULL; + } + + if (logical_sect < sb->s_blocksize) { + exfat_msg(sb, KERN_ERR, + "logical sector size too small for device (logical sector size = %u)", + logical_sect); + return NULL; + } + + if (logical_sect > sb->s_blocksize) { + struct buffer_head *bh = NULL; + + __brelse(*prev_bh); + *prev_bh = NULL; + + if (!sb_set_blocksize(sb, logical_sect)) { + exfat_msg(sb, KERN_ERR, + "unable to set blocksize %u", logical_sect); + return NULL; + } + bh = sb_bread(sb, 0); + if (!bh) { + exfat_msg(sb, KERN_ERR, + "unable to read boot sector (logical sector size = %lu)", + sb->s_blocksize); + return NULL; + } + + *prev_bh = bh; + p_pbr = (struct pbr *) bh->b_data; + } + return p_pbr; +} + +/* mount the file system volume */ +static int __exfat_fill_super(struct super_block *sb) 
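/*
 * The fs_parameter_spec table above (exfat_parameters) defines the
 * option grammar consumed by exfat_parse_param(); each entry maps one
 * mount option token to an Opt_* case.  A hypothetical invocation that
 * exercises most of them (device and mount point are placeholders, and
 * time_offset=540 stands for UTC+9 expressed in minutes):
 *
 *	mount -t exfat -o uid=1000,gid=1000,fmask=0133,dmask=0022,iocharset=utf8,errors=remount-ro,discard,time_offset=540 /dev/sdX1 /mnt/exfat
 *
 * "umask=" sets fmask and dmask together, and "iocharset=utf8" skips
 * the NLS tables entirely (see exfat_fill_super() below).
 */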
+{ + int ret; + struct pbr *p_pbr; + struct pbr64 *p_bpb; + struct buffer_head *bh; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + /* set block size to read super block */ + sb_min_blocksize(sb, 512); + + /* read boot sector */ + bh = sb_bread(sb, 0); + if (!bh) { + exfat_msg(sb, KERN_ERR, "unable to read boot sector"); + return -EIO; + } + + /* PRB is read */ + p_pbr = (struct pbr *)bh->b_data; + + /* check the validity of PBR */ + if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { + exfat_msg(sb, KERN_ERR, "invalid boot record signature"); + ret = -EINVAL; + goto free_bh; + } + + + /* check logical sector size */ + p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh); + if (!p_pbr) { + ret = -EIO; + goto free_bh; + } + + /* + * res_zero field must be filled with zero to prevent mounting + * from FAT volume. + */ + if (memchr_inv(p_pbr->bpb.f64.res_zero, 0, + sizeof(p_pbr->bpb.f64.res_zero))) { + ret = -EINVAL; + goto free_bh; + } + + p_bpb = (struct pbr64 *)p_pbr; + if (!p_bpb->bsx.num_fats) { + exfat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + ret = -EINVAL; + goto free_bh; + } + + sbi->sect_per_clus = 1 << p_bpb->bsx.sect_per_clus_bits; + sbi->sect_per_clus_bits = p_bpb->bsx.sect_per_clus_bits; + sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits; + sbi->cluster_size = 1 << sbi->cluster_size_bits; + sbi->num_FAT_sectors = le32_to_cpu(p_bpb->bsx.fat_length); + sbi->FAT1_start_sector = le32_to_cpu(p_bpb->bsx.fat_offset); + sbi->FAT2_start_sector = p_bpb->bsx.num_fats == 1 ? + sbi->FAT1_start_sector : + sbi->FAT1_start_sector + sbi->num_FAT_sectors; + sbi->data_start_sector = le32_to_cpu(p_bpb->bsx.clu_offset); + sbi->num_sectors = le64_to_cpu(p_bpb->bsx.vol_length); + /* because the cluster index starts with 2 */ + sbi->num_clusters = le32_to_cpu(p_bpb->bsx.clu_count) + + EXFAT_RESERVED_CLUSTERS; + + sbi->root_dir = le32_to_cpu(p_bpb->bsx.root_cluster); + sbi->dentries_per_clu = 1 << + (sbi->cluster_size_bits - DENTRY_SIZE_BITS); + + sbi->vol_flag = le16_to_cpu(p_bpb->bsx.vol_flags); + sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; + sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; + + if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) { + sbi->vol_flag |= VOL_DIRTY; + exfat_msg(sb, KERN_WARNING, + "Volume was not properly unmounted. Some data may be corrupt. 
Please run fsck."); + } + + /* exFAT file size is limited by a disk volume size */ + sb->s_maxbytes = (u64)(sbi->num_clusters - EXFAT_RESERVED_CLUSTERS) << + sbi->cluster_size_bits; + + ret = exfat_create_upcase_table(sb); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to load upcase table"); + goto free_bh; + } + + ret = exfat_load_bitmap(sb); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to load alloc-bitmap"); + goto free_upcase_table; + } + + ret = exfat_count_used_clusters(sb, &sbi->used_clusters); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to scan clusters"); + goto free_alloc_bitmap; + } + + return 0; + +free_alloc_bitmap: + exfat_free_bitmap(sbi); +free_upcase_table: + exfat_free_upcase_table(sbi); +free_bh: + brelse(bh); + return ret; +} + +static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct exfat_sb_info *sbi = sb->s_fs_info; + struct exfat_mount_options *opts = &sbi->options; + struct inode *root_inode; + int err; + + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & 0022; + + if (opts->discard) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) + exfat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); + opts->discard = 0; + } + + sb->s_flags |= SB_NODIRATIME; + sb->s_magic = EXFAT_SUPER_MAGIC; + sb->s_op = &exfat_sops; + + sb->s_time_gran = 1; + sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; + sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; + + err = __exfat_fill_super(sb); + if (err) { + exfat_msg(sb, KERN_ERR, "failed to recognize exfat type"); + goto check_nls_io; + } + + /* set up enough so that it can read an inode */ + exfat_hash_init(sb); + + if (!strcmp(sbi->options.iocharset, "utf8")) + opts->utf8 = 1; + else { + sbi->nls_io = load_nls(sbi->options.iocharset); + if (!sbi->nls_io) { + exfat_msg(sb, KERN_ERR, "IO charset %s not found", + sbi->options.iocharset); + err = -EINVAL; + goto free_table; + } + } + + if (sbi->options.utf8) + sb->s_d_op = &exfat_utf8_dentry_ops; + else + sb->s_d_op = &exfat_dentry_ops; + + root_inode = new_inode(sb); + if (!root_inode) { + exfat_msg(sb, KERN_ERR, "failed to allocate root inode."); + err = -ENOMEM; + goto free_table; + } + + root_inode->i_ino = EXFAT_ROOT_INO; + inode_set_iversion(root_inode, 1); + err = exfat_read_root(root_inode); + if (err) { + exfat_msg(sb, KERN_ERR, "failed to initialize root inode."); + goto put_inode; + } + + exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos); + insert_inode_hash(root_inode); + + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + exfat_msg(sb, KERN_ERR, "failed to get the root dentry"); + err = -ENOMEM; + goto put_inode; + } + + return 0; + +put_inode: + iput(root_inode); + sb->s_root = NULL; + +free_table: + exfat_free_upcase_table(sbi); + exfat_free_bitmap(sbi); + +check_nls_io: + unload_nls(sbi->nls_io); + exfat_free_iocharset(sbi); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int exfat_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, exfat_fill_super); +} + +static void exfat_free(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations exfat_context_ops = { + .parse_param = exfat_parse_param, + .get_tree = exfat_get_tree, + .free = exfat_free, +}; + +static int exfat_init_fs_context(struct fs_context *fc) +{ + struct exfat_sb_info *sbi; + + sbi = kzalloc(sizeof(struct exfat_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + 
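	/*
	 * Call order for the new mount API, for orientation: the VFS
	 * invokes exfat_init_fs_context() (which fills in the defaults
	 * below), then exfat_parse_param() once per mount option, then
	 * exfat_get_tree() -> get_tree_bdev() -> exfat_fill_super().
	 * On teardown, exfat_free() drops the context's sb_info if no
	 * superblock consumed it, while kill_block_super() plus
	 * exfat_put_super() unwind a mounted volume.
	 */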
mutex_init(&sbi->s_lock); + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + sbi->options.fs_uid = current_uid(); + sbi->options.fs_gid = current_gid(); + sbi->options.fs_fmask = current->fs->umask; + sbi->options.fs_dmask = current->fs->umask; + sbi->options.allow_utime = -1; + sbi->options.iocharset = exfat_default_iocharset; + sbi->options.errors = EXFAT_ERRORS_RO; + + fc->s_fs_info = sbi; + fc->ops = &exfat_context_ops; + return 0; +} + +static struct file_system_type exfat_fs_type = { + .owner = THIS_MODULE, + .name = "exfat", + .init_fs_context = exfat_init_fs_context, + .parameters = exfat_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void exfat_inode_init_once(void *foo) +{ + struct exfat_inode_info *ei = (struct exfat_inode_info *)foo; + + INIT_HLIST_NODE(&ei->i_hash_fat); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_exfat_fs(void) +{ + int err; + + err = exfat_cache_init(); + if (err) + return err; + + exfat_inode_cachep = kmem_cache_create("exfat_inode_cache", + sizeof(struct exfat_inode_info), + 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exfat_inode_init_once); + if (!exfat_inode_cachep) { + err = -ENOMEM; + goto shutdown_cache; + } + + err = register_filesystem(&exfat_fs_type); + if (err) + goto destroy_cache; + + return 0; + +destroy_cache: + kmem_cache_destroy(exfat_inode_cachep); +shutdown_cache: + exfat_cache_shutdown(); + return err; +} + +static void __exit exit_exfat_fs(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(exfat_inode_cachep); + unregister_filesystem(&exfat_fs_type); + exfat_cache_shutdown(); +} + +module_init(init_exfat_fs); +module_exit(exit_exfat_fs); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("exFAT filesystem support"); +MODULE_AUTHOR("Samsung Electronics Co., Ltd."); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 0456bc990b5e..943cc469f42f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,6 +56,7 @@ #include <linux/buffer_head.h> #include <linux/init.h> +#include <linux/printk.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> @@ -84,8 +85,8 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, f...) no_printk(f) +# define ea_bdebug(bh, f...) no_printk(f) #endif static int ext2_xattr_set2(struct inode *, struct buffer_head *, @@ -790,7 +791,15 @@ ext2_xattr_delete_inode(struct inode *inode) struct buffer_head *bh = NULL; struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); - down_write(&EXT2_I(inode)->xattr_sem); + /* + * We are the only ones holding inode reference. The xattr_sem should + * better be unlocked! We could as well just not acquire xattr_sem at + * all but this makes the code more futureproof. OTOH we need trylock + * here to avoid false-positive warning from lockdep about reclaim + * circular dependency. 
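 * (If the trylock ever failed we would return early here and leave the
 * inode's external xattr block unreleased, which is exactly the
 * situation the WARN_ON_ONCE is meant to flag.)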
+ */ + if (WARN_ON_ONCE(!down_write_trylock(&EXT2_I(inode)->xattr_sem))) + return; if (!EXT2_I(inode)->i_file_acl) goto cleanup; @@ -864,8 +873,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) true); if (error) { if (error == -EBUSY) { - ea_bdebug(bh, "already in cache (%d cache entries)", - atomic_read(&ext2_xattr_cache->c_entry_count)); + ea_bdebug(bh, "already in cache"); error = 0; } } else diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index cee888cdc235..16272e6ddcf4 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -39,7 +39,7 @@ struct ext2_xattr_entry { __le32 e_value_block; /* disk block attribute is stored on (n/i) */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define EXT2_XATTR_PAD_BITS 2 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8fd0b3cdab4c..0e0a4d6209c7 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -516,10 +516,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { - ext4_set_errno(sb, EIO); - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, (unsigned long long) bh->b_blocknr); + ext4_error_err(sb, EIO, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 0a734ffb4310..16e9b2fda03a 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -166,10 +166,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; - } if (system_blks == NULL) return 1; @@ -181,10 +179,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + else return 0; - } } return 1; } @@ -220,10 +216,12 @@ static int ext4_protect_reserved_inode(struct super_block *sb, } else { if (!ext4_data_block_valid_rcu(sbi, system_blks, map.m_pblk, n)) { - ext4_error(sb, "blocks %llu-%llu from inode %u " - "overlap system zone", map.m_pblk, - map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; + __ext4_error(sb, __func__, __LINE__, -err, + map.m_pblk, "blocks %llu-%llu " + "from inode %u overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1, ino); break; } err = add_system_zone(system_blks, map.m_pblk, n); @@ -365,7 +363,6 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; __le32 *bref = p; unsigned int blk; @@ -379,7 +376,6 @@ int ext4_check_blockref(const char *function, unsigned int line, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, 
function, line, blk, "invalid block"); return -EFSCORRUPTED; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9aa1f75409b0..c654205f648d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -392,7 +392,7 @@ struct fname { __u32 inode; __u8 name_len; __u8 file_type; - char name[0]; + char name[]; }; /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61b37a052052..91eb4381cae5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -414,7 +414,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ -#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ @@ -487,7 +487,7 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ - EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ +/* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ @@ -533,7 +533,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); - CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(RESERVED); @@ -2771,21 +2770,20 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); -extern void ext4_set_errno(struct super_block *sb, int err); -extern __printf(4, 5) -void __ext4_error(struct super_block *, const char *, unsigned int, +extern __printf(6, 7) +void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64, const char *, ...); -extern __printf(5, 6) -void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, - const char *, ...); +extern __printf(6, 7) +void __ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern __printf(4, 5) -void __ext4_abort(struct super_block *, const char *, unsigned int, +extern __printf(5, 6) +void __ext4_abort(struct super_block *, const char *, unsigned int, int, const char *, ...); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, @@ -2806,8 +2804,12 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) +#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) + +#define ext4_error_inode_block(inode, block, err, fmt, a...) 
\ + __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ + (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) @@ -2815,13 +2817,18 @@ void __ext4_grp_locked_error(const char *, unsigned int, #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ - __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) + __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ + __ext4_error_inode((inode), (func), (line), (block), \ + (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ - __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define ext4_abort(sb, fmt, ...) \ - __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) + __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__) +#define ext4_error_err(sb, err, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__) +#define ext4_abort(sb, err, fmt, ...) \ + __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ @@ -2839,7 +2846,12 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error_inode(inode, "", 0, block, " "); \ + __ext4_error_inode(inode, "", 0, block, 0, " "); \ +} while (0) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ @@ -2849,12 +2861,17 @@ do { \ #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error(sb, "", 0, " "); \ + __ext4_error(sb, "", 0, 0, 0, " "); \ +} while (0) +#define ext4_error_err(sb, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, err, 0, " "); \ } while (0) -#define ext4_abort(sb, fmt, ...) \ +#define ext4_abort(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_abort(sb, "", 0, " "); \ + __ext4_abort(sb, "", 0, err, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 1f53d64e42a5..7f16e1af8d5c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -80,8 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. 
*/ if (journal && is_journal_aborted(journal)) { - ext4_set_errno(sb, -journal->j_errno); - ext4_abort(sb, "Detected aborted journal"); + ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; @@ -272,8 +271,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); - ext4_set_errno(inode->i_sb, -err); - __ext4_abort(inode->i_sb, where, line, + __ext4_abort(inode->i_sb, where, line, -err, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); @@ -332,6 +330,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, err); } } else { + set_buffer_uptodate(bh); if (inode) mark_buffer_dirty_inode(bh, inode); else @@ -342,11 +341,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, struct ext4_super_block *es; es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_block = - cpu_to_le64(bh->b_blocknr); - ext4_set_errno(inode->i_sb, EIO); - ext4_error_inode(inode, where, line, - bh->b_blocknr, + ext4_error_inode_err(inode, where, line, + bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 7ea4f6fa173b..4b9002f0e84c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -512,6 +512,9 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 0; if (ext4_should_journal_data(inode)) return 0; + /* temporary fix to prevent generic/422 test failures */ + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; return 1; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 954013d6076b..031752cfb6f7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -28,6 +28,7 @@ #include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/backing-dev.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -83,13 +84,6 @@ static void ext4_extent_block_csum_set(struct inode *inode, et->et_checksum = ext4_extent_block_csum(inode, eh); } -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_map_blocks *map, - int split_flag, - int flags); - static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, @@ -97,9 +91,6 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); -static int ext4_find_delayed_extent(struct inode *inode, - struct extent_status *newes); - static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { /* @@ -358,8 +349,8 @@ static int ext4_valid_extent_idx(struct inode *inode, } static int ext4_valid_extent_entries(struct inode *inode, - struct ext4_extent_header *eh, - int depth) + struct ext4_extent_header *eh, + ext4_fsblk_t *pblk, int depth) { unsigned short entries; if (eh->eh_entries == 0) @@ -370,8 +361,6 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - ext4_fsblk_t pblock = 0; ext4_lblk_t lblock = 0; ext4_lblk_t prev = 0; int len = 0; @@ -383,8 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode, lblock = le32_to_cpu(ext->ee_block); len = ext4_ext_get_actual_len(ext); if ((lblock <= prev) && prev) { - pblock = ext4_ext_pblock(ext); - es->s_last_error_block = cpu_to_le64(pblock); + *pblk = ext4_ext_pblock(ext); return 0; } ext++; @@ -431,7 +419,7 @@ 
static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, depth)) { + if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -449,14 +437,14 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: - ext4_set_errno(inode->i_sb, -err); - ext4_error_inode(inode, function, line, 0, - "pblk %llu bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - (unsigned long long) pblk, error_msg, - le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); + ext4_error_inode_err(inode, function, line, 0, -err, + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), + le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return err; } @@ -556,6 +544,12 @@ int ext4_ext_precache(struct inode *inode) down_read(&ei->i_data_sem); depth = ext_depth(inode); + /* Don't cache anything if there are no external extent blocks */ + if (!depth) { + up_read(&ei->i_data_sem); + return ret; + } + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { @@ -563,9 +557,6 @@ int ext4_ext_precache(struct inode *inode) return -ENOMEM; } - /* Don't cache anything if there are no external extent blocks */ - if (depth == 0) - goto out; path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) @@ -2134,155 +2125,6 @@ cleanup: return err; } -static int ext4_fill_fiemap_extents(struct inode *inode, - ext4_lblk_t block, ext4_lblk_t num, - struct fiemap_extent_info *fieinfo) -{ - struct ext4_ext_path *path = NULL; - struct ext4_extent *ex; - struct extent_status es; - ext4_lblk_t next, next_del, start = 0, end = 0; - ext4_lblk_t last = block + num; - int exists, depth = 0, err = 0; - unsigned int flags = 0; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; - - while (block < last && block != EXT_MAX_BLOCKS) { - num = last - block; - /* find extent for this block */ - down_read(&EXT4_I(inode)->i_data_sem); - - path = ext4_find_extent(inode, block, &path, 0); - if (IS_ERR(path)) { - up_read(&EXT4_I(inode)->i_data_sem); - err = PTR_ERR(path); - path = NULL; - break; - } - - depth = ext_depth(inode); - if (unlikely(path[depth].p_hdr == NULL)) { - up_read(&EXT4_I(inode)->i_data_sem); - EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EFSCORRUPTED; - break; - } - ex = path[depth].p_ext; - next = ext4_ext_next_allocated_block(path); - - flags = 0; - exists = 0; - if (!ex) { - /* there is no extent yet, so try to allocate - * all requested space */ - start = block; - end = block + num; - } else if (le32_to_cpu(ex->ee_block) > block) { - /* need to allocate space before found extent */ - start = block; - end = le32_to_cpu(ex->ee_block); - if (block + num < end) - end = block + num; - } else if (block >= le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex)) { - /* need to allocate space after found extent */ - start = block; - end = block + num; - if (end >= next) - end = next; - } else if (block >= le32_to_cpu(ex->ee_block)) { - /* - * some part of requested space is covered - * by found extent - */ - start = block; - end = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); - if (block + num 
< end) - end = block + num; - exists = 1; - } else { - BUG(); - } - BUG_ON(end <= start); - - if (!exists) { - es.es_lblk = start; - es.es_len = end - start; - es.es_pblk = 0; - } else { - es.es_lblk = le32_to_cpu(ex->ee_block); - es.es_len = ext4_ext_get_actual_len(ex); - es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_unwritten(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - } - - /* - * Find delayed extent and update es accordingly. We call - * it even in !exists case to find out whether es is the - * last existing extent or not. - */ - next_del = ext4_find_delayed_extent(inode, &es); - if (!exists && next_del) { - exists = 1; - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } - up_read(&EXT4_I(inode)->i_data_sem); - - if (unlikely(es.es_len == 0)) { - EXT4_ERROR_INODE(inode, "es.es_len == 0"); - err = -EFSCORRUPTED; - break; - } - - /* - * This is possible iff next == next_del == EXT_MAX_BLOCKS. - * we need to check next == EXT_MAX_BLOCKS because it is - * possible that an extent is with unwritten and delayed - * status due to when an extent is delayed allocated and - * is allocated by fallocate status tree will track both of - * them in a extent. - * - * So we could return a unwritten and delayed extent, and - * its block is equal to 'next'. - */ - if (next == next_del && next == EXT_MAX_BLOCKS) { - flags |= FIEMAP_EXTENT_LAST; - if (unlikely(next_del != EXT_MAX_BLOCKS || - next != EXT_MAX_BLOCKS)) { - EXT4_ERROR_INODE(inode, - "next extent == %u, next " - "delalloc extent = %u", - next, next_del); - err = -EFSCORRUPTED; - break; - } - } - - if (exists) { - err = fiemap_fill_next_extent(fieinfo, - (__u64)es.es_lblk << blksize_bits, - (__u64)es.es_pblk << blksize_bits, - (__u64)es.es_len << blksize_bits, - flags); - if (err < 0) - break; - if (err == 1) { - err = 0; - break; - } - } - - block = es.es_lblk + es.es_len; - } - - ext4_ext_drop_refs(path); - kfree(path); - return err; -} - static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) @@ -3874,64 +3716,11 @@ out: return err; } -/* - * Handle EOFBLOCKS_FL flag, clearing it if necessary - */ -static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - ext4_lblk_t lblk, - struct ext4_ext_path *path, - unsigned int len) -{ - int i, depth; - struct ext4_extent_header *eh; - struct ext4_extent *last_ex; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - return 0; - - depth = ext_depth(inode); - eh = path[depth].p_hdr; - - /* - * We're going to remove EOFBLOCKS_FL entirely in future so we - * do not care for this case anymore. Simply remove the flag - * if there are no extents. - */ - if (unlikely(!eh->eh_entries)) - goto out; - last_ex = EXT_LAST_EXTENT(eh); - /* - * We should clear the EOFBLOCKS_FL flag if we are writing the - * last block in the last extent in the file. We test this by - * first checking to see if the caller to - * ext4_ext_get_blocks() was interested in the last block (or - * a block beyond the last block) in the current extent. If - * this turns out to be false, we can bail out from this - * function immediately. - */ - if (lblk + len < le32_to_cpu(last_ex->ee_block) + - ext4_ext_get_actual_len(last_ex)) - return 0; - /* - * If the caller does appear to be planning to write at or - * beyond the end of the current extent, we then test to see - * if the current extent is the last extent in the file, by - * checking to make sure it was reached via the rightmost node - * at each level of the tree. 
- */ - for (i = depth-1; i >= 0; i--) - if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) - return 0; -out: - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - return ext4_mark_inode_dirty(handle, inode); -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, - unsigned int allocated) + unsigned int *allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -3991,14 +3780,12 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); - if (err) - return err; + map->m_flags |= EXT4_MAP_UNWRITTEN; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - return allocated; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; + return 0; } static int @@ -4007,7 +3794,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { +#ifdef EXT_DEBUG struct ext4_ext_path *path = *ppath; +#endif int ret = 0; int err = 0; @@ -4047,11 +3836,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else + else err = ret; map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; @@ -4100,12 +3887,6 @@ out: map_out: map->m_flags |= EXT4_MAP_MAPPED; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -4244,12 +4025,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; + int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; - bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4308,12 +4088,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - allocated = convert_initialized_extent( - handle, inode, map, &path, - allocated); + err = convert_initialized_extent(handle, + inode, map, &path, &allocated); goto out2; - } else if (!ext4_ext_is_unwritten(ex)) + } else if (!ext4_ext_is_unwritten(ex)) { goto out; + } ret = ext4_ext_handle_unwritten_extents( handle, inode, map, &path, flags, @@ -4364,7 +4144,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map_from_cluster = true; goto got_allocated_blocks; } @@ -4385,7 +4164,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map_from_cluster = true; goto got_allocated_blocks; } @@ -4442,7 +4220,6 @@ int 
ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); - free_on_err = 1; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; if (ar.len > allocated) @@ -4453,28 +4230,28 @@ got_allocated_blocks: ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ - if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } - err = 0; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, ar.len); - if (!err) - err = ext4_ext_insert_extent(handle, inode, &path, - &newex, flags); - - if (err && free_on_err) { - int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? - EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; - /* free data blocks we just allocated */ - /* not a good idea to call discard here directly, - * but otherwise we'd need to call it every free() */ - ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, newblock, - EXT4_C2B(sbi, allocated_clusters), fb_flags); + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); + if (err) { + if (allocated_clusters) { + int fb_flags = 0; + + /* + * free data blocks we just allocated. + * not a good idea to call discard here directly, + * but otherwise we'd need to call it every free(). + */ + ext4_discard_preallocations(inode); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), + fb_flags); + } goto out2; } @@ -4491,7 +4268,7 @@ got_allocated_blocks: * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. */ - if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* * When allocating delayed allocated clusters, simply @@ -4645,10 +4422,6 @@ retry: epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode->i_mtime = inode->i_ctime; - } else { - if (epos > inode->i_size) - ext4_set_inode_flag(inode, - EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4802,16 +4575,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } inode->i_mtime = inode->i_ctime = current_time(inode); - if (new_size) { + if (new_size) ext4_update_inode_size(inode, new_size); - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (offset + len > inode->i_size) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ @@ -5009,64 +4774,13 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) return ret < 0 ? ret : err; } -/* - * If newes is not existing extent (newes->ec_pblk equals zero) find - * delayed extent at start of newes and update newes accordingly and - * return start of the next delayed extent. - * - * If newes is existing extent (newes->ec_pblk is not equal zero) - * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newes unmodified. 
- */ -static int ext4_find_delayed_extent(struct inode *inode, - struct extent_status *newes) -{ - struct extent_status es; - ext4_lblk_t block, next_del; - - if (newes->es_pblk == 0) { - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - newes->es_lblk, - newes->es_lblk + newes->es_len - 1, - &es); - - /* - * No extent in extent-tree contains block @newes->es_pblk, - * then the block may stay in 1)a hole or 2)delayed-extent. - */ - if (es.es_len == 0) - /* A hole found. */ - return 0; - - if (es.es_lblk > newes->es_lblk) { - /* A hole found. */ - newes->es_len = min(es.es_lblk - newes->es_lblk, - newes->es_len); - return 0; - } - - newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; - } - - block = newes->es_lblk + newes->es_len; - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, - EXT_MAX_BLOCKS, &es); - if (es.es_len == 0) - next_del = EXT_MAX_BLOCKS; - else - next_del = es.es_lblk; - - return next_del; -} - -static int ext4_xattr_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo) +static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_LAST; + __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; + u16 iomap_type; /* in-inode? */ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { @@ -5081,40 +4795,49 @@ static int ext4_xattr_fiemap(struct inode *inode, EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; - flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); - } else { /* external block */ + iomap_type = IOMAP_INLINE; + } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; + iomap_type = IOMAP_MAPPED; + } else { + /* no in-inode or external block for xattr, so return -ENOENT */ + error = -ENOENT; + goto out; } - if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); - return (error < 0 ? 
error : 0); + iomap->addr = physical; + iomap->offset = 0; + iomap->length = length; + iomap->type = iomap_type; + iomap->flags = 0; +out: + return error; } -static int _ext4_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, - int (*fill)(struct inode *, ext4_lblk_t, - ext4_lblk_t, - struct fiemap_extent_info *)) +static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, + struct iomap *iomap, struct iomap *srcmap) { - ext4_lblk_t start_blk; - u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR; + int error; - int error = 0; - - if (ext4_has_inline_data(inode)) { - int has_inline = 1; + error = ext4_iomap_xattr_fiemap(inode, iomap); + if (error == 0 && (offset >= iomap->length)) + error = -ENOENT; + return error; +} - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, - start, len); +static const struct iomap_ops ext4_iomap_xattr_ops = { + .iomap_begin = ext4_iomap_xattr_begin, +}; - if (has_inline) - return error; - } +static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, bool from_es_cache) +{ + ext4_lblk_t start_blk; + u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR; + int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); @@ -5123,19 +4846,19 @@ static int _ext4_fiemap(struct inode *inode, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - fill == ext4_fill_fiemap_extents) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); - - if (fill == ext4_fill_es_cache_info) + if (from_es_cache) ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; + if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { - error = ext4_xattr_fiemap(inode, fieinfo); + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else if (!from_es_cache) { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } else { ext4_lblk_t len_blks; __u64 last_blk; @@ -5150,7 +4873,8 @@ static int _ext4_fiemap(struct inode *inode, * Walk the extent tree gathering extent information * and pushing extents back to the user. 
*/ - error = fill(inode, start_blk, len_blks, fieinfo); + error = ext4_fill_es_cache_info(inode, start_blk, len_blks, + fieinfo); } return error; } @@ -5158,8 +4882,7 @@ static int _ext4_fiemap(struct inode *inode, int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return _ext4_fiemap(inode, fieinfo, start, len, - ext4_fill_fiemap_extents); + return _ext4_fiemap(inode, fieinfo, start, len, false); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5175,8 +4898,7 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return 0; } - return _ext4_fiemap(inode, fieinfo, start, len, - ext4_fill_es_cache_info); + return _ext4_fiemap(inode, fieinfo, start, len, true); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5f225881176b..0d624250a62b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -872,6 +872,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f95ee99091e4..b420c9dc444d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -196,10 +196,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); - ext4_set_errno(sb, EIO); - ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + ext4_error_err(sb, EIO, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); @@ -712,21 +711,34 @@ out: static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { + bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; + unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); + next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) - return 0; + goto not_found; - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, *ino)) { + if (check_recently_deleted && recently_deleted(sb, group, *ino)) { + recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; - return 0; + goto not_found; } - + return 1; +not_found: + if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + /* + * Not reusing recently deleted inodes is mostly a preference. We don't + * want to report ENOSPC or skew allocation patterns because of that. + * So return even recently deleted inode if we could find better in the + * given range. 
+ */ + *ino = recently_deleted_ino; return 1; } @@ -1231,9 +1243,9 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ext4_set_errno(sb, -err); - ext4_error(sb, "couldn't read orphan inode %lu (err %d)", - ino, err); + ext4_error_err(sb, -err, + "couldn't read orphan inode %lu (err %d)", + ino, err); return inode; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 569fc68e8975..107f0043f67f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1019,7 +1019,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, + ext4_error_inode_block(inode, nr, EIO, "Read failure"); continue; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index fad82d08fca5..f35e289e17aa 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -98,10 +98,9 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { - ext4_set_errno(inode->i_sb, -error); - ext4_error_inode(inode, __func__, __LINE__, 0, - "can't get inode location %lu", - inode->i_ino); + ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, + "can't get inode location %lu", + inode->i_ino); return 0; } @@ -1762,9 +1761,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { - ext4_set_errno(dir->i_sb, -err); - EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", - err, dir->i_ino); + EXT4_ERROR_INODE_ERR(dir, -err, + "error %d getting inode %lu block", + err, dir->i_ino); return true; } @@ -1857,47 +1856,6 @@ out: return error; } -int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline, __u64 start, __u64 len) -{ - __u64 physical = 0; - __u64 inline_len; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | - FIEMAP_EXTENT_LAST; - int error = 0; - struct ext4_iloc iloc; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - goto out; - } - inline_len = min_t(size_t, ext4_get_inline_size(inode), - i_size_read(inode)); - if (start >= inline_len) - goto out; - if (start + len < inline_len) - inline_len = start + len; - inline_len -= start; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - goto out; - - physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; - physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; - physical += offsetof(struct ext4_inode, i_block); - - brelse(iloc.bh); -out: - up_read(&EXT4_I(inode)->xattr_sem); - if (physical) - error = fiemap_fill_next_extent(fieinfo, start, physical, - inline_len, flags); - return (error < 0 ? 
error : 0); -} - int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fa0ff78dc033..e416096fc081 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -269,10 +269,9 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { - ext4_set_errno(inode->i_sb, -err); - ext4_error(inode->i_sb, - "couldn't truncate inode %lu (err %d)", - inode->i_ino, err); + ext4_error_err(inode->i_sb, -err, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); goto stop_handle; } } @@ -2478,10 +2477,9 @@ update_disksize: up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { - ext4_set_errno(inode->i_sb, -err2); - ext4_error(inode->i_sb, - "Failed to mark inode %lu dirty", - inode->i_ino); + ext4_error_err(inode->i_sb, -err2, + "Failed to mark inode %lu dirty", + inode->i_ino); } if (!err) err = err2; @@ -3212,7 +3210,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return 0; } - return generic_block_bmap(mapping, block, ext4_get_block); + return iomap_bmap(mapping, block, &ext4_iomap_ops); } static int ext4_readpage(struct file *file, struct page *page) @@ -3333,6 +3331,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; + if ((map->m_flags & EXT4_MAP_MAPPED) && + !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + iomap->flags |= IOMAP_F_MERGED; + /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits @@ -3542,12 +3544,28 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + /* + * Fiemap callers may call for offset beyond s_bitmap_maxbytes. + * So handle it here itself instead of querying ext4_map_blocks(). + * Since ext4_map_blocks() will warn about it and will return + * -EIO error. 
+ */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (offset >= sbi->s_bitmap_maxbytes) { + map.m_flags = 0; + goto set_iomap; + } + } + ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret == 0) delalloc = ext4_iomap_is_delalloc(inode, &map); +set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; @@ -4144,8 +4162,6 @@ int ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return 0; - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); @@ -4364,8 +4380,7 @@ make_io: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { simulate_eio: - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE_BLOCK(inode, block, + ext4_error_inode_block(inode, block, EIO, "unable to read itable block"); brelse(bh); return -EIO; @@ -4517,7 +4532,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); - __ext4_error(sb, function, line, + __ext4_error(sb, function, line, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); @@ -4580,9 +4595,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, function, line, 0, - "iget: checksum invalid"); + ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4812,7 +4826,7 @@ static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = inode->i_blocks; + u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { @@ -4982,7 +4996,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } @@ -5131,9 +5145,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, - "IO error syncing inode"); + ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, + "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a0ec750018dd..bfc1281fc4cb 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -327,18 +327,6 @@ static int ext4_ioctl_setflags(struct inode *inode, if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) { - err = ext4_truncate(inode); - if (err) - goto flags_out; - } - if ((flags ^ oldflags) & 
EXT4_CASEFOLD_FL) { if (!ext4_has_feature_casefold(sb)) { err = -EOPNOTSUPP; @@ -1210,6 +1198,11 @@ resizefs_out: return -EOPNOTSUPP; return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); + case EXT4_IOC_CLEAR_ES_CACHE: { if (!inode_owner_or_capable(inode)) @@ -1370,6 +1363,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 51a78eb65f3c..87c85be4c12e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1901,8 +1901,15 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, BUG_ON(buddy == NULL); k = mb_find_next_zero_bit(buddy, max, 0); - BUG_ON(k >= max); - + if (k >= max) { + ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, + "%d free clusters of order %d. But found 0", + grp->bb_counters[i], i); + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + break; + } ac->ac_found++; ac->ac_b_ex.fe_len = 1 << i; @@ -3914,9 +3921,9 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d reading block bitmap for %u", - err, group); + ext4_error_err(sb, -err, + "Error %d reading block bitmap for %u", + err, group); return 0; } @@ -4083,18 +4090,16 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d loading buddy information for %u", - err, group); + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d reading block bitmap for %u", - err, group); + ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", + err, group); ext4_mb_unload_buddy(&e4b); continue; } @@ -4302,7 +4307,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list) { + pa_inode_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* @@ -4347,9 +4353,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d loading buddy information for %u", - err, group); + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); continue; } ext4_lock_group(sb, group); @@ -4386,7 +4391,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list) { + pa_inode_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 87f7551c5132..d34cb8c46655 
100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -175,8 +175,8 @@ static int kmmpd(void *data) */ if (retval) { if ((failed_writes % 60) == 0) { - ext4_set_errno(sb, -retval); - ext4_error(sb, "Error writing to MMP block"); + ext4_error_err(sb, -retval, + "Error writing to MMP block"); } failed_writes++; } @@ -208,9 +208,9 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { - ext4_set_errno(sb, -retval); - ext4_error(sb, "error reading MMP data: %d", - retval); + ext4_error_err(sb, -retval, + "error reading MMP data: %d", + retval); goto exit_thread; } @@ -222,8 +222,7 @@ static int kmmpd(void *data) "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); - ext4_set_errno(sb, EBUSY); - ext4_error(sb, "abort"); + ext4_error_err(sb, EBUSY, "abort"); put_bh(bh_check); retval = -EBUSY; goto exit_thread; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 30ce3dc69378..1ed86fb6c302 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -422,8 +422,8 @@ repair_branches: block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { - EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), - "Unable to copy data block," + ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), + EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b05ea72f38fd..a8aca4772aaa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -160,9 +160,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, func, line, block, - "Directory index failed checksum"); + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } @@ -172,9 +172,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, func, line, block, - "Directory block failed checksum"); + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } @@ -233,13 +233,13 @@ struct dx_root u8 unused_flags; } info; - struct dx_entry entries[0]; + struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; - struct dx_entry entries[0]; + struct dx_entry entries[]; }; @@ -1532,9 +1532,9 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_set_errno(sb, EIO); - EXT4_ERROR_INODE(dir, "reading directory lblock %lu", - (unsigned long) block); + EXT4_ERROR_INODE_ERR(dir, EIO, + "reading directory lblock %lu", + (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; @@ -1543,9 +1543,9 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { - ext4_set_errno(sb, EFSBADCRC); - EXT4_ERROR_INODE(dir, "checksumming directory " - "block %lu", (unsigned long)block); + EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, + "checksumming directory " + "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 
68b39e75446a..de6fe969f773 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -125,11 +125,10 @@ static void ext4_finish_bio(struct bio *bio) } bh = head = page_buffers(page); /* - * We check all buffers in the page under BH_Uptodate_Lock + * We check all buffers in the page under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + spin_lock_irqsave(&head->b_uptodate_lock, flags); do { if (bh_offset(bh) < bio_start || bh_offset(bh) + bh->b_size > bio_end) { @@ -141,8 +140,7 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { fscrypt_free_bounce_page(bounce_page); end_page_writeback(page); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c7c4adb664e..9728e7b0e84f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,7 +43,7 @@ #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> - +#include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -335,10 +335,12 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -static void __save_error_info(struct super_block *sb, const char *func, - unsigned int line) +static void __save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; + int err; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (bdev_read_only(sb->s_bdev)) @@ -347,8 +349,62 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); - if (es->s_last_error_errcode == 0) - es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + switch (error) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case 0: + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + es->s_last_error_errcode = err; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -368,11 +424,13 @@ static void __save_error_info(struct super_block *sb, const char *func, le32_add_cpu(&es->s_error_count, 1); } -static void save_error_info(struct super_block *sb, const char *func, - unsigned int line) +static void save_error_info(struct super_block *sb, int 
error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { - __save_error_info(sb, func, line); - ext4_commit_super(sb, 1); + __save_error_info(sb, error, ino, block, func, line); + if (!bdev_read_only(sb->s_bdev)) + ext4_commit_super(sb, 1); } /* @@ -477,7 +535,8 @@ static void ext4_handle_error(struct super_block *sb) "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, __u64 block, + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -495,24 +554,21 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } - save_error_info(sb, function, line); + save_error_info(sb, error, 0, block, function, line); ext4_handle_error(sb); } void __ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, + unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; trace_ext4_error(inode->i_sb, function, line); - es->s_last_error_ino = cpu_to_le32(inode->i_ino); - es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -529,7 +585,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, error, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -548,7 +605,6 @@ void __ext4_error_file(struct file *file, const char *function, trace_ext4_error(inode->i_sb, function, line); es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) @@ -570,7 +626,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -614,66 +671,6 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } -void ext4_set_errno(struct super_block *sb, int err) -{ - if (err < 0) - err = -err; - - switch (err) { - case EIO: - err = EXT4_ERR_EIO; - break; - case ENOMEM: - err = EXT4_ERR_ENOMEM; - break; - case EFSBADCRC: - err = EXT4_ERR_EFSBADCRC; - break; - case EFSCORRUPTED: - err = EXT4_ERR_EFSCORRUPTED; - break; - case ENOSPC: - err = EXT4_ERR_ENOSPC; - break; - case ENOKEY: - err = EXT4_ERR_ENOKEY; - break; - case EROFS: - err = EXT4_ERR_EROFS; - break; - case EFBIG: - err = EXT4_ERR_EFBIG; - break; - case EEXIST: - err = EXT4_ERR_EEXIST; - break; - case ERANGE: - err = EXT4_ERR_ERANGE; - break; - case EOVERFLOW: - err = EXT4_ERR_EOVERFLOW; - break; - case EBUSY: - err = EXT4_ERR_EBUSY; - break; - case ENOTDIR: - err = EXT4_ERR_ENOTDIR; - break; - case ENOTEMPTY: - err = EXT4_ERR_ENOTEMPTY; - break; - case ESHUTDOWN: - err = EXT4_ERR_ESHUTDOWN; - break; - case EFAULT: - err = EXT4_ERR_EFAULT; - break; - default: - err = EXT4_ERR_UNKNOWN; - } - EXT4_SB(sb)->s_es->s_last_error_errcode = err; -} - /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error 
response. */ @@ -698,8 +695,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } - ext4_set_errno(sb, -errno); - save_error_info(sb, function, line); + save_error_info(sb, -errno, 0, 0, function, line); ext4_handle_error(sb); } @@ -714,7 +710,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, */ void __ext4_abort(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -722,7 +718,7 @@ void __ext4_abort(struct super_block *sb, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; - save_error_info(sb, function, line); + save_error_info(sb, error, 0, 0, function, line); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -741,7 +737,6 @@ void __ext4_abort(struct super_block *sb, const char *function, sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); - save_error_info(sb, function, line); } if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { if (EXT4_SB(sb)->s_journal && @@ -815,15 +810,12 @@ __acquires(bitlock) { struct va_format vaf; va_list args; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; trace_ext4_error(sb, function, line); - es->s_last_error_ino = cpu_to_le32(ino); - es->s_last_error_block = cpu_to_le64(block); - __save_error_info(sb, function, line); + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); @@ -927,7 +919,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) @@ -935,8 +926,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); return NULL; } @@ -1024,17 +1016,22 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); + /* + * Unregister sysfs before destroying jbd2 journal. 
+ * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + */ + ext4_unregister_sysfs(sb); + if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { - ext4_set_errno(sb, -err); - ext4_abort(sb, "Couldn't clean up the journal"); + ext4_abort(sb, -err, "Couldn't clean up the journal"); } } - ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -2180,6 +2177,14 @@ static int parse_options(char *options, struct super_block *sb, } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + if (blocksize < PAGE_SIZE) + ext4_msg(sb, KERN_WARNING, "Warning: mounting with an " + "experimental mount option 'dioread_nolock' " + "for blocksize < PAGE_SIZE"); + } return 1; } @@ -3609,7 +3614,8 @@ int ext4_calculate_overhead(struct super_block *sb) */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { + /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (j_inode) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; @@ -3785,7 +3791,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); - set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3835,6 +3840,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); + if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -4157,7 +4166,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", - sbi->s_blocks_per_group); + sbi->s_inodes_per_group); goto failed_mount; } sbi->s_itb_per_group = sbi->s_inodes_per_group / @@ -4286,9 +4295,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " - "blocks per group %lu)", sbi->s_groups_count, + "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); @@ -5433,7 +5442,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, "Abort forced by user"); + ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); @@ -5622,10 +5631,8 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = dquot->dq_dqb.dqb_bsoftlimit; - if (dquot->dq_dqb.dqb_bhardlimit && - (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) - limit = dquot->dq_dqb.dqb_bhardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { @@ -5637,11 +5644,8 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit; - if (dquot->dq_dqb.dqb_ihardlimit && - (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) - limit = dquot->dq_dqb.dqb_ihardlimit; - + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d218ebdafa4a..04bfaf63752c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -13,6 +13,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/proc_fs.h> +#include <linux/part_stat.h> #include "ext4.h" #include "ext4_jbd2.h" diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8cac7d95c3ad..21df43a25328 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -245,7 +245,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, bh->b_data); errout: if (error) - __ext4_error_inode(inode, function, line, 0, + __ext4_error_inode(inode, function, line, 0, -error, "corrupted xattr block %llu", (unsigned long long) bh->b_blocknr); else @@ -269,7 +269,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) - __ext4_error_inode(inode, function, line, 0, + __ext4_error_inode(inode, function, line, 0, -error, "corrupted in-inode xattr"); return error; } @@ -2880,9 +2880,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE_ERR(inode, EIO, + "block %llu read error", + EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f39cad2abe2a..ffe21ac77f78 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -48,7 +48,7 @@ struct ext4_xattr_entry { __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define EXT4_XATTR_PAD_BITS 2 @@ -118,7 +118,7 @@ struct ext4_xattr_ibody_find { struct ext4_xattr_inode_array { unsigned int count; /* # of used items in the array */ - struct inode *inodes[0]; + struct inode *inodes[]; }; extern const struct xattr_handler ext4_xattr_user_handler; diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index f0faada30f30..bb68d21e1f8c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -118,3 +118,12 @@ config F2FS_FS_LZ4 default y help Support LZ4 compress algorithm, if unsure, say Y. + +config F2FS_FS_ZSTD + bool "ZSTD compression support" + depends on F2FS_FS_COMPRESSION + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + default y + help + Support ZSTD compress algorithm, if unsure, say Y. 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 44e84ac5c941..852890b72d6a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -50,9 +50,6 @@ repeat: return page; } -/* - * We guarantee no failure on the returned page. - */ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { @@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, } /* - * Readahead CP/NAT/SIT/SSA pages + * Readahead CP/NAT/SIT/SSA/POR pages */ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) @@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) return -ENOMEM; /* * Finding out valid cp block involves read both - * sets( cp pack1 and cp pack 2) + * sets( cp pack 1 and cp pack 2) */ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); @@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); } -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WB_CP_DATA)) + if (!get_pages(sbi, type)) break; if (unlikely(f2fs_cp_error(sbi))) break; - io_schedule_timeout(5*HZ); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); } @@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || - is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + else + __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); else @@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); - /* - * modify checkpoint - * version number is already updated - */ + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { @@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); + /* Wait for all dirty meta pages to be submitted for IO */ + f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); /* wait for previous submitted meta pages writeback */ - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* * invalidate intermediate page cache 
borrowed from meta inode which are @@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } -/* - * We guarantee that this checkpoint procedure will not fail. - */ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); - /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); if (err) f2fs_release_discard_addrs(sbi); @@ -1626,7 +1618,7 @@ stop: if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); - /* do checkpoint periodically */ + /* update CP_TIME to trigger checkpoint periodically */ f2fs_update_time(sbi, CP_TIME); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d8a64be90a50..df7b2d15eacd 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -11,6 +11,7 @@ #include <linux/backing-dev.h> #include <linux/lzo.h> #include <linux/lz4.h> +#include <linux/zstd.h> #include "f2fs.h" #include "node.h" @@ -20,6 +21,8 @@ struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); int (*compress_pages)(struct compress_ctx *cc); + int (*init_decompress_ctx)(struct decompress_io_ctx *dic); + void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); }; @@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page) } static void f2fs_set_compressed_page(struct page *page, - struct inode *inode, pgoff_t index, void *data, refcount_t *r) + struct inode *inode, pgoff_t index, void *data) { SetPagePrivate(page); set_page_private(page, (unsigned long)data); @@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page, /* i_crypto_info and iv index */ page->index = index; page->mapping = inode->i_mapping; - if (r) - refcount_inc(r); } static void f2fs_put_compressed_page(struct page *page) @@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = { }; #endif +#ifdef CONFIG_F2FS_FS_ZSTD +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + +static int zstd_init_compress_ctx(struct compress_ctx *cc) +{ + ZSTD_parameters params; + ZSTD_CStream *stream; + void *workspace; + unsigned int workspace_size; + + params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); + + workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = ZSTD_initCStream(params, 0, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + cc->private = workspace; + cc->private2 = stream; + + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void zstd_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; + cc->private2 = NULL; +} + +static int zstd_compress_pages(struct compress_ctx *cc) +{ + ZSTD_CStream *stream = cc->private2; + ZSTD_inBuffer inbuf; + ZSTD_outBuffer outbuf; + int src_size = cc->rlen; + int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; + int ret; + + inbuf.pos = 0; + 
inbuf.src = cc->rbuf; + inbuf.size = src_size; + + outbuf.pos = 0; + outbuf.dst = cc->cbuf->cdata; + outbuf.size = dst_size; + + ret = ZSTD_compressStream(stream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + ret = ZSTD_endStream(stream, &outbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + cc->clen = outbuf.pos; + return 0; +} + +static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) +{ + ZSTD_DStream *stream; + void *workspace; + unsigned int workspace_size; + + workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE); + + workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE, + workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + dic->private = workspace; + dic->private2 = stream; + + return 0; +} + +static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) +{ + kvfree(dic->private); + dic->private = NULL; + dic->private2 = NULL; +} + +static int zstd_decompress_pages(struct decompress_io_ctx *dic) +{ + ZSTD_DStream *stream = dic->private2; + ZSTD_inBuffer inbuf; + ZSTD_outBuffer outbuf; + int ret; + + inbuf.pos = 0; + inbuf.src = dic->cbuf->cdata; + inbuf.size = dic->clen; + + outbuf.pos = 0; + outbuf.dst = dic->rbuf; + outbuf.size = dic->rlen; + + ret = ZSTD_decompressStream(stream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + if (dic->rlen != outbuf.pos) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + __func__, dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + + return 0; +} + +static const struct f2fs_compress_ops f2fs_zstd_ops = { + .init_compress_ctx = zstd_init_compress_ctx, + .destroy_compress_ctx = zstd_destroy_compress_ctx, + .compress_pages = zstd_compress_pages, + .init_decompress_ctx = zstd_init_decompress_ctx, + .destroy_decompress_ctx = zstd_destroy_decompress_ctx, + .decompress_pages = zstd_decompress_pages, +}; +#endif + static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #ifdef CONFIG_F2FS_FS_LZO &f2fs_lzo_ops, @@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #else NULL, #endif +#ifdef CONFIG_F2FS_FS_ZSTD + &f2fs_zstd_ops, +#else + NULL, +#endif }; bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, cc->cluster_size, fi->i_compress_algorithm); - ret = cops->init_compress_ctx(cc); - if (ret) - goto out; + if (cops->init_compress_ctx) { + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + } max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); @@ -380,21 +547,27 @@ static int 
f2fs_compress_pages(struct compress_ctx *cc) } cc->cbuf->clen = cpu_to_le32(cc->clen); - cc->cbuf->chksum = cpu_to_le32(0); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); + nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* zero out any unused part of the last page */ + memset(&cc->cbuf->cdata[cc->clen], 0, + (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); + vunmap(cc->cbuf); vunmap(cc->rbuf); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - for (i = nr_cpages; i < cc->nr_cpages; i++) { f2fs_put_compressed_page(cc->cpages[i]); cc->cpages[i] = NULL; } + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); + cc->nr_cpages = nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, @@ -413,7 +586,8 @@ out_free_cpages: kfree(cc->cpages); cc->cpages = NULL; destroy_compress_ctx: - cops->destroy_compress_ctx(cc); + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); out: trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } + if (cops->init_decompress_ctx) { + ret = cops->init_decompress_ctx(dic); + if (ret) + goto out_free_dic; + } + dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); if (!dic->rbuf) { ret = -ENOMEM; - goto out_free_dic; + goto destroy_decompress_ctx; } dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); @@ -473,7 +653,12 @@ out_vunmap_cbuf: vunmap(dic->cbuf); out_vunmap_rbuf: vunmap(dic->rbuf); +destroy_decompress_ctx: + if (cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); out_free_dic: + if (verity) + refcount_set(&dic->ref, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) return true; } -/* return # of compressed block addresses */ -static int f2fs_compressed_blocks(struct compress_ctx *cc) +static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) { struct dnode_of_data dn; int ret; @@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc) for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i); - if (blkaddr != NULL_ADDR) - ret++; + if (compr) { + if (__is_valid_data_blkaddr(blkaddr)) + ret++; + } else { + if (blkaddr != NULL_ADDR) + ret++; + } } } fail: @@ -565,6 +754,18 @@ fail: return ret; } +/* return # of compressed blocks in compressed cluster */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + return __f2fs_cluster_blocks(cc, true); +} + +/* return # of valid blocks in compressed cluster */ +static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +{ + return __f2fs_cluster_blocks(cc, false); +} + int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { struct compress_ctx cc = { @@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, }; - return f2fs_compressed_blocks(&cc); + return f2fs_cluster_blocks(&cc, false); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, bool prealloc; retry: - ret = f2fs_compressed_blocks(cc); + ret = 
f2fs_cluster_blocks(cc, false); if (ret <= 0) return ret; @@ -653,7 +854,7 @@ retry: struct bio *bio = NULL; ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, - &last_block_in_bio, false); + &last_block_in_bio, false, true); f2fs_destroy_compress_ctx(cc); if (ret) goto release_pages; @@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .encrypted_page = NULL, .compressed_page = NULL, .submitted = false, - .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, .encrypted = f2fs_encrypted_file(cc->inode), @@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + if (!f2fs_trylock_op(sbi)) + return -EAGAIN; - f2fs_lock_op(sbi); + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (err) goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { - if (datablock_addr(dn.inode, dn.node_page, + if (data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } @@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, 1); + refcount_set(&cic->ref, cc->nr_cpages); cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc->log_cluster_size, GFP_NOFS); if (!cic->rpages) @@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, - cc->rpages[i + 1]->index, - cic, i ? &cic->ref : NULL); + cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; if (fio.encrypted) { fio.page = cc->rpages[i + 1]; @@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, - dn.ofs_in_node); - fio.page = cic->rpages[i]; + blkaddr = f2fs_data_blkaddr(&dn); + fio.page = cc->rpages[i]; fio.old_blkaddr = blkaddr; /* cluster header */ @@ -895,10 +1094,10 @@ unlock_continue: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - down_write(&fi->i_sem); + spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) fi->last_disk_size = psize; - up_write(&fi->i_sem); + spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc); @@ -984,24 +1183,30 @@ retry_write: unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { + /* + * for quota file, just redirty left pages to + * avoid deadlock caused by cluster update race + * from foreground operation. 
+ */ + if (IS_NOQUOTA(cc->inode)) { + err = 0; + goto out_err; + } ret = 0; cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); lock_page(cc->rpages[i]); clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } err = ret; - goto out_fail; + goto out_err; } *submitted += _submitted; } return 0; - -out_fail: - /* TODO: revoke partially updated block addresses */ - BUG_ON(compr_blocks); out_err: for (++i; i < cc->cluster_size; i++) { if (!cc->rpages[i]) @@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, 1); + refcount_set(&dic->ref, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; @@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) goto out_free; f2fs_set_compressed_page(page, cc->inode, - start_idx + i + 1, - dic, i ? &dic->ref : NULL); + start_idx + i + 1, dic); dic->cpages[i] = page; } @@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) goto out_free; for (i = 0; i < dic->cluster_size; i++) { - if (cc->rpages[i]) + if (cc->rpages[i]) { + dic->tpages[i] = cc->rpages[i]; continue; + } dic->tpages[i] = f2fs_grab_page(); if (!dic->tpages[i]) goto out_free; } - for (i = 0; i < dic->cluster_size; i++) { - if (dic->tpages[i]) - continue; - dic->tpages[i] = cc->rpages[i]; - } - return dic; out_free: @@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) continue; - f2fs_put_page(dic->tpages[i], 1); + if (!dic->tpages[i]) + continue; + unlock_page(dic->tpages[i]); + put_page(dic->tpages[i]); } kfree(dic->tpages); } @@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages, if (!rpage) continue; - if (err || PageError(rpage)) { - ClearPageUptodate(rpage); - ClearPageError(rpage); - } else { - if (!verity || fsverity_verify_page(rpage)) - SetPageUptodate(rpage); - else - SetPageError(rpage); + if (err || PageError(rpage)) + goto clear_uptodate; + + if (!verity || fsverity_verify_page(rpage)) { + SetPageUptodate(rpage); + goto unlock; } +clear_uptodate: + ClearPageUptodate(rpage); + ClearPageError(rpage); +unlock: unlock_page(rpage); } } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b27b72107911..cdf2f626bea7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); } -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail) +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) { - struct bio *bio; - - if (no_fail) { + if (noio) { /* No failure on bio allocation */ - bio = __f2fs_bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + return __f2fs_bio_alloc(GFP_NOIO, npages); } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); return NULL; @@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) f2fs_decompress_pages(bio, page, verity); continue; } + if (verity) + continue; #endif /* PG_error was set if any post_read step failed */ @@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) static void 
f2fs_verify_bio(struct bio *bio) { - struct page *page = bio_first_page_all(bio); - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); + struct bio_vec *bv; + struct bvec_iter_all iter_all; - f2fs_verify_pages(dic->rpages, dic->cluster_size); - f2fs_free_dic(dic); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + struct decompress_io_ctx *dic; + + dic = (struct decompress_io_ctx *)page_private(page); + + if (dic) { + if (refcount_dec_not_one(&dic->ref)) + continue; + f2fs_verify_pages(dic->rpages, + dic->cluster_size); + f2fs_free_dic(dic); + continue; + } + + if (bio->bi_status || PageError(page)) + goto clear_uptodate; + + if (fsverity_verify_page(page)) { + SetPageUptodate(page); + goto unlock; + } +clear_uptodate: + ClearPageUptodate(page); + ClearPageError(page); +unlock: + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); + unlock_page(page); + } } #endif @@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } -/* - * Return true, if pre_bio's bdev is same as its target device. - */ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { @@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { @@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } -/* - * Low-level block read/write IO operations. - */ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (test_opt(sbi, LFS) && current->plug) + if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); if (F2FS_IO_ALIGNED(sbi)) @@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), + for_write); if (!bio) return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); @@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio) /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr) + block_t blkaddr, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ 
-1162,7 +1183,7 @@ got_it: return page; } - err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write); if (err) goto put_err; return page; @@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (err) return err; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr != NULL_ADDR) goto alloc; @@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) } /* - * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with - * f2fs_map_blocks structure. - * If original data blocks are allocated, then give them to blockdev. - * Otherwise, - * a. preallocate requested block addresses - * b. do not use extent cache for better performance - * c. give the block addresses to blockdev + * f2fs_map_blocks() tries to find or build mapping relationship which + * maps continuous logical blocks to physical blocks, and return such + * info via f2fs_map_blocks structure. */ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) @@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1467,7 +1483,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { @@ -1477,7 +1493,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr)) { /* use out-place-update for driect IO under LFS mode */ - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); if (err) @@ -1980,7 +1996,8 @@ submit_and_realloc: } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0, page->index); + is_readahead ? 
REQ_RAHEAD : 0, page->index, + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2015,7 +2032,7 @@ out: #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead) + bool is_readahead, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; @@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (f2fs_readpage_limit(inode) + + blocksize - 1) >> blkbits; /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) @@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct page *page = dic->cpages[i]; block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); if (bio && !page_is_mergeable(sbi, bio, @@ -2109,7 +2127,7 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index); + page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); + is_readahead, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2253,7 +2271,7 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); + is_readahead, false); f2fs_destroy_compress_ctx(&cc); } } @@ -2326,7 +2344,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } @@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (test_opt(sbi, LFS)) + if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; @@ -2647,10 +2665,10 @@ write: if (err) { file_set_keep_isize(inode); } else { - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } done: @@ -2917,7 +2935,7 @@ result: if (wbc->sync_mode == WB_SYNC_ALL) { cond_resched(); congestion_wait(BLK_RW_ASYNC, - HZ/50); + DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; @@ -2973,15 +2991,17 @@ next: static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { + /* to avoid deadlock in path of data flush */ + if (F2FS_I(inode)->cp_task) + return false; + if (!S_ISREG(inode->i_mode)) return false; - if (f2fs_compressed_file(inode)) - return true; if (IS_NOQUOTA(inode)) return false; - /* to avoid deadlock in path of data flush */ - if 
(F2FS_I(inode)->cp_task) - return false; + + if (f2fs_compressed_file(inode)) + return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@ -3283,7 +3303,7 @@ repeat: err = -EFSCORRUPTED; goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr); + err = f2fs_submit_page_read(inode, page, blkaddr, true); if (err) goto fail; @@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, rw == WRITE ? get_data_block_dio_write : get_data_block_dio, NULL, f2fs_dio_submit_bio, - DIO_LOCKING | DIO_SKIP_HOLES); + rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES : + DIO_SKIP_HOLES); if (do_opu) up_read(&fi->i_gc_rwsem[READ]); @@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) int __init f2fs_init_bio_entry_cache(void) { - bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", + bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); if (!bio_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6b89eae5e4ca..0dbcb0f9c019 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v) si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); + seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", + ktime_get_boottime_seconds(), + SIT_I(si->sbi)->mounted_time); if (test_opt(si->sbi, DISCARD)) seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", si->utilization, si->valid_count, si->discard_blks); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 27d0dd7a16d6..44bfc464df78 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; - int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if ((IS_ENCRYPTED(dir) || dummy_encrypt) && - f2fs_may_encrypt(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; @@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 0); set_page_dirty(page); - dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir, false); - - if (inode) - f2fs_drop_nlink(dir, inode); - if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); @@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5355be6b6755..ba470d5687fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> +#include <linux/part_stat.h> #include <crypto/hash.h> #include <linux/fscrypt.h> @@ -74,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define 
F2FS_MOUNT_BG_GC 0x00000001 #define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 #define F2FS_MOUNT_DISCARD 0x00000004 #define F2FS_MOUNT_NOHEAP 0x00000008 @@ -88,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 -#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 -#define F2FS_MOUNT_ADAPTIVE 0x00020000 -#define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 @@ -100,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 +#define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -138,6 +136,8 @@ struct f2fs_mount_info { int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ + int fs_mode; /* fs mode: LFS or ADAPTIVE */ + int bggc_mode; /* bggc mode: off, on or sync */ bool test_dummy_encryption; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint @@ -331,8 +331,8 @@ struct discard_policy { bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ + bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ - int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -427,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) #define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) +#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL @@ -559,6 +560,9 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +/* congestion wait timeout value, default: 20ms */ +#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) + /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 @@ -675,6 +679,44 @@ enum { MAX_GC_FAILURE }; +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ + FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate 
#0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_MMAP_FILE, /* indicate file was mmapped */ + FI_MAX, /* max flag, never be used */ +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ @@ -687,7 +729,7 @@ struct f2fs_inode_info { umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ - unsigned long flags; /* use to pass per-file flags */ + unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ @@ -696,6 +738,7 @@ struct f2fs_inode_info { struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ + spinlock_t i_size_lock; /* protect last_disk_size */ #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; @@ -1172,6 +1215,20 @@ enum { }; enum { + BGGC_MODE_ON, /* background gc is on */ + BGGC_MODE_OFF, /* background gc is off */ + BGGC_MODE_SYNC, /* + * background gc is on, migrating blocks + * like foreground gc + */ +}; + +enum { + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ +}; + +enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ WHINT_MODE_FS, /* pass down hints with F2FS policy */ @@ -1211,13 +1268,13 @@ enum fsync_mode { enum compress_algorithm_type { COMPRESS_LZO, COMPRESS_LZ4, + COMPRESS_ZSTD, COMPRESS_MAX, }; -#define COMPRESS_DATA_RESERVED_SIZE 4 +#define COMPRESS_DATA_RESERVED_SIZE 5 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* checksum of compressed data */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1241,6 +1298,7 @@ struct compress_ctx { size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ void *private; /* payload buffer for specified compression algorithm */ + void *private2; /* extra payload buffer */ }; /* compress context for write IO path */ @@ -1270,11 +1328,14 @@ struct decompress_io_ctx { size_t clen; /* valid data length in cbuf */ refcount_t ref; /* referrence count of compressed page */ bool failed; /* indicate IO error during decompression */ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ }; #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 +#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ @@ -1470,6 
+1531,9 @@ struct f2fs_sb_info { __u32 s_chksum_seed; struct workqueue_struct *post_read_wq; /* post read workqueue */ + + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ + unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ }; struct f2fs_private_dio { @@ -2210,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { - f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2378,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) } static inline int f2fs_has_extra_attr(struct inode *inode); -static inline block_t datablock_addr(struct inode *inode, +static inline block_t data_blkaddr(struct inode *inode, struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; @@ -2388,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); - /* from GC path only */ if (is_inode) { if (!inode) + /* from GC path only */ base = offset_in_addr(&raw_node->i); else if (f2fs_has_extra_attr(inode)) base = get_extra_isize(inode); @@ -2400,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode, return le32_to_cpu(addr_array[base + offset]); } +static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) +{ + return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node); +} + static inline int f2fs_test_bit(unsigned int nr, char *addr) { int mask; @@ -2497,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) return flags & F2FS_OTHER_FLMASK; } -/* used for f2fs_inode_info->flags */ -enum { - FI_NEW_INODE, /* indicate newly allocated inode */ - FI_DIRTY_INODE, /* indicate inode is dirty or not */ - FI_AUTO_RECOVER, /* indicate inode is recoverable */ - FI_DIRTY_DIR, /* indicate directory has dirty pages */ - FI_INC_LINK, /* need to increment i_nlink */ - FI_ACL_MODE, /* indicate acl mode */ - FI_NO_ALLOC, /* should not allocate any blocks */ - FI_FREE_NID, /* free allocated nide */ - FI_NO_EXTENT, /* not to use the extent cache */ - FI_INLINE_XATTR, /* used for inline xattr */ - FI_INLINE_DATA, /* used for inline data*/ - FI_INLINE_DENTRY, /* used for inline dentry */ - FI_APPEND_WRITE, /* inode has appended data */ - FI_UPDATE_WRITE, /* inode has in-place-update data */ - FI_NEED_IPU, /* used for ipu per file */ - FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ - FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ - FI_DROP_CACHE, /* drop dirty page cache */ - FI_DATA_EXIST, /* indicate data exists */ - FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ - FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ - FI_HOT_DATA, /* indicate file is hot */ - FI_EXTRA_ATTR, /* indicate file has extra attribute */ - FI_PROJ_INHERIT, /* indicate file inherits projectid */ - FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ - FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ - FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ - FI_MMAP_FILE, /* indicate 
file was mmapped */ -}; - static inline void __mark_inode_dirty_flag(struct inode *inode, int flag, bool set) { @@ -2548,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: - case FI_COMPRESSED_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } static inline void set_inode_flag(struct inode *inode, int flag) { - if (!test_bit(flag, &F2FS_I(inode)->flags)) - set_bit(flag, &F2FS_I(inode)->flags); + test_and_set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } static inline int is_inode_flag_set(struct inode *inode, int flag) { - return test_bit(flag, &F2FS_I(inode)->flags); + return test_bit(flag, F2FS_I(inode)->flags); } static inline void clear_inode_flag(struct inode *inode, int flag) { - if (test_bit(flag, &F2FS_I(inode)->flags)) - clear_bit(flag, &F2FS_I(inode)->flags); + test_and_clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } @@ -2659,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) struct f2fs_inode_info *fi = F2FS_I(inode); if (ri->i_inline & F2FS_INLINE_XATTR) - set_bit(FI_INLINE_XATTR, &fi->flags); + set_bit(FI_INLINE_XATTR, fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_bit(FI_INLINE_DATA, &fi->flags); + set_bit(FI_INLINE_DATA, fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_bit(FI_INLINE_DENTRY, &fi->flags); + set_bit(FI_INLINE_DENTRY, fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_bit(FI_DATA_EXIST, &fi->flags); + set_bit(FI_DATA_EXIST, fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_bit(FI_INLINE_DOTS, &fi->flags); + set_bit(FI_INLINE_DOTS, fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) - set_bit(FI_EXTRA_ATTR, &fi->flags); + set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) - set_bit(FI_PIN_FILE, &fi->flags); + set_bit(FI_PIN_FILE, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2856,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) if (!f2fs_is_time_consistent(inode)) return false; - down_read(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); - up_read(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); return ret; } @@ -3212,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode); void f2fs_drop_inmem_page(struct inode *inode, struct page *page); int f2fs_commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); @@ -3308,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); @@ -3319,7 +3348,7 @@ void 
f2fs_destroy_checkpoint_caches(void); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, @@ -3775,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead); + bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_free_dic(struct decompress_io_ctx *dic); void f2fs_decompress_end_io(struct page **rpages, @@ -3812,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline u64 f2fs_disable_compressed_file(struct inode *inode) @@ -3820,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode) if (!f2fs_compressed_file(inode)) return 0; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; + if (S_ISREG(inode->i_mode)) { + if (get_dirty_pages(inode)) + return 1; + if (fi->i_compr_blocks) + return fi->i_compr_blocks; + } fi->i_flags &= ~F2FS_COMPR_FL; - clear_inode_flag(inode, FI_COMPRESSED_FILE); stat_dec_compr_inode(inode); + clear_inode_flag(inode, FI_COMPRESSED_FILE); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -3902,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) return false; } - -static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { - clear_opt(sbi, ADAPTIVE); - clear_opt(sbi, LFS); - - switch (mt) { - case F2FS_MOUNT_ADAPTIVE: - set_opt(sbi, ADAPTIVE); - break; - case F2FS_MOUNT_LFS: - set_opt(sbi, LFS); - break; - } + return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *inode) +static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) { #ifdef CONFIG_FS_ENCRYPTION + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); umode_t mode = inode->i_mode; - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#else - return false; + /* + * If the directory encrypted or dummy encryption enabled, + * then we should encrypt the inode. 
+ */ + if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) + return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #endif + return false; } static inline bool f2fs_may_compress(struct inode *inode) @@ -3970,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int rw = iov_iter_rw(iter); - return (test_opt(sbi, LFS) && (rw == WRITE) && + return (f2fs_lfs_mode(sbi) && (rw == WRITE) && !block_unaligned_IO(inode, iocb, iter)); } @@ -3992,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, */ if (f2fs_sb_has_blkzoned(sbi)) return true; - if (test_opt(sbi, LFS) && (rw == WRITE)) { + if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { if (block_unaligned_IO(inode, iocb, iter)) return true; if (F2FS_IO_ALIGNED(sbi)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0d4da644df3b..6ab8f621a3c5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = f2fs_get_block(&dn, page->index); f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; - } } - /* fill the page */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!need_alloc) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + f2fs_put_dnode(&dn); + } +#endif + if (err) { + unlock_page(page); + goto out_sem; + } + f2fs_wait_on_page_writeback(page, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ @@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), @@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, } flags = fi->i_flags; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (IS_ENCRYPTED(inode)) @@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, if (IS_VERITY(inode)) stat->attributes |= STATX_ATTR_VERITY; - stat->attributes_mask |= (STATX_ATTR_APPEND | + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_APPEND | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | @@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); inode->i_mtime = inode->i_ctime = current_time(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } __setattr_copy(inode, attr); @@ -1109,8 +1118,7 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + *blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(*blkaddr) && !f2fs_is_valid_blkaddr(sbi, *blkaddr, @@ -1121,7 +1129,7 @@ next_dnode: if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { - if (test_opt(sbi, LFS)) { + if (f2fs_lfs_mode(sbi)) { f2fs_put_dnode(&dn); return -EOPNOTSUPP; } @@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode 
*dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + dn.data_blkaddr = f2fs_data_blkaddr(&dn); f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->inode, dn->node_page, - dn->ofs_in_node) == NULL_ADDR) + if (f2fs_data_blkaddr(dn) == NULL_ADDR) count++; } @@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); /* * f2fs_reserve_new_blocks will not guarantee entire block * allocation. @@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) { struct f2fs_inode_info *fi = F2FS_I(inode); + u32 masked_flags = fi->i_flags & mask; + + f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) return -EPERM; - if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) { + if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) { if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) return -EOPNOTSUPP; if (!f2fs_empty_dir(inode)) @@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -EINVAL; } - if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { - if (S_ISREG(inode->i_mode) && - (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || - F2FS_HAS_BLOCKS(inode))) - return -EINVAL; + if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { + if (masked_flags & F2FS_COMPR_FL) { + if (f2fs_disable_compressed_file(inode)) + return -EINVAL; + } if (iflags & F2FS_NOCOMP_FL) return -EINVAL; if (iflags & F2FS_COMPR_FL) { - int err = f2fs_convert_inline_inode(inode); - - if (err) - return err; - if (!f2fs_may_compress(inode)) return -EINVAL; set_compress_context(inode); } } - if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { - if (fi->i_flags & F2FS_COMPR_FL) + if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) { + if (masked_flags & F2FS_COMPR_FL) return -EINVAL; } @@ -2423,6 +2426,14 @@ static int f2fs_ioc_get_encryption_key_status(struct file *filp, return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); } +static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); +} + static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3393,6 +3404,21 @@ out: return err; } +static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + blocks = F2FS_I(inode)->i_compr_blocks; + return put_user(blocks, (u64 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3437,6 +3463,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return 
f2fs_ioc_remove_encryption_key_all_users(filp, arg); case FS_IOC_GET_ENCRYPTION_KEY_STATUS: return f2fs_ioc_get_encryption_key_status(filp, arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + return f2fs_ioc_get_encryption_nonce(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); case F2FS_IOC_GARBAGE_COLLECT_RANGE: @@ -3471,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_get_volume_name(filp, arg); case F2FS_IOC_SET_VOLUME_NAME: return f2fs_set_volume_name(filp, arg); + case F2FS_IOC_GET_COMPRESS_BLOCKS: + return f2fs_get_compress_blocks(filp, arg); default: return -ENOTTY; } @@ -3498,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (!f2fs_is_compress_backend_ready(inode)) - return -EOPNOTSUPP; + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) { @@ -3611,6 +3643,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: @@ -3628,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_MEASURE_VERITY: case F2FS_IOC_GET_VOLUME_NAME: case F2FS_IOC_SET_VOLUME_NAME: + case F2FS_IOC_GET_COMPRESS_BLOCKS: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index db8725d473b5..26248c8936db 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,6 +31,8 @@ static int gc_thread_func(void *data) set_freezable(); do { + bool sync_mode; + wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || gc_th->gc_wake, @@ -101,15 +103,17 @@ static int gc_thread_func(void *data) do_gc: stat_inc_bggc_count(sbi->stat_info); + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); /* balancing f2fs's metadata periodically */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); next: sb_end_write(sbi->sb); @@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - /* we need to check every dirty segments in the FG_GC case */ + /* + * adjust candidates range, should select all dirty segments for + * foreground GC and urgent GC cases. 
+ */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT) && p->max_search > sbi->max_victim_search) @@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); + source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) { @@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx, struct page *page, *mpage; block_t newaddr; int err = 0; - bool lfs_mode = test_opt(fio.sbi, LFS); + bool lfs_mode = f2fs_lfs_mode(fio.sbi); /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -970,7 +977,8 @@ retry: if (err) { clear_cold_data(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry; } if (is_dirty) @@ -1018,8 +1026,8 @@ next_step: * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, false) == - sbi->blocks_per_seg) + get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi)) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (get_valid_blocks(sbi, segno, false) == 0) goto freed; - if (__is_large_section(sbi) && + if (gc_type == BG_GC && __is_large_section(sbi) && migrated >= sbi->migration_granularity) goto skip; if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) @@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, segno, gc_type); stat_inc_seg_count(sbi, type, gc_type); + migrated++; freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - migrated++; if (__is_large_section(sbi) && segno + 1 < end_segno) sbi->next_victim_seg[gc_type] = segno + 1; @@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) { struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); - int section_count = le32_to_cpu(raw_sb->section_count); - int segment_count = le32_to_cpu(raw_sb->segment_count); - int segment_count_main = le32_to_cpu(raw_sb->segment_count_main); - long long block_count = le64_to_cpu(raw_sb->block_count); + int section_count; + int segment_count; + int segment_count_main; + long long block_count; int segs = secs * sbi->segs_per_sec; + down_write(&sbi->sb_lock); + + section_count = le32_to_cpu(raw_sb->section_count); + segment_count = le32_to_cpu(raw_sb->segment_count); + segment_count_main = le32_to_cpu(raw_sb->segment_count_main); + block_count = le64_to_cpu(raw_sb->block_count); + raw_sb->section_count = cpu_to_le32(section_count + secs); raw_sb->segment_count = cpu_to_le32(segment_count + segs); raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); @@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->devs[last_dev].total_segments = cpu_to_le32(dev_segs + segs); } + + up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) goto out; } + mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, -secs); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + set_sbi_flag(sbi, SBI_IS_DIRTY); + mutex_unlock(&sbi->cp_mutex); + err = f2fs_sync_fs(sbi->sb, 1); if 
(err) { + mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, secs); + mutex_unlock(&sbi->cp_mutex); update_sb_metadata(sbi, secs); f2fs_commit_super(sbi, false); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 78c3f1d70f1d..44582a4db513 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - if (ri->i_compress_algorithm >= COMPRESS_MAX) + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_compress_algorithm); return false; - if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " + "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, + le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); return false; + } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || - ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_log_cluster_size); return false; + } } return true; @@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode) fi->i_flags = le32_to_cpu(ri->i_flags); if (S_ISREG(inode->i_mode)) fi->i_flags &= ~F2FS_PROJINHERIT_FL; - fi->flags = 0; + bitmap_zero(fi->flags, FI_MAX); fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; @@ -518,7 +535,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry; } } @@ -759,7 +776,7 @@ no_delete: else f2fs_inode_synced(inode); - /* ino == 0, if f2fs_new_inode() was failed t*/ + /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2aa035422c0f..f54119da2217 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); - /* If the directory encrypted, then we should encrypt the inode. 
*/ - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - f2fs_may_encrypt(inode)) + if (f2fs_may_encrypt(dir, inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { @@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) } /* - * Set multimedia files as cold files for hot/cold data separation + * Set file's temperature for hot/cold data separation */ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) @@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { - int err = fscrypt_get_encryption_info(dir); - if (err) - return err; - } - return __f2fs_tmpfile(dir, dentry, mode, NULL); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d02cdcdbb07..ecbd6bd14a49 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -/* - * This function always returns success - */ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { @@ -716,8 +713,7 @@ got: /* * Caller should call f2fs_put_dnode(dn). * Also, it should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op() only if ro is not set RDONLY_NODE. - * In the case of RDONLY_NODE, we don't need to care about mutex. + * f2fs_unlock_op() only if mode is set with ALLOC_NODE. */ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { @@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); return 0; release_pages: @@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode) } if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { - f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu", - inode->i_ino, (unsigned long long)inode->i_blocks); + f2fs_warn(F2FS_I_SB(inode), + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); } @@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; - set_page_writeback(page); - ClearPageError(page); - + /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, page)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; } + set_page_writeback(page); + ClearPageError(page); + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); @@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, goto skip_write; /* balancing f2fs's metadata in background */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); /* collect a number of dirty node pages and write together */ if (wbc->sync_mode != WB_SYNC_ALL && @@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { 
- congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry; } @@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) int __init f2fs_create_node_manager_caches(void) { - nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) goto fail; - free_nid_slab = f2fs_kmem_cache_create("free_nid", + free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid", sizeof(struct free_nid)); if (!free_nid_slab) goto destroy_nat_entry; - nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set", sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; - fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry", sizeof(struct fsync_node_entry)); if (!fsync_node_entry_slab) goto destroy_nat_entry_set; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 763d5c0951d1..dd804c07eeb0 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -496,8 +496,7 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.inode, tdn.node_page, - tdn.ofs_in_node) == blkaddr) + if (f2fs_data_blkaddr(&tdn) == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -535,7 +534,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry_dn; } goto out; @@ -560,8 +559,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - dest = datablock_addr(dn.inode, page, dn.ofs_in_node); + src = f2fs_data_blkaddr(&dn); + dest = data_blkaddr(dn.inode, page, dn.ofs_in_node); if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { @@ -618,7 +617,8 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cf0eb002cfd4..b7a9421472a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - if (test_opt(sbi, LFS)) + if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT) return true; @@ -245,7 +245,8 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); cond_resched(); goto retry; } @@ -312,7 +313,7 @@ next: skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); cond_resched(); if (gc_failure) { if (++looped >= count) @@ -415,7 +416,8 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); cond_resched(); goto retry; } @@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) - 
f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, false); if (!f2fs_is_checkpoint_ready(sbi)) return; @@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } } -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; @@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || f2fs_time_over(sbi, CP_TIME)) { - if (test_opt(sbi, DATA_FLUSH)) { + if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); @@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->timeout = 0; + dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = 1; + dpolicy->timeout = true; } } @@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; - if (dpolicy->timeout != 0) - f2fs_update_time(sbi, dpolicy->timeout); + if (dpolicy->timeout) + f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) @@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && @@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); - dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); mutex_lock(&dirty_i->seglist_lock); @@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) { + if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -2801,7 +2803,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto next; } skip: @@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool need_align = f2fs_lfs_mode(sbi) && 
__is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) down_read(&fio->sbi->io_order_lock); @@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = ktime_get_real_seconds(); + sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } @@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; - if (!test_opt(sbi, LFS)) + if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) int __init f2fs_create_segment_manager_caches(void) { - discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; - discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; - sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", sizeof(struct inmem_pages)); if (!inmem_entry_slab) goto destroy_sit_entry_set; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 459dc3901a57..7a83bd530812 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - time64_t diff, now = ktime_get_real_seconds(); + time64_t diff, now = ktime_get_boottime_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index a467aca29cfe..d66de5999a26 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count extent cache entries */ count += __count_extent_cache(sbi); - /* shrink clean nat cache entries */ + /* count clean nat cache entries */ count += __count_nat_entries(sbi); /* count free nids cache entries */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 65a7a432dfee..f2dfc21c6abb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,7 @@ #include <linux/sysfs.h> #include <linux/quota.h> #include <linux/unicode.h> +#include <linux/part_stat.h> #include "f2fs.h" #include "node.h" @@ -427,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; if 
(strlen(name) == 2 && !strncmp(name, "on", 2)) { - set_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { - clear_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { - set_opt(sbi, BG_GC); - set_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { kvfree(name); return -EINVAL; @@ -446,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_norecovery: /* this option mounts f2fs with ro */ - set_opt(sbi, DISABLE_ROLL_FORWARD); + set_opt(sbi, NORECOVERY); if (!f2fs_readonly(sb)) return -EINVAL; break; @@ -600,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); return -EINVAL; } - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; } else { kvfree(name); return -EINVAL; @@ -832,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options) !strcmp(name, "lz4")) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + } else if (strlen(name) == 4 && + !strcmp(name, "zstd")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_ZSTD; } else { kfree(name); return -EINVAL; @@ -904,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options) } #endif - if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; @@ -934,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options) } } - if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) { + if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); return -EINVAL; } @@ -960,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); init_rwsem(&fi->i_sem); + spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_ilist); @@ -1172,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); f2fs_bug_on(sbi, sbi->fsync_node_num); @@ -1204,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -1420,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, case COMPRESS_LZ4: algtype = "lz4"; break; + case COMPRESS_ZSTD: + algtype = "zstd"; + break; } seq_printf(seq, ",compress_algorithm=%s", algtype); @@ -1436,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { - if (test_opt(sbi, FORCE_FG_GC)) - seq_printf(seq, ",background_gc=%s", "sync"); - else - seq_printf(seq, ",background_gc=%s", "on"); - } else { + if (F2FS_OPTION(sbi).bggc_mode == 
BGGC_MODE_SYNC) + seq_printf(seq, ",background_gc=%s", "sync"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) + seq_printf(seq, ",background_gc=%s", "on"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); - } + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, NORECOVERY)) + seq_puts(seq, ",norecovery"); if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); else @@ -1497,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",data_flush"); seq_puts(seq, ",mode="); - if (test_opt(sbi, ADAPTIVE)) + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) seq_puts(seq, "adaptive"); - else if (test_opt(sbi, LFS)) + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) @@ -1570,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); @@ -1586,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) - set_opt_mode(sbi, F2FS_MOUNT_LFS); + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; else - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1657,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) out_unlock: up_write(&sbi->gc_lock); restore_flag: - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -1780,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. 
*/ - if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || + F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -1885,7 +1894,8 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -1927,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, int offset = off & (sb->s_blocksize - 1); size_t towrite = len; struct page *page; + void *fsdata = NULL; char *kaddr; int err = 0; int tocopy; @@ -1936,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, - &page, NULL); + &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -1952,7 +1964,7 @@ retry: flush_dcache_page(page); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, NULL); + page, fsdata); offset = 0; towrite -= tocopy; off += tocopy; @@ -3456,12 +3468,17 @@ try_onemore: } } + /* init per sbi slab cache */ + err = f2fs_init_xattr_caches(sbi); + if (err) + goto free_io_dummy; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_io_dummy; + goto free_xattr_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3589,7 +3606,7 @@ try_onemore: f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } #endif - /* if there are nt orphan nodes free them */ + /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -3598,7 +3615,8 @@ try_onemore: goto reset_checkpoint; /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && + !test_opt(sbi, NORECOVERY)) { /* * mount should be failed, when device has readonly mode, and * previous checkpoint was not done by clean system shutdown. @@ -3664,7 +3682,7 @@ reset_checkpoint: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { + if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) @@ -3733,6 +3751,8 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_xattr_cache: + f2fs_destroy_xattr_caches(sbi); free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 91d649790b1b..e3bbbef9b4f0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a, return sprintf(buf, "0\n"); if (f2fs_sb_has_encrypt(sbi)) - len += snprintf(buf, PAGE_SIZE - len, "%s", + len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); if (f2fs_sb_has_blkzoned(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "extra_attr"); if (f2fs_sb_has_project_quota(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "projquota"); if (f2fs_sb_has_inode_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); if (f2fs_sb_has_flexible_inline_xattr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); if (f2fs_sb_has_quota_ino(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); if (f2fs_sb_has_inode_crtime(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); if (f2fs_sb_has_lost_found(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); if (f2fs_sb_has_verity(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); if (f2fs_sb_has_casefold(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); if (f2fs_sb_has_compression(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "compression"); - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "pin_file"); - len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a, return sprintf(buf, "(none)"); } +static ssize_t mounted_time_sec_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); +} + #ifdef CONFIG_F2FS_STAT_FS static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) @@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "cold file extension:\n"); for (i = 0; i < cold_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); return len; } @@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(mounted_time_sec); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +#ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +#endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), + ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), ATTR_LIST(cp_background_calls), @@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), +#ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 296b3189448a..4f6582ef7ee3 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -23,6 +23,25 @@ #include "xattr.h" #include "segment.h" +static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) +{ + if (likely(size == sbi->inline_xattr_slab_size)) { + *is_inline = true; + return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS); + } + *is_inline = false; + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, + bool is_inline) +{ + if (is_inline) + kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); + else + kvfree(xattr_addr); +} + static int f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct 
f2fs_xattr_entry **xe, - void **base_addr, int *base_size) + void **base_addr, int *base_size, + bool *is_inline) { void *cur_addr, *txattr_addr, *last_txattr_addr; void *last_addr = NULL; @@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!xnid && !inline_size) return -ENODATA; - *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); + *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE; + txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline); if (!txattr_addr) return -ENOMEM; - last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode); + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode); /* read from inline xattr */ if (inline_size) { @@ -362,7 +382,7 @@ check: *base_addr = txattr_addr; return 0; out: - kvfree(txattr_addr); + xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline); return err; } @@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, unsigned int size, len; void *base_addr = NULL; int base_size; + bool is_inline; if (name == NULL) return -EINVAL; @@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr, &base_size); + &entry, &base_addr, &base_size, &is_inline); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kvfree(base_addr); + xattr_free(F2FS_I_SB(inode), base_addr, is_inline); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; int error = 0; @@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, { struct f2fs_xattr_entry *here, *last; void *base_addr, *last_base_addr; - nid_t xnid = F2FS_I(inode)->i_xattr_nid; int found, newsize; size_t len; __u32 new_hsize; @@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. 
*/ here = __find_xattr(base_addr, last_base_addr, index, len, name); @@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - /* protect xattr_ver */ - down_write(&F2FS_I(inode)->i_sem); down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); up_write(&F2FS_I(inode)->i_xattr_sem); - up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); return err; } + +int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size * + sizeof(__le32) + XATTR_PADDING_SIZE; + + sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name, + sbi->inline_xattr_slab_size); + if (!sbi->inline_xattr_slab) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->inline_xattr_slab); +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index de0c600b9cab..938fcd20565d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -49,7 +49,7 @@ struct f2fs_xattr_entry { __u8 e_name_index; __u8 e_name_len; __le16 e_value_size; /* size of attribute value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) @@ -73,7 +73,8 @@ struct f2fs_xattr_entry { entry = XATTR_NEXT_ENTRY(entry)) #define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) #define XATTR_PADDING_SIZE (sizeof(__u32)) -#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \ +#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? 
\ + VALID_XATTR_BLOCK_SIZE : 0) + \ (inline_xattr_size(i))) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) @@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *, extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t, struct page *); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +extern int f2fs_init_xattr_caches(struct f2fs_sb_info *); +extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); #else #define f2fs_xattr_handlers NULL @@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, { return -EOPNOTSUPP; } +static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 7e6fb43f9541..ab53e42a874a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -368,8 +368,6 @@ bool fs_validate_description(const char *name, const struct fs_parameter_spec *param, *p2; bool good = true; - pr_notice("*** VALIDATE %s ***\n", name); - for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 09e6be8aa036..2e939f5fe751 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -21,6 +21,7 @@ #include "glock.h" #include "inode.h" #include "meta_io.h" +#include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" @@ -116,14 +117,14 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) return -E2BIG; - ret = gfs2_rsqa_alloc(ip); + ret = gfs2_qa_get(ip); if (ret) return ret; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) - return ret; + goto out; need_unlock = true; } @@ -143,5 +144,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); +out: + gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ba83b49ce18c..786c1ce8f030 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -805,11 +805,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) - list_del_init(&bd->bd_list); bd->bd_bh = NULL; bh->b_private = NULL; - kmem_cache_free(gfs2_bufdata_cachep, bd); + /* + * The bd may still be queued as a revoke, in which + * case we must not dequeue nor free it. 
+ */ + if (!bd->bd_blkno && !list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + if (list_empty(&bd->bd_list)) + kmem_cache_free(gfs2_bufdata_cachep, bd); } bh = bh->b_this_page; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 08f6fbb3655e..936a8ec6b48e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2183,7 +2183,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); - ret = gfs2_rsqa_alloc(ip); + ret = gfs2_qa_get(ip); if (ret) goto out; @@ -2194,7 +2194,8 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rsqa_delete(ip, NULL); + gfs2_rs_delete(ip, NULL); + gfs2_qa_put(ip); return ret; } @@ -2223,7 +2224,7 @@ void gfs2_free_journal_extents(struct gfs2_jdesc *jd) struct gfs2_journal_extent *jext; while(!list_empty(&jd->extent_list)) { - jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list); + jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list); list_del(&jext->list); kfree(jext); } @@ -2244,7 +2245,7 @@ static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 b struct gfs2_journal_extent *jext; if (!list_empty(&jd->extent_list)) { - jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list); + jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list); if ((jext->dblock + jext->blocks) == dblock) { jext->blocks += blocks; return 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c8b62577e2f2..c3f7732415be 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2028,7 +2028,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, error = gfs2_trans_begin(sdp, rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + - RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + RES_DINODE + RES_STATFS + RES_QUOTA, RES_DINODE + + l_blocks); if (error) goto out_rg_gunlock; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index cb26be6f4351..fe305e4bfd37 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -458,10 +458,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); - ret = gfs2_rsqa_alloc(ip); - if (ret) - goto out; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -558,7 +554,6 @@ out_uninit: set_page_dirty(page); wait_for_stable_page(page); } -out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -635,7 +630,17 @@ int gfs2_open_common(struct inode *inode, struct file *file) gfs2_assert_warn(GFS2_SB(inode), !file->private_data); file->private_data = fp; + if (file->f_mode & FMODE_WRITE) { + ret = gfs2_qa_get(GFS2_I(inode)); + if (ret) + goto fail; + } return 0; + +fail: + kfree(file->private_data); + file->private_data = NULL; + return ret; } /** @@ -690,10 +695,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (!(file->f_mode & FMODE_WRITE)) - return 0; - - gfs2_rsqa_delete(ip, &inode->i_writecount); + if (file->f_mode & FMODE_WRITE) { + gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_qa_put(ip); + } return 0; } @@ -849,10 +854,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct gfs2_inode *ip = GFS2_I(inode); ssize_t ret; - ret = gfs2_rsqa_alloc(ip); - if (ret) - return ret; - gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); if (iocb->ki_flags & IOCB_APPEND) { @@ -1149,17 +1150,11 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le 
if (mode & FALLOC_FL_PUNCH_HOLE) { ret = __gfs2_punch_hole(file, offset, len); } else { - ret = gfs2_rsqa_alloc(ip); - if (ret) - goto out_putw; - ret = __gfs2_fallocate(file, mode, offset, len); - if (ret) gfs2_rs_deltree(&ip->i_res); } -out_putw: put_write_access(inode); out_unlock: gfs2_glock_dq(&gh); @@ -1173,16 +1168,12 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - int error; - struct gfs2_inode *ip = GFS2_I(out->f_mapping->host); - - error = gfs2_rsqa_alloc(ip); - if (error) - return (ssize_t)error; + ssize_t ret; gfs2_size_hint(out, *ppos, len); - return iter_file_splice_write(pipe, out, ppos, len, flags); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + return ret; } #ifdef CONFIG_GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d0eceaff3cea..29f9b6684b74 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -133,6 +133,33 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) } } +/** + * glock_blocked_by_withdraw - determine if we can still use a glock + * @gl: the glock + * + * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted + * when we're withdrawn. For example, to maintain metadata integrity, we should + * disallow the use of inode and rgrp glocks when withdrawn. Other glocks, like + * iopen or the transaction glocks may be safely used because none of their + * metadata goes through the journal. So in general, we should disallow all + * glocks that are journaled, and allow all the others. One exception is: + * we need to allow our active journal to be promoted and demoted so others + * may recover it and we can reacquire it when they're done. + */ +static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (likely(!gfs2_withdrawn(sdp))) + return false; + if (gl->gl_ops->go_flags & GLOF_NONDISK) + return false; + if (!sdp->sd_jdesc || + gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr) + return false; + return true; +} + void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; @@ -244,7 +271,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp)); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } @@ -281,7 +308,7 @@ void gfs2_glock_put(struct gfs2_glock *gl) static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) { - const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); if ((gh->gh_state == LM_ST_EXCLUSIVE || gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) return 0; @@ -549,8 +576,8 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; - if (unlikely(gfs2_withdrawn(sdp)) && - target != LM_ST_UNLOCKED) + if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && + gh && !(gh->gh_flags & LM_FLAG_NOEXP)) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); @@ -575,13 +602,64 @@ __acquires(&gl->gl_lockref.lock) (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) - glops->go_sync(gl); - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + if (glops->go_sync) { + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to io errors or whatever, + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. + */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock \n", ret); + gfs2_dump_glock(NULL, gl, true); + } + return; + } + } + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + /* + * The call to go_sync should have cleared out the ail list. + * If there are still items, we have a problem. We ought to + * withdraw, but we can't because the withdraw code also uses + * glocks. Warn about the error, dump the glock, then fall + * through and wait for logd to do the withdraw for us. + */ + if ((atomic_read(&gl->gl_ail_count) != 0) && + (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { + gfs2_assert_warn(sdp, !atomic_read(&gl->gl_ail_count)); + gfs2_dump_glock(NULL, gl, true); + } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + } gfs2_glock_hold(gl); + /* + * Check for an error encountered since we called go_sync and go_inval. + * If so, we can't withdraw from the glock code because the withdraw + * code itself uses glocks (see function signal_our_withdraw) to + * change the mount to read-only. Most importantly, we must not call + * dlm to unlock the glock until the journal is in a known good state + * (after journal replay) otherwise other nodes may use the object + * (rgrp or dinode) and then later, journal replay will corrupt the + * file system. The best we can do here is wait for the logd daemon + * to see sd_log_error and withdraw, and in the meantime, requeue the + * work for later. + * + * However, if we're just unlocking the lock (say, for unmount, when + * gfs2_gl_hash_clear calls clear_glock) and recovery is complete + * then it's okay to tell dlm to unlock it. 
+ */ + if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) + gfs2_withdraw_delayed(sdp); + if (glock_blocked_by_withdraw(gl)) { + if (target != LM_ST_UNLOCKED || + test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) { + gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); + goto out; + } + } + if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); @@ -590,8 +668,7 @@ __acquires(&gl->gl_lockref.lock) test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { finish_xmote(gl, target); gfs2_glock_queue_work(gl, 0); - } - else if (ret) { + } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); } @@ -599,7 +676,7 @@ __acquires(&gl->gl_lockref.lock) finish_xmote(gl, target); gfs2_glock_queue_work(gl, 0); } - +out: spin_lock(&gl->gl_lockref.lock); } @@ -613,7 +690,7 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) struct gfs2_holder *gh; if (!list_empty(&gl->gl_holders)) { - gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); if (test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } @@ -645,6 +722,9 @@ __acquires(&gl->gl_lockref.lock) goto out_unlock; if (nonblock) goto out_sched; + smp_mb(); + if (atomic_read(&gl->gl_revokes) != 0) + goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; @@ -1160,7 +1240,7 @@ fail: } list_add_tail(&gh->gh_list, insert_pt); do_cancel: - gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { spin_unlock(&gl->gl_lockref.lock); if (sdp->sd_lockstruct.ls_ops->lm_cancel) @@ -1194,10 +1274,9 @@ trap_recursive: int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(gfs2_withdrawn(sdp))) + if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) @@ -1241,24 +1320,32 @@ int gfs2_glock_poll(struct gfs2_holder *gh) void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned delay = 0; int fast_path = 0; spin_lock(&gl->gl_lockref.lock); + /* + * If we're in the process of file system withdraw, we cannot just + * dequeue any glocks until our journal is recovered, lest we + * introduce file system corruption. We need two exceptions to this + * rule: We need to allow unlocking of nondisk glocks and the glock + * for our own journal that needs recovery. 
+ */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + } if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); if (find_first_holder(gl) == NULL) { - if (glops->go_unlock) { - GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); - spin_unlock(&gl->gl_lockref.lock); - glops->go_unlock(gh); - spin_lock(&gl->gl_lockref.lock); - clear_bit(GLF_LOCK, &gl->gl_flags); - } if (list_empty(&gl->gl_holders) && !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && !test_bit(GLF_DEMOTE, &gl->gl_flags)) @@ -1555,7 +1642,7 @@ __acquires(&lru_lock) list_sort(NULL, list, glock_cmp); while(!list_empty(list)) { - gl = list_entry(list->next, struct gfs2_glock, gl_lru); + gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: @@ -1596,7 +1683,7 @@ static long gfs2_scan_glock_lru(int nr) spin_lock(&lru_lock); while ((nr-- >= 0) && !list_empty(&lru_list)) { - gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru); /* Test for being demotable */ if (!test_bit(GLF_LOCK, &gl->gl_flags)) { diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 061d22e1ceb6..9e9c7a4b8c66 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -29,6 +29,8 @@ struct workqueue_struct *gfs2_freeze_wq; +extern struct workqueue_struct *gfs2_control_wq; + static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) { fs_err(gl->gl_name.ln_sbd, @@ -39,7 +41,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); - gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n"); + gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n"); + gfs2_withdraw(gl->gl_name.ln_sbd); } /** @@ -79,34 +82,62 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, } -static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +static int gfs2_ail_empty_gl(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_trans tr; + int ret; memset(&tr, 0, sizeof(tr)); INIT_LIST_HEAD(&tr.tr_buf); INIT_LIST_HEAD(&tr.tr_databuf); tr.tr_revokes = atomic_read(&gl->gl_ail_count); - if (!tr.tr_revokes) - return; + if (!tr.tr_revokes) { + bool have_revokes; + bool log_in_flight; + + /* + * We have nothing on the ail, but there could be revokes on + * the sdp revoke queue, in which case, we still want to flush + * the log and wait for it to finish. + * + * If the sdp revoke list is empty too, we might still have an + * io outstanding for writing revokes, so we should wait for + * it before returning. + * + * If none of these conditions are true, our revokes are all + * flushed and we can return. 
+ */ + gfs2_log_lock(sdp); + have_revokes = !list_empty(&sdp->sd_log_revokes); + log_in_flight = atomic_read(&sdp->sd_log_in_flight); + gfs2_log_unlock(sdp); + if (have_revokes) + goto flush; + if (log_in_flight) + log_flush_wait(sdp); + return 0; + } /* A shortened, inline version of gfs2_trans_begin() * tr->alloced is not set since the transaction structure is * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); tr.tr_ip = _RET_IP_; - if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) - return; + ret = gfs2_log_reserve(sdp, tr.tr_reserved); + if (ret < 0) + return ret; WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); +flush: gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_EMPTY_GL); + return 0; } void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) @@ -140,35 +171,32 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) * return to caller to demote/unlock the glock until I/O is complete. */ -static void rgrp_go_sync(struct gfs2_glock *gl) +static int rgrp_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; - struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); int error; - spin_lock(&gl->gl_lockref.lock); - rgd = gl->gl_object; - if (rgd) - gfs2_rgrp_brelse(rgd); - spin_unlock(&gl->gl_lockref.lock); - if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) - return; + return 0; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + WARN_ON_ONCE(error); mapping_set_error(mapping, error); - gfs2_ail_empty_gl(gl); + if (!error) + error = gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_lockref.lock); rgd = gl->gl_object; if (rgd) gfs2_free_clones(rgd); spin_unlock(&gl->gl_lockref.lock); + return error; } /** @@ -191,7 +219,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (rgd) @@ -236,12 +263,12 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip) * */ -static void inode_go_sync(struct gfs2_glock *gl) +static int inode_go_sync(struct gfs2_glock *gl) { struct gfs2_inode *ip = gfs2_glock2inode(gl); int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); - int error; + int error = 0; if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) @@ -274,6 +301,7 @@ static void inode_go_sync(struct gfs2_glock *gl) out: gfs2_clear_glop_pending(ip); + return error; } /** @@ -291,8 +319,6 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_inode *ip = gfs2_glock2inode(gl); - gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count)); - if (flags & DIO_METADATA) { struct address_space *mapping = gfs2_glock2aspace(gl); truncate_inode_pages(mapping, 0); @@ -496,24 +522,29 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl, * */ -static void freeze_go_sync(struct gfs2_glock *gl) +static int freeze_go_sync(struct gfs2_glock *gl) { int error = 0; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (gl->gl_state == LM_ST_SHARED && + if (gl->gl_state == LM_ST_SHARED && 
!gfs2_withdrawn(sdp) && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); error = freeze_super(sdp->sd_vfs); if (error) { fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error); + if (gfs2_withdrawn(sdp)) { + atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); + return 0; + } gfs2_assert_withdraw(sdp, 0); } queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | GFS2_LFC_FREEZE_GO_SYNC); } + return 0; } /** @@ -582,8 +613,76 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) } } +/** + * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it + * @gl: glock being freed + * + * For now, this is only used for the journal inode glock. In withdraw + * situations, we need to wait for the glock to be freed so that we know + * other nodes may proceed with recovery / journal replay. + */ +static void inode_go_free(struct gfs2_glock *gl) +{ + /* Note that we cannot reference gl_object because it's already set + * to NULL by this point in its lifecycle. */ + if (!test_bit(GLF_FREEING, &gl->gl_flags)) + return; + clear_bit_unlock(GLF_FREEING, &gl->gl_flags); + wake_up_bit(&gl->gl_flags, GLF_FREEING); +} + +/** + * nondisk_go_callback - used to signal when a node did a withdraw + * @gl: the nondisk glock + * @remote: true if this came from a different cluster node + * + */ +static void nondisk_go_callback(struct gfs2_glock *gl, bool remote) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + /* Ignore the callback unless it's from another node, and it's the + live lock. */ + if (!remote || gl->gl_name.ln_number != GFS2_LIVE_LOCK) + return; + + /* First order of business is to cancel the demote request. We don't + * really want to demote a nondisk glock. At best it's just to inform + * us of another node's withdraw. We'll keep it in SH mode. */ + clear_bit(GLF_DEMOTE, &gl->gl_flags); + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + + /* Ignore the unlock if we're withdrawn, unmounting, or in recovery. */ + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || + test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || + test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) + return; + + /* We only care when a node wants us to unlock, because that means + * they want a journal recovered. */ + if (gl->gl_demote_state != LM_ST_UNLOCKED) + return; + + if (sdp->sd_args.ar_spectator) { + fs_warn(sdp, "Spectator node cannot recover journals.\n"); + return; + } + + fs_warn(sdp, "Some node has withdrawn; checking for recovery.\n"); + set_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags); + /* + * We can't call remote_withdraw directly here or gfs2_recover_journal + * because this is called from the glock unlock function and the + * remote_withdraw needs to enqueue and dequeue the same "live" glock + * we were called from. So we queue it to the control work queue in + * lock_dlm. 
+ */ + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); +} + const struct gfs2_glock_operations gfs2_meta_glops = { .go_type = LM_TYPE_META, + .go_flags = GLOF_NONDISK, }; const struct gfs2_glock_operations gfs2_inode_glops = { @@ -594,13 +693,13 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, .go_flags = GLOF_ASPACE | GLOF_LRU, + .go_free = inode_go_free, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, - .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_LVB, @@ -611,30 +710,34 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { .go_xmote_bh = freeze_go_xmote_bh, .go_demote_ok = freeze_go_demote_ok, .go_type = LM_TYPE_NONDISK, + .go_flags = GLOF_NONDISK, }; const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, - .go_flags = GLOF_LRU, + .go_flags = GLOF_LRU | GLOF_NONDISK, }; const struct gfs2_glock_operations gfs2_flock_glops = { .go_type = LM_TYPE_FLOCK, - .go_flags = GLOF_LRU, + .go_flags = GLOF_LRU | GLOF_NONDISK, }; const struct gfs2_glock_operations gfs2_nondisk_glops = { .go_type = LM_TYPE_NONDISK, + .go_flags = GLOF_NONDISK, + .go_callback = nondisk_go_callback, }; const struct gfs2_glock_operations gfs2_quota_glops = { .go_type = LM_TYPE_QUOTA, - .go_flags = GLOF_LVB | GLOF_LRU, + .go_flags = GLOF_LVB | GLOF_LRU | GLOF_NONDISK, }; const struct gfs2_glock_operations gfs2_journal_glops = { .go_type = LM_TYPE_JOURNAL, + .go_flags = GLOF_NONDISK, }; const struct gfs2_glock_operations *gfs2_glops_list[] = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 9fd88ed18807..84a824293a78 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -234,20 +234,21 @@ struct lm_lockname { struct gfs2_glock_operations { - void (*go_sync) (struct gfs2_glock *gl); + int (*go_sync) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); - void (*go_unlock) (struct gfs2_holder *gh); void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); + void (*go_free)(struct gfs2_glock *gl); const int go_type; const unsigned long go_flags; -#define GLOF_ASPACE 1 -#define GLOF_LVB 2 -#define GLOF_LRU 4 +#define GLOF_ASPACE 1 /* address space attached */ +#define GLOF_LVB 2 /* Lock Value Block attached */ +#define GLOF_LRU 4 /* LRU managed */ +#define GLOF_NONDISK 8 /* not I/O related */ }; enum { @@ -294,6 +295,7 @@ struct gfs2_qadata { /* quota allocation data */ struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS]; struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS]; unsigned int qa_qd_num; + int qa_ref; }; /* Resource group multi-block reservation, in order of appearance: @@ -343,6 +345,7 @@ enum { GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, GLF_INODE_CREATING = 16, /* Inode creation occurring */ + GLF_FREEING = 18, /* Wait for glock to be freed */ }; struct gfs2_glock { @@ -542,6 +545,7 @@ struct gfs2_jdesc { struct list_head jd_revoke_list; unsigned int jd_replay_tail; + u64 jd_no_addr; }; struct gfs2_statfs_change_host { @@ -616,8 +620,12 @@ enum { SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, - 
SDF_AIL1_IO_ERROR = 10, - SDF_FS_FROZEN = 11, + SDF_FS_FROZEN = 10, + SDF_WITHDRAWING = 11, /* Will withdraw eventually */ + SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */ + SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ + SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are + withdrawing */ }; enum gfs2_freeze_state { @@ -768,6 +776,7 @@ struct gfs2_sbd { struct gfs2_jdesc *sd_jdesc; struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; + struct gfs2_glock *sd_jinode_gl; struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; @@ -828,7 +837,8 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; + int sd_log_error; /* First log error */ + wait_queue_head_t sd_withdraw_wait; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; @@ -852,6 +862,7 @@ struct gfs2_sbd { unsigned long sd_last_warning; struct dentry *debugfs_dir; /* debugfs directory */ + unsigned long sd_glock_dqs_held; }; static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8294851a9dd9..70b2d3a1e866 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -144,7 +144,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) - goto fail_put; + goto fail; if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { /* @@ -155,13 +155,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); if (error) - goto fail_put; + goto fail; if (blktype != GFS2_BLKST_FREE) { error = gfs2_check_blk_type(sdp, no_addr, blktype); if (error) - goto fail_put; + goto fail; } } @@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) - goto fail_put; + goto fail; glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) - goto fail_refresh; + goto fail; } else { ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); @@ -197,17 +197,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_dq_uninit(&i_gh); return inode; -fail_refresh: - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - gfs2_glock_dq_uninit(&ip->i_iopen_gh); -fail_put: +fail: if (io_gl) gfs2_glock_put(io_gl); - glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); -fail: iget_failed(inode); return ERR_PTR(error); } @@ -594,13 +588,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; - error = gfs2_rsqa_alloc(dip); + error = gfs2_qa_get(dip); if (error) return error; error = gfs2_rindex_update(sdp); if (error) - return error; + goto fail; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) @@ -647,7 +641,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; ip = GFS2_I(inode); - error = gfs2_rsqa_alloc(ip); + error = 
gfs2_qa_get(ip); if (error) goto fail_free_acls; @@ -782,11 +776,13 @@ fail_gunlock2: clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_put(io_gl); fail_free_inode: + gfs2_qa_put(ip); if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); } - gfs2_rsqa_delete(ip, NULL); + gfs2_rs_delete(ip, NULL); + gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); posix_acl_release(acl); @@ -804,6 +800,7 @@ fail_gunlock: if (gfs2_holder_initialized(ghs + 1)) gfs2_glock_dq_uninit(ghs + 1); fail: + gfs2_qa_put(dip); return error; } @@ -905,7 +902,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (S_ISDIR(inode->i_mode)) return -EPERM; - error = gfs2_rsqa_alloc(dip); + error = gfs2_qa_get(dip); if (error) return error; @@ -1008,6 +1005,7 @@ out_gunlock: out_child: gfs2_glock_dq(ghs); out_parent: + gfs2_qa_put(ip); gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); return error; @@ -1368,7 +1366,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) return error; - error = gfs2_rsqa_alloc(ndip); + error = gfs2_qa_get(ndip); if (error) return error; @@ -1568,6 +1566,7 @@ out_gunlock_r: if (gfs2_holder_initialized(&r_gh)) gfs2_glock_dq_uninit(&r_gh); out: + gfs2_qa_put(ndip); return error; } @@ -1879,10 +1878,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) ouid = nuid = NO_UID_QUOTA_CHANGE; if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - - error = gfs2_rsqa_alloc(ip); + error = gfs2_qa_get(ip); if (error) - goto out; + return error; error = gfs2_rindex_update(sdp); if (error) @@ -1920,6 +1918,7 @@ out_end_trans: out_gunlock_q: gfs2_quota_unlock(ip); out: + gfs2_qa_put(ip); return error; } @@ -1941,21 +1940,21 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) struct gfs2_holder i_gh; int error; - error = gfs2_rsqa_alloc(ip); + error = gfs2_qa_get(ip); if (error) return error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) - return error; + goto out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; + goto error; error = setattr_prepare(dentry, attr); if (error) - goto out; + goto error; if (attr->ia_valid & ATTR_SIZE) error = gfs2_setattr_size(inode, attr->ia_size); @@ -1967,10 +1966,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) error = posix_acl_chmod(inode, inode->i_mode); } -out: +error: if (!error) mark_inode_dirty(inode); gfs2_glock_dq_uninit(&i_gh); +out: + gfs2_qa_put(ip); return error; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 7c7197343ee2..9f2b5609f225 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -16,6 +16,8 @@ #include "incore.h" #include "glock.h" +#include "glops.h" +#include "recovery.h" #include "util.h" #include "sys.h" #include "trace_gfs2.h" @@ -124,6 +126,8 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + if (gl->gl_ops->go_free) + gl->gl_ops->go_free(gl); gfs2_glock_free(gl); return; case -DLM_ECANCEL: /* Cancel while getting lock */ @@ -323,6 +327,7 @@ static void gdlm_cancel(struct gfs2_glock *gl) /* * dlm/gfs2 recovery coordination using dlm_recover callbacks * + * 0. gfs2 checks for another cluster node withdraw, needing journal replay * 1. dlm_controld sees lockspace members change * 2. dlm_controld blocks dlm-kernel locking activity * 3. 
dlm_controld within dlm-kernel notifies gfs2 (recover_prep) @@ -571,6 +576,28 @@ static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) &ls->ls_control_lksb, "control_lock"); } +/** + * remote_withdraw - react to a node withdrawing from the file system + * @sdp: The superblock + */ +static void remote_withdraw(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + int ret = 0, count = 0; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) + continue; + ret = gfs2_recover_journal(jd, true); + if (ret) + break; + count++; + } + + /* Now drop the additional reference we acquired */ + fs_err(sdp, "Journals checked: %d, ret = %d.\n", count, ret); +} + static void gfs2_control_func(struct work_struct *work) { struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); @@ -581,6 +608,13 @@ static void gfs2_control_func(struct work_struct *work) int recover_size; int i, error; + /* First check for other nodes that may have done a withdraw. */ + if (test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) { + remote_withdraw(sdp); + clear_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags); + return; + } + spin_lock(&ls->ls_recover_spin); /* * No MOUNT_DONE means we're still mounting; control_mount() @@ -1079,6 +1113,10 @@ static void gdlm_recover_prep(void *arg) struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_prep ignored due to withdraw.\n"); + return; + } spin_lock(&ls->ls_recover_spin); ls->ls_recover_block = ls->ls_recover_start; set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); @@ -1101,6 +1139,11 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int jid = slot->slot - 1; + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n", + jid); + return; + } spin_lock(&ls->ls_recover_spin); if (ls->ls_recover_size < jid + 1) { fs_err(sdp, "recover_slot jid %d gen %u short size %d\n", @@ -1125,6 +1168,10 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_done ignored due to withdraw.\n"); + return; + } /* ensure the ls jid arrays are large enough */ set_recover_size(sdp, slots, num_slots); @@ -1152,6 +1199,11 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n", + jid); + return; + } if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) return; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 00a2e721a374..3a75843ae580 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -88,8 +88,7 @@ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_trans *tr, - bool *withdraw) + struct gfs2_trans *tr) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -97,6 +96,7 @@ __acquires(&sdp->sd_ail_lock) struct address_space *mapping; struct gfs2_bufdata *bd, *s; struct buffer_head *bh; + int ret = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; @@ -104,16 +104,21 @@ __acquires(&sdp->sd_ail_lock) gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { - if (!buffer_uptodate(bh) && - 
!test_and_set_bit(SDF_AIL1_IO_ERROR, - &sdp->sd_flags)) { + if (buffer_uptodate(bh)) { + list_move(&bd->bd_ail_st_list, + &tr->tr_ail2_list); + continue; + } + if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) { gfs2_io_error_bh(sdp, bh); - *withdraw = true; + gfs2_withdraw_delayed(sdp); } - list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); - continue; } + if (gfs2_withdrawn(sdp)) { + gfs2_remove_from_ail(bd); + continue; + } if (!buffer_dirty(bh)) continue; if (gl == bd->bd_gl) @@ -124,16 +129,50 @@ __acquires(&sdp->sd_ail_lock) if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); - generic_writepages(mapping, wbc); + ret = generic_writepages(mapping, wbc); spin_lock(&sdp->sd_ail_lock); - if (wbc->nr_to_write <= 0) + if (ret || wbc->nr_to_write <= 0) break; - return 1; + return -EBUSY; } - return 0; + return ret; } +static void dump_ail_list(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n", + current->journal_info ? 1 : 0); + + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_reverse(bd, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd, + (unsigned long long)bd->bd_blkno, bh); + if (!bh) { + fs_err(sdp, "\n"); + continue; + } + fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d " + "map:%d new:%d ar:%d aw:%d delay:%d " + "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n", + (unsigned long long)bh->b_blocknr, + buffer_uptodate(bh), buffer_dirty(bh), + buffer_locked(bh), buffer_req(bh), + buffer_mapped(bh), buffer_new(bh), + buffer_async_read(bh), buffer_async_write(bh), + buffer_delay(bh), buffer_write_io_error(bh), + buffer_unwritten(bh), + buffer_defer_completion(bh), + buffer_pinned(bh), buffer_escaped(bh)); + } + } +} /** * gfs2_ail1_flush - start writeback of some ail1 entries @@ -149,23 +188,36 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) struct list_head *head = &sdp->sd_ail1_list; struct gfs2_trans *tr; struct blk_plug plug; - bool withdraw = false; + int ret; + unsigned long flush_start = jiffies; trace_gfs2_ail_flush(sdp, wbc, 1); blk_start_plug(&plug); spin_lock(&sdp->sd_ail_lock); restart: + ret = 0; + if (time_after(jiffies, flush_start + (HZ * 600))) { + dump_ail_list(sdp); + goto out; + } list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) && - !gfs2_withdrawn(sdp)) - goto restart; + ret = gfs2_ail1_start_one(sdp, wbc, tr); + if (ret) { + if (ret == -EBUSY) + goto restart; + break; + } } +out: spin_unlock(&sdp->sd_ail_lock); blk_finish_plug(&plug); - if (withdraw) - gfs2_lm_withdraw(sdp, NULL); + if (ret) { + gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) " + "returned: %d\n", ret); + gfs2_withdraw(sdp); + } trace_gfs2_ail_flush(sdp, wbc, 0); } @@ -189,12 +241,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) /** * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced * @sdp: the filesystem - * @ai: the AIL entry + * @tr: the transaction + * @max_revokes: If nonzero, issue revokes for the bd items for written buffers * */ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - bool *withdraw) + int *max_revokes) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -203,12 +256,32 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, bd_ail_st_list) { bh = bd->bd_bh; 
gfs2_assert(sdp, bd->bd_tr == tr); - if (buffer_busy(bh)) + /* + * If another process flagged an io error, e.g. writing to the + * journal, error all other bhs and move them off the ail1 to + * prevent a tight loop when unmount tries to flush ail1, + * regardless of whether they're still busy. If no outside + * errors were found and the buffer is busy, move to the next. + * If the ail buffer is not busy and caught an error, flag it + * for others. + */ + if (!sdp->sd_log_error && buffer_busy(bh)) continue; if (!buffer_uptodate(bh) && - !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { + !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { gfs2_io_error_bh(sdp, bh); - *withdraw = true; + gfs2_withdraw_delayed(sdp); + } + /* + * If we have space for revokes and the bd is no longer on any + * buf list, we can just add a revoke for it immediately and + * avoid having to put it on the ail2 list, where it would need + * to be revoked later. + */ + if (*max_revokes && list_empty(&bd->bd_list)) { + gfs2_add_revoke(sdp, bd); + (*max_revokes)--; + continue; } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } @@ -217,20 +290,20 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, /** * gfs2_ail1_empty - Try to empty the ail1 lists * @sdp: The superblock + * @max_revokes: If non-zero, add revokes where appropriate * * Tries to empty the ail1 lists, starting with the oldest first */ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp) +static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) { struct gfs2_trans *tr, *s; int oldest_tr = 1; int ret; - bool withdraw = false; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { - gfs2_ail1_empty_one(sdp, tr, &withdraw); + gfs2_ail1_empty_one(sdp, tr, &max_revokes); if (list_empty(&tr->tr_ail1_list) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else @@ -239,8 +312,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); - if (withdraw) - gfs2_lm_withdraw(sdp, "fatal: I/O error(s)\n"); + if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) { + gfs2_lm(sdp, "fatal: I/O error(s)\n"); + gfs2_withdraw(sdp); + } return ret; } @@ -268,20 +343,17 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp) } /** - * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced - * @sdp: the filesystem - * @ai: the AIL entry - * + * gfs2_ail_empty_tr - empty one of the ail lists for a transaction */ -static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + struct list_head *head) { - struct list_head *head = &tr->tr_ail2_list; struct gfs2_bufdata *bd; while (!list_empty(head)) { - bd = list_entry(head->prev, struct gfs2_bufdata, - bd_ail_st_list); + bd = list_first_entry(head, struct gfs2_bufdata, + bd_ail_st_list); gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } @@ -303,7 +375,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) if (!rm) continue; - gfs2_ail2_empty_one(sdp, tr); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); @@ -487,7 +559,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; } else { - tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr = 
list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans, tr_list); tail = tr->tr_first; } @@ -512,7 +584,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) } -static void log_flush_wait(struct gfs2_sbd *sdp) +void log_flush_wait(struct gfs2_sbd *sdp) { DEFINE_WAIT(wait); @@ -549,7 +621,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_ordered_lock); list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_ordered)) { - ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered); + ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); if (ip->i_inode.i_mapping->nrpages == 0) { test_and_clear_bit(GIF_ORDERED, &ip->i_flags); list_del(&ip->i_ordered); @@ -570,7 +642,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_ordered)) { - ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered); + ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); list_del(&ip->i_ordered); WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); if (ip->i_inode.i_mapping->nrpages == 0) @@ -616,27 +688,24 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl) } } +/** + * gfs2_write_revokes - Add as many revokes to the system transaction as we can + * @sdp: The GFS2 superblock + * + * Our usual strategy is to defer writing revokes as much as we can in the hope + * that we'll eventually overwrite the journal, which will make those revokes + * go away. This changes when we flush the log: at that point, there will + * likely be some left-over space in the last revoke block of that transaction. + * We can fill that space with additional revokes for blocks that have already + * been written back. This will basically come at no cost now, and will save + * us from having to keep track of those blocks on the AIL2 list later. 
+ */ void gfs2_write_revokes(struct gfs2_sbd *sdp) { - struct gfs2_trans *tr; - struct gfs2_bufdata *bd, *tmp; - int have_revokes = 0; + /* number of revokes we still have room for */ int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); - gfs2_ail1_empty(sdp); - spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { - list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { - if (list_empty(&bd->bd_list)) { - have_revokes = 1; - goto done; - } - } - } -done: - spin_unlock(&sdp->sd_ail_lock); - if (have_revokes == 0) - return; + gfs2_log_lock(sdp); while (sdp->sd_log_num_revoke > max_revokes) max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); max_revokes -= sdp->sd_log_num_revoke; @@ -647,20 +716,7 @@ done: if (!sdp->sd_log_blks_reserved) atomic_dec(&sdp->sd_log_blks_free); } - gfs2_log_lock(sdp); - spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { - list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { - if (max_revokes == 0) - goto out_of_blocks; - if (!list_empty(&bd->bd_list)) - continue; - gfs2_add_revoke(sdp, bd); - max_revokes--; - } - } -out_of_blocks: - spin_unlock(&sdp->sd_ail_lock); + gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); if (!sdp->sd_log_num_revoke) { @@ -787,6 +843,40 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) } /** + * ail_drain - drain the ail lists after a withdraw + * @sdp: Pointer to GFS2 superblock + */ +static void ail_drain(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + + spin_lock(&sdp->sd_ail_lock); + /* + * For transactions on the sd_ail1_list we need to drain both the + * ail1 and ail2 lists. That's because function gfs2_ail1_start_one + * (temporarily) moves items from its tr_ail1 list to tr_ail2 list + * before revokes are sent for that block. Items on the sd_ail2_list + * should have already gotten beyond that point, so no need. + */ + while (!list_empty(&sdp->sd_ail1_list)) { + tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans, + tr_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + kfree(tr); + } + while (!list_empty(&sdp->sd_ail2_list)) { + tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans, + tr_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + kfree(tr); + } + spin_unlock(&sdp->sd_ail_lock); +} + +/** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem * @gl: The glock structure to flush. 
If NULL, flush the whole incore log @@ -796,11 +886,18 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) { - struct gfs2_trans *tr; + struct gfs2_trans *tr = NULL; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); down_write(&sdp->sd_log_flush_lock); + /* + * Do this check while holding the log_flush_lock to prevent new + * buffers from being added to the ail via gfs2_pin() + */ + if (gfs2_withdrawn(sdp)) + goto out; + /* Log might have been flushed while we waited for the flush lock */ if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { up_write(&sdp->sd_log_flush_lock); @@ -819,17 +916,27 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) INIT_LIST_HEAD(&tr->tr_ail2_list); tr->tr_first = sdp->sd_log_flush_head; if (unlikely (state == SFS_FROZEN)) - gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new); + if (gfs2_assert_withdraw_delayed(sdp, + !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) + goto out; } if (unlikely(state == SFS_FROZEN)) - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - gfs2_assert_withdraw(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke); + if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) + goto out; + if (gfs2_assert_withdraw_delayed(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) + goto out; gfs2_ordered_write(sdp); + if (gfs2_withdrawn(sdp)) + goto out; lops_before_commit(sdp, tr); + if (gfs2_withdrawn(sdp)) + goto out; gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + if (gfs2_withdrawn(sdp)) + goto out; if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); @@ -839,6 +946,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); } + if (gfs2_withdrawn(sdp)) + goto out; lops_after_commit(sdp, tr); gfs2_log_lock(sdp); @@ -859,9 +968,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) for (;;) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp)) + if (gfs2_ail1_empty(sdp, 0)) break; } + if (gfs2_withdrawn(sdp)) + goto out; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); @@ -874,6 +985,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } +out: + if (gfs2_withdrawn(sdp)) { + ail_drain(sdp); /* frees all transactions */ + tr = NULL; + } + trace_gfs2_log_flush(sdp, 0, flags); up_write(&sdp->sd_log_flush_lock); @@ -1016,16 +1133,17 @@ int gfs2_logd(void *data) /* Check for errors writing to the journal */ if (sdp->sd_log_error) { - gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: error %d: " - "withdrawing the file system to " - "prevent further damage.\n", - sdp->sd_fsname, sdp->sd_log_error); + gfs2_lm(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + gfs2_withdraw(sdp); } did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { - gfs2_ail1_empty(sdp); + gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_LOGD_JFLUSH_REQD); did_flush = true; @@ -1034,7 +1152,7 @@ int gfs2_logd(void *data) if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - gfs2_ail1_empty(sdp); + gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, 
NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_LOGD_AIL_FLUSH_REQD); did_flush = true; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index c0a65e5a126b..c1cd6ae17659 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -73,6 +73,7 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); +extern void log_flush_wait(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index c090d5ad3f22..5ea96757afc4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -203,8 +203,12 @@ static void gfs2_end_log_write(struct bio *bio) struct bvec_iter_all iter_all; if (bio->bi_status) { - fs_err(sdp, "Error %d writing to journal, jid=%u\n", - bio->bi_status, sdp->sd_jdesc->jd_jid); + if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status)) + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + gfs2_withdraw_delayed(sdp); + /* prevent more writes to the journal */ + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); wake_up(&sdp->sd_logd_waitq); } @@ -730,7 +734,7 @@ static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) head = &tr->tr_buf; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gfs2_unpin(sdp, bd->bd_bh, tr); } @@ -900,7 +904,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) struct gfs2_glock *gl; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; gfs2_glock_remove_revoke(gl); @@ -1079,7 +1083,7 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) head = &tr->tr_databuf; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gfs2_unpin(sdp, bd->bd_bh, tr); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0c3772974030..4b72abcf83b2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -251,7 +251,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(gfs2_withdrawn(sdp))) { + if (unlikely(gfs2_withdrawn(sdp)) && + (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) { *bhp = NULL; return -EIO; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a1a8ef7ed3fd..e2b69ffcc6a8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -552,6 +552,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) mutex_lock(&sdp->sd_jindex_mutex); for (;;) { + struct gfs2_inode *jip; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); if (error) break; @@ -591,6 +593,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) spin_lock(&sdp->sd_jindex_spin); jd->jd_jid = sdp->sd_journals++; + jip = GFS2_I(jd->jd_inode); + jd->jd_no_addr = jip->i_no_addr; list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); spin_unlock(&sdp->sd_jindex_spin); } @@ -600,48 +604,6 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) 
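A recurring pattern in the gfs2 hunks above is replacing the old test_and_set_bit(SDF_AIL1_IO_ERROR, ...) reporting with cmpxchg(&sdp->sd_log_error, 0, err), so that only the very first I/O error is stored and logged and later errors stay quiet (see gfs2_ail1_start_one(), gfs2_ail1_empty_one() and gfs2_end_log_write()). The following is a minimal, standalone C11 sketch of that first-error latch; record_first_error and log_error are invented names for illustration, not kernel symbols.

#include <stdatomic.h>
#include <stdio.h>

/* 0 means "no error latched yet"; stands in for sdp->sd_log_error. */
static atomic_int log_error;

/*
 * Latch err only if no error has been recorded before, mirroring
 * cmpxchg(&sdp->sd_log_error, 0, err) in the patch. Returns 1 when
 * this call stored the error, 0 when an earlier error is already set.
 */
static int record_first_error(int err)
{
	int expected = 0;

	if (atomic_compare_exchange_strong(&log_error, &expected, err)) {
		fprintf(stderr, "first error recorded: %d\n", err);
		return 1;
	}
	return 0;	/* keep the earlier error, report nothing further */
}

int main(void)
{
	record_first_error(-5);		/* first error: reported and stored */
	record_first_error(-28);	/* later error: ignored, -5 is kept  */
	printf("latched error: %d\n", atomic_load(&log_error));
	return 0;
}

The point of the idiom is that the error value itself doubles as the "already reported" flag, so no separate status bit is needed and readers such as gfs2_logd() can act on the stored errno directly.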
return error; } -/** - * check_journal_clean - Make sure a journal is clean for a spectator mount - * @sdp: The GFS2 superblock - * @jd: The journal descriptor - * - * Returns: 0 if the journal is clean or locked, else an error - */ -static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) -{ - int error; - struct gfs2_holder j_gh; - struct gfs2_log_header_host head; - struct gfs2_inode *ip; - - ip = GFS2_I(jd->jd_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | - GL_EXACT | GL_NOCACHE, &j_gh); - if (error) { - fs_err(sdp, "Error locking journal for spectator mount.\n"); - return -EPERM; - } - error = gfs2_jdesc_check(jd); - if (error) { - fs_err(sdp, "Error checking journal for spectator mount.\n"); - goto out_unlock; - } - error = gfs2_find_jhead(jd, &head, false); - if (error) { - fs_err(sdp, "Error parsing journal for spectator mount.\n"); - goto out_unlock; - } - if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { - error = -EPERM; - fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter " - "must not be a spectator.\n", jd->jd_jid); - } - -out_unlock: - gfs2_glock_dq_uninit(&j_gh); - return error; -} - static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = d_inode(sdp->sd_master_dir); @@ -694,7 +656,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, &gfs2_journal_glops, - LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE, &sdp->sd_journal_gh); if (error) { fs_err(sdp, "can't acquire journal glock: %d\n", error); @@ -702,6 +665,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } ip = GFS2_I(sdp->sd_jdesc->jd_inode); + sdp->sd_jinode_gl = ip->i_gl; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, &sdp->sd_jinode_gh); @@ -732,7 +696,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x); if (sdp->sd_args.ar_spectator) { - error = check_journal_clean(sdp, jd); + error = check_journal_clean(sdp, jd, true); if (error) goto fail_jinode_gh; continue; @@ -762,10 +726,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) return 0; fail_jinode_gh: - if (!sdp->sd_args.ar_spectator) + /* A withdraw may have done dq/uninit so now we need to check it */ + if (!sdp->sd_args.ar_spectator && + gfs2_holder_initialized(&sdp->sd_jinode_gh)) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); fail_journal_gh: - if (!sdp->sd_args.ar_spectator) + if (!sdp->sd_args.ar_spectator && + gfs2_holder_initialized(&sdp->sd_journal_gh)) gfs2_glock_dq_uninit(&sdp->sd_journal_gh); fail_jindex: gfs2_jindex_free(sdp); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e9f93045eb01..cc0c4b5800be 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -115,7 +115,7 @@ static void gfs2_qd_dispose(struct list_head *list) struct gfs2_sbd *sdp; while (!list_empty(list)) { - qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); + qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); sdp = qd->qd_gl->gl_name.ln_sbd; list_del(&qd->qd_lru); @@ -525,11 +525,11 @@ static void qdsb_put(struct gfs2_quota_data *qd) } /** - * gfs2_qa_alloc - make sure we have a quota allocations data structure, - * if necessary + * gfs2_qa_get - make sure we have a quota allocations data structure, + * if necessary * @ip: the inode for this reservation */ -int gfs2_qa_alloc(struct gfs2_inode *ip) +int gfs2_qa_get(struct gfs2_inode *ip) { int error = 0; struct gfs2_sbd 
*sdp = GFS2_SB(&ip->i_inode); @@ -540,17 +540,21 @@ int gfs2_qa_alloc(struct gfs2_inode *ip) down_write(&ip->i_rw_mutex); if (ip->i_qadata == NULL) { ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); - if (!ip->i_qadata) + if (!ip->i_qadata) { error = -ENOMEM; + goto out; + } } + ip->i_qadata->qa_ref++; +out: up_write(&ip->i_rw_mutex); return error; } -void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_qa_put(struct gfs2_inode *ip) { down_write(&ip->i_rw_mutex); - if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { + if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) { kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); ip->i_qadata = NULL; } @@ -566,27 +570,27 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - if (ip->i_qadata == NULL) { - error = gfs2_rsqa_alloc(ip); - if (error) - return error; - } + error = gfs2_qa_get(ip); + if (error) + return error; qd = ip->i_qadata->qa_qd; if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || - gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) - return -EIO; + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) { + error = -EIO; + goto out; + } error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) - goto out; + goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) - goto out; + goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; @@ -594,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) !uid_eq(uid, ip->i_inode.i_uid)) { error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) - goto out; + goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } @@ -603,14 +607,15 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) !gid_eq(gid, ip->i_inode.i_gid)) { error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) - goto out; + goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } -out: +out_unhold: if (error) gfs2_quota_unhold(ip); +out: return error; } @@ -621,6 +626,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip) if (ip->i_qadata == NULL) return; + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { @@ -628,6 +634,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip) ip->i_qadata->qa_qd[x] = NULL; } ip->i_qadata->qa_qd_num = 0; + gfs2_qa_put(ip); } static int sort_qd(const void *a, const void *b) @@ -876,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; - error = gfs2_rsqa_alloc(ip); + error = gfs2_qa_get(ip); if (error) return error; @@ -884,8 +891,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) &data_blocks, &ind_blocks); ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); - if (!ghs) - return -ENOMEM; + if (!ghs) { + error = -ENOMEM; + goto out; + } sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); inode_lock(&ip->i_inode); @@ -893,12 +902,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); if (error) - goto out; + goto out_dq; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) - goto out; + goto out_dq; for (x = 0; x < num_qd; x++) { offset = qd2offset(qda[x]); @@ -950,13 +959,15 @@ out_ipres: gfs2_inplace_release(ip); out_alloc: 
gfs2_glock_dq_uninit(&i_gh); -out: +out_dq: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); +out: + gfs2_qa_put(ip); return error; } @@ -1259,6 +1270,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; + BUG_ON(ip->i_qadata->qa_ref <= 0); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; @@ -1441,7 +1453,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) spin_lock(&qd_lock); while (!list_empty(head)) { - qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + qd = list_last_entry(head, struct gfs2_quota_data, qd_list); list_del(&qd->qd_list); @@ -1476,8 +1488,8 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) if (error == 0 || error == -EROFS) return; if (!gfs2_withdrawn(sdp)) { - fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); - sdp->sd_log_error = error; + if (!cmpxchg(&sdp->sd_log_error, 0, error)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); wake_up(&sdp->sd_logd_waitq); } } @@ -1504,7 +1516,7 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp) ip = NULL; spin_lock(&sdp->sd_trunc_lock); if (!list_empty(&sdp->sd_trunc_list)) { - ip = list_entry(sdp->sd_trunc_list.next, + ip = list_first_entry(&sdp->sd_trunc_list, struct gfs2_inode, i_trunc_list); list_del_init(&ip->i_trunc_list); } @@ -1541,6 +1553,8 @@ int gfs2_quotad(void *data) while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) + goto bypass; /* Update the master statfs file */ if (sdp->sd_statfs_force_sync) { int error = gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -1561,6 +1575,7 @@ int gfs2_quotad(void *data) try_to_freeze(); +bypass: t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); @@ -1674,7 +1689,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) return error; - error = gfs2_rsqa_alloc(ip); + error = gfs2_qa_get(ip); if (error) goto out_put; @@ -1743,6 +1758,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: + gfs2_qa_put(ip); inode_unlock(&ip->i_inode); out_put: qd_put(qd); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 765627d9a91e..7f9ca8ef40fc 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -15,8 +15,8 @@ struct gfs2_sbd; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_qa_alloc(struct gfs2_inode *ip); -extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern int gfs2_qa_get(struct gfs2_inode *ip); +extern void gfs2_qa_put(struct gfs2_inode *ip); extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unhold(struct gfs2_inode *ip); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 85f830e56945..96c345f49273 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -111,7 +111,7 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) struct gfs2_revoke_replay *rr; while (!list_empty(head)) { - rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list); list_del(&rr->rr_list); kfree(rr); } @@ -305,6 +305,11 @@ void gfs2_recover_func(struct work_struct *work) int error = 0; int jlocked = 0; + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", + jd->jd_jid); + goto fail; + } t_start = ktime_get(); if 
(sdp->sd_args.ar_spectator) goto fail; @@ -393,6 +398,10 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n", jd->jd_jid, head.lh_tail, head.lh_blkno); + /* We take the sd_log_flush_lock here primarily to prevent log + * flushes and simultaneous journal replays from stomping on + * each other wrt sd_log_bio. */ + down_read(&sdp->sd_log_flush_lock); for (pass = 0; pass < 2; pass++) { lops_before_scan(jd, &head, pass); error = foreach_descriptor(jd, head.lh_tail, @@ -403,6 +412,7 @@ void gfs2_recover_func(struct work_struct *work) } clean_journal(jd, &head); + up_read(&sdp->sd_log_flush_lock); gfs2_glock_dq_uninit(&thaw_gh); t_rep = ktime_get(); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index e7bf91ec231c..a321c34e3d6e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -457,24 +457,24 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } if (count[0] != rgd->rd_free) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "free data mismatch: %u != %u\n", - count[0], rgd->rd_free); + gfs2_lm(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_free); + gfs2_consist_rgrpd(rgd); return; } tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; if (count[1] != tmp) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "used data mismatch: %u != %u\n", - count[1], tmp); + gfs2_lm(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + gfs2_consist_rgrpd(rgd); return; } if (count[2] + count[3] != rgd->rd_dinodes) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[2] + count[3], rgd->rd_dinodes); + gfs2_lm(sdp, "used metadata mismatch: %u != %u\n", + count[2] + count[3], rgd->rd_dinodes); + gfs2_consist_rgrpd(rgd); return; } } @@ -590,16 +590,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) } } -/** - * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode - * plus a quota allocations data structure, if necessary - * @ip: the inode for this reservation - */ -int gfs2_rsqa_alloc(struct gfs2_inode *ip) -{ - return gfs2_qa_alloc(ip); -} - static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs, const char *fs_id_buf) { @@ -672,18 +662,17 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) } /** - * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation + * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation * @wcount: The inode's write count, or NULL * */ -void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) { down_write(&ip->i_rw_mutex); if ((wcount == NULL) || (atomic_read(wcount) <= 1)) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); - gfs2_qa_delete(ip, wcount); } /** @@ -720,8 +709,12 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_clear_object(gl, rgd); + if (gl->gl_state != LM_ST_UNLOCKED) { + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + flush_delayed_work(&gl->gl_work); + } gfs2_rgrp_brelse(rgd); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } @@ -733,17 +726,6 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) } } -static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - - fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); - fs_info(sdp, "ri_length = %u\n", rgd->rd_length); - fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); - fs_info(sdp, "ri_data = %u\n", rgd->rd_data); - fs_info(sdp, "ri_bitbytes = %u\n", 
rgd->rd_bitbytes); -} - /** * gfs2_compute_bitstructs - Compute the bitmap sizes * @rgd: The resource group descriptor @@ -814,11 +796,20 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) } bi = rgd->rd_bits + (length - 1); if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) { - if (gfs2_consist_rgrpd(rgd)) { - gfs2_rindex_print(rgd); - fs_err(sdp, "start=%u len=%u offset=%u\n", - bi->bi_start, bi->bi_bytes, bi->bi_offset); - } + gfs2_lm(sdp, + "ri_addr = %llu\n" + "ri_length = %u\n" + "ri_data0 = %llu\n" + "ri_data = %u\n" + "ri_bitbytes = %u\n" + "start=%u len=%u offset=%u\n", + (unsigned long long)rgd->rd_addr, + rgd->rd_length, + (unsigned long long)rgd->rd_data0, + rgd->rd_data, + rgd->rd_bitbytes, + bi->bi_start, bi->bi_bytes, bi->bi_offset); + gfs2_consist_rgrpd(rgd); return -EIO; } @@ -1286,23 +1277,6 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd) bi->bi_bh = NULL; } } - -} - -/** - * gfs2_rgrp_go_unlock - Unlock a rgrp glock - * @gh: The glock holder for the resource group - * - */ - -void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) -{ - struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; - int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) | - test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags); - - if (rgd && demote_requested) - gfs2_rgrp_brelse(rgd); } int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, @@ -1832,10 +1806,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; while (1) { - down_write(&sdp->sd_log_flush_lock); error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, true); - up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; if (WARN_ON_ONCE(error)) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index c14a673ae36f..a1d7e14fc55b 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -33,7 +33,6 @@ extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); @@ -45,9 +44,8 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); -extern int gfs2_rsqa_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 68cc7c291a81..37fc41632aa2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -61,11 +61,13 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) sdp->sd_journals = 0; spin_unlock(&sdp->sd_jindex_spin); + sdp->sd_jdesc = NULL; while (!list_empty(&list)) { - jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + jd = list_first_entry(&list, struct gfs2_jdesc, jd_list); gfs2_free_journal_extents(jd); list_del(&jd->jd_list); iput(jd->jd_inode); + jd->jd_inode = NULL; kfree(jd); } } @@ -171,9 +173,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) goto fail_threads; j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + if (gfs2_withdrawn(sdp)) { + error = 
-EIO; + goto fail; + } error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error) + if (error || gfs2_withdrawn(sdp)) goto fail; if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { @@ -187,7 +193,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (error) + if (error || gfs2_withdrawn(sdp)) goto fail; set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); @@ -446,7 +452,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) out: while (!list_empty(&list)) { - lfcc = list_entry(list.next, struct lfcc, list); + lfcc = list_first_entry(&list, struct lfcc, list); list_del(&lfcc->list); gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); @@ -599,34 +605,63 @@ out: int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { struct gfs2_holder freeze_gh; - int error; - - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, - &freeze_gh); - if (error && !gfs2_withdrawn(sdp)) - return error; + int error = 0; + int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_holder_mark_uninitialized(&freeze_gh); + if (sdp->sd_freeze_gl && + !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { + if (!log_write_allowed) { + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, + LM_ST_SHARED, GL_NOCACHE | + LM_FLAG_TRY, &freeze_gh); + if (error == GLR_TRYFAILED) + error = 0; + } else { + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, + LM_ST_SHARED, GL_NOCACHE, + &freeze_gh); + if (error && !gfs2_withdrawn(sdp)) + return error; + } + } flush_workqueue(gfs2_delete_workqueue); - if (sdp->sd_quotad_process) + if (!log_write_allowed && current == sdp->sd_quotad_process) + fs_warn(sdp, "The quotad daemon is withdrawing.\n"); + else if (sdp->sd_quotad_process) kthread_stop(sdp->sd_quotad_process); sdp->sd_quotad_process = NULL; - if (sdp->sd_logd_process) + + if (!log_write_allowed && current == sdp->sd_logd_process) + fs_warn(sdp, "The logd daemon is withdrawing.\n"); + else if (sdp->sd_logd_process) kthread_stop(sdp->sd_logd_process); sdp->sd_logd_process = NULL; - gfs2_quota_sync(sdp->sd_vfs, 0); - gfs2_statfs_sync(sdp->sd_vfs, 0); - - gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | - GFS2_LFC_MAKE_FS_RO); - wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + if (log_write_allowed) { + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | + GFS2_LFC_MAKE_FS_RO); + wait_event(sdp->sd_reserving_log_wait, + atomic_read(&sdp->sd_reserving_log) == 0); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == + sdp->sd_jdesc->jd_blocks); + } else { + wait_event_timeout(sdp->sd_reserving_log_wait, + atomic_read(&sdp->sd_reserving_log) == 0, + HZ * 5); + } if (gfs2_holder_initialized(&freeze_gh)) gfs2_glock_dq_uninit(&freeze_gh); gfs2_quota_cleanup(sdp); + if (!log_write_allowed) + sdp->sd_vfs->s_flags |= SB_RDONLY; + return error; } @@ -677,8 +712,10 @@ restart: gfs2_glock_put(sdp->sd_freeze_gl); if (!sdp->sd_args.ar_spectator) { - gfs2_glock_dq_uninit(&sdp->sd_journal_gh); - gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + if (gfs2_holder_initialized(&sdp->sd_journal_gh)) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + if (gfs2_holder_initialized(&sdp->sd_jinode_gh)) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); iput(sdp->sd_sc_inode); @@ -1356,14 +1393,6 @@ 
out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq(&ip->i_iopen_gh); - } - gfs2_holder_uninit(&ip->i_iopen_gh); - } if (gfs2_holder_initialized(&gh)) { glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); @@ -1372,22 +1401,30 @@ out_unlock: fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: truncate_inode_pages_final(&inode->i_data); - gfs2_rsqa_delete(ip, NULL); + if (ip->i_qadata) + gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); + gfs2_rs_delete(ip, NULL); + gfs2_qa_put(ip); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_clear_object(ip->i_gl, ip); - wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); - gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put_eventually(ip->i_gl); - ip->i_gl = NULL; + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); + ip->i_gl = NULL; + } if (gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; glock_clear_object(gl, ip); - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_glock_hold(gl); - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_put_eventually(gl); } } @@ -1401,6 +1438,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) return NULL; ip->i_flags = 0; ip->i_gl = NULL; + gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_rahead = 0; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b8bf811a1305..51900554ed81 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -26,7 +26,6 @@ extern void gfs2_jindex_free(struct gfs2_sbd *sdp); extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); - extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 8ccb68f4ed16..d28c41bd69b0 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -136,7 +136,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) if (val != 1) return -EINVAL; - gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); + gfs2_lm(sdp, "withdrawing from cluster at user's request\n"); + gfs2_withdraw(sdp); return len; } @@ -434,6 +435,8 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) * never clear the DFL_BLOCK_LOCKS flag, so all our locks would * permanently stop working. */ + if (!sdp->sd_jdesc) + goto out; if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator) goto out; rv = -ENOENT; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index a685637a5b55..ffe840505082 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -228,6 +228,10 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) fs_info(sdp, "GFS2:adding buf while frozen\n"); gfs2_assert_withdraw(sdp, 0); } + if (unlikely(gfs2_withdrawn(sdp))) { + fs_info(sdp, "GFS2:adding buf while withdrawn! 
0x%llx\n", + (unsigned long long)bd->bd_bh->b_blocknr); + } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index ec600b487498..9b64d40ab379 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -11,12 +11,18 @@ #include <linux/buffer_head.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> +#include <linux/delay.h> #include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "glops.h" +#include "log.h" +#include "lops.h" +#include "recovery.h" #include "rgrp.h" +#include "super.h" #include "util.h" struct kmem_cache *gfs2_glock_cachep __read_mostly; @@ -33,32 +39,257 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) fs_emerg(sdp, "fatal assertion failed\n"); } -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) +/** + * check_journal_clean - Make sure a journal is clean for a spectator mount + * @sdp: The GFS2 superblock + * @jd: The journal descriptor + * + * Returns: 0 if the journal is clean or locked, else an error + */ +int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + bool verbose) +{ + int error; + struct gfs2_holder j_gh; + struct gfs2_log_header_host head; + struct gfs2_inode *ip; + + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | + GL_EXACT | GL_NOCACHE, &j_gh); + if (error) { + if (verbose) + fs_err(sdp, "Error %d locking journal for spectator " + "mount.\n", error); + return -EPERM; + } + error = gfs2_jdesc_check(jd); + if (error) { + if (verbose) + fs_err(sdp, "Error checking journal for spectator " + "mount.\n"); + goto out_unlock; + } + error = gfs2_find_jhead(jd, &head, false); + if (error) { + if (verbose) + fs_err(sdp, "Error parsing journal for spectator " + "mount.\n"); + goto out_unlock; + } + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EPERM; + if (verbose) + fs_err(sdp, "jid=%u: Journal is dirty, so the first " + "mounter must not be a spectator.\n", + jd->jd_jid); + } + +out_unlock: + gfs2_glock_dq_uninit(&j_gh); + return error; +} + +static void signal_our_withdraw(struct gfs2_sbd *sdp) +{ + struct gfs2_glock *gl = sdp->sd_live_gh.gh_gl; + struct inode *inode = sdp->sd_jdesc->jd_inode; + struct gfs2_inode *ip = GFS2_I(inode); + u64 no_formal_ino = ip->i_no_formal_ino; + int ret = 0; + int tries; + + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + return; + + /* Prevent any glock dq until withdraw recovery is complete */ + set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + /* + * Don't tell dlm we're bailing until we have no more buffers in the + * wind. If journal had an IO error, the log code should just purge + * the outstanding buffers rather than submitting new IO. Making the + * file system read-only will flush the journal, etc. + * + * During a normal unmount, gfs2_make_fs_ro calls gfs2_log_shutdown + * which clears SDF_JOURNAL_LIVE. In a withdraw, we must not write + * any UNMOUNT log header, so we can't call gfs2_log_shutdown, and + * therefore we need to clear SDF_JOURNAL_LIVE manually. + */ + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + if (!sb_rdonly(sdp->sd_vfs)) + ret = gfs2_make_fs_ro(sdp); + + /* + * Drop the glock for our journal so another node can recover it. 
+ */ + if (gfs2_holder_initialized(&sdp->sd_journal_gh)) { + gfs2_glock_dq_wait(&sdp->sd_journal_gh); + gfs2_holder_uninit(&sdp->sd_journal_gh); + } + sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq(&sdp->sd_jinode_gh); + if (test_bit(SDF_FS_FROZEN, &sdp->sd_flags)) { + /* Make sure gfs2_unfreeze works if partially-frozen */ + flush_workqueue(gfs2_freeze_wq); + atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); + thaw_super(sdp->sd_vfs); + } else { + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); + } + + /* + * holder_uninit to force glock_put, to force dlm to let go + */ + gfs2_holder_uninit(&sdp->sd_jinode_gh); + + /* + * Note: We need to be careful here: + * Our iput of jd_inode will evict it. The evict will dequeue its + * glock, but the glock dq will wait for the withdraw unless we have + * exception code in glock_dq. + */ + iput(inode); + /* + * Wait until the journal inode's glock is freed. This allows try locks + * on other nodes to be successful, otherwise we remain the owner of + * the glock as far as dlm is concerned. + */ + if (gl->gl_ops->go_free) { + set_bit(GLF_FREEING, &gl->gl_flags); + wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); + } + + if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ + clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + goto skip_recovery; + } + /* + * Dequeue the "live" glock, but keep a reference so it's never freed. + */ + gfs2_glock_hold(gl); + gfs2_glock_dq_wait(&sdp->sd_live_gh); + /* + * We enqueue the "live" glock in EX so that all other nodes + * get a demote request and act on it. We don't really want the + * lock in EX, so we send a "try" lock with 1CB to produce a callback. + */ + fs_warn(sdp, "Requesting recovery of jid %d.\n", + sdp->sd_lockstruct.ls_jid); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP, + &sdp->sd_live_gh); + msleep(GL_GLOCK_MAX_HOLD); + /* + * This will likely fail in a cluster, but succeed standalone: + */ + ret = gfs2_glock_nq(&sdp->sd_live_gh); + + /* + * If we actually got the "live" lock in EX mode, there are no other + * nodes available to replay our journal. So we try to replay it + * ourselves. We hold the "live" glock to prevent other mounters + * during recovery, then just dequeue it and reacquire it in our + * normal SH mode. Just in case the problem that caused us to + * withdraw prevents us from recovering our journal (e.g. io errors + * and such) we still check if the journal is clean before proceeding + * but we may wait forever until another mounter does the recovery. + */ + if (ret == 0) { + fs_warn(sdp, "No other mounters found. Trying to recover our " + "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid); + if (gfs2_recover_journal(sdp->sd_jdesc, 1)) + fs_warn(sdp, "Unable to recover our journal jid %d.\n", + sdp->sd_lockstruct.ls_jid); + gfs2_glock_dq_wait(&sdp->sd_live_gh); + gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + gfs2_glock_nq(&sdp->sd_live_gh); + } + + gfs2_glock_queue_put(gl); /* drop the extra reference we acquired */ + clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + + /* + * At this point our journal is evicted, so we need to get a new inode + * for it. Once done, we need to call gfs2_find_jhead which + * calls gfs2_map_journal_extents to map it for us again. + * + * Note that we don't really want it to look up a FREE block. 
The + * GFS2_BLKST_FREE simply overrides a block check in gfs2_inode_lookup + * which would otherwise fail because it requires grabbing an rgrp + * glock, which would fail with -EIO because we're withdrawing. + */ + inode = gfs2_inode_lookup(sdp->sd_vfs, DT_UNKNOWN, + sdp->sd_jdesc->jd_no_addr, no_formal_ino, + GFS2_BLKST_FREE); + if (IS_ERR(inode)) { + fs_warn(sdp, "Reprocessing of jid %d failed with %ld.\n", + sdp->sd_lockstruct.ls_jid, PTR_ERR(inode)); + goto skip_recovery; + } + sdp->sd_jdesc->jd_inode = inode; + + /* + * Now wait until recovery is complete. + */ + for (tries = 0; tries < 10; tries++) { + ret = check_journal_clean(sdp, sdp->sd_jdesc, false); + if (!ret) + break; + msleep(HZ); + fs_warn(sdp, "Waiting for journal recovery jid %d.\n", + sdp->sd_lockstruct.ls_jid); + } +skip_recovery: + if (!ret) + fs_warn(sdp, "Journal recovery complete for jid %d.\n", + sdp->sd_lockstruct.ls_jid); + else + fs_warn(sdp, "Journal recovery skipped for %d until next " + "mount.\n", sdp->sd_lockstruct.ls_jid); + fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held); + sdp->sd_glock_dqs_held = 0; + wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY); +} + +void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...) { - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - const struct lm_lockops *lm = ls->ls_ops; - va_list args; struct va_format vaf; + va_list args; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && - test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) - return 0; - - if (fmt) { - va_start(args, fmt); + test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + fs_err(sdp, "%pV", &vaf); + va_end(args); +} - vaf.fmt = fmt; - vaf.va = &args; +int gfs2_withdraw(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; - fs_err(sdp, "%pV", &vaf); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags)) + return -1; - va_end(args); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, + TASK_UNINTERRUPTIBLE); + return -1; } + set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); + signal_our_withdraw(sdp); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) @@ -69,8 +300,11 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) lm->lm_unmount(sdp); } set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); - fs_err(sdp, "withdrawn\n"); + fs_err(sdp, "File system withdrawn\n"); dump_stack(); + clear_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG); } if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) @@ -81,35 +315,45 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) 
/** * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, - const char *function, char *file, unsigned int line) +void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line, + bool delayed) { - int me; - me = gfs2_lm_withdraw(sdp, - "fatal: assertion \"%s\" failed\n" - " function = %s, file = %s, line = %u\n", - assertion, function, file, line); + if (gfs2_withdrawn(sdp)) + return; + + fs_err(sdp, + "fatal: assertion \"%s\" failed\n" + " function = %s, file = %s, line = %u\n", + assertion, function, file, line); + + /* + * If errors=panic was specified on mount, it won't help to delay the + * withdraw. + */ + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + delayed = false; + + if (delayed) + gfs2_withdraw_delayed(sdp); + else + gfs2_withdraw(sdp); dump_stack(); - return (me) ? -1 : -2; } /** * gfs2_assert_warn_i - Print a message to the console if @assertion is false - * Returns: -1 if we printed something - * -2 if we didn't */ -int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, - const char *function, char *file, unsigned int line) +void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) { if (time_before(jiffies, sdp->sd_last_warning + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) - return -2; + return; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n", @@ -127,69 +371,59 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, sdp->sd_fsname, function, file, line); sdp->sd_last_warning = jiffies; - - return -1; } /** * gfs2_consist_i - Flag a filesystem consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * 0 if it was already withdrawn */ -int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, - char *file, unsigned int line) +void gfs2_consist_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line) { - int rv; - rv = gfs2_lm_withdraw(sdp, - "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", - function, file, line); - return rv; + gfs2_lm(sdp, + "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", + function, file, line); + gfs2_withdraw(sdp); } /** * gfs2_consist_inode_i - Flag an inode consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * 0 if it was already withdrawn */ -int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, - const char *function, char *file, unsigned int line) +void gfs2_consist_inode_i(struct gfs2_inode *ip, + const char *function, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - int rv; - rv = gfs2_lm_withdraw(sdp, - "fatal: filesystem consistency error\n" - " inode = %llu %llu\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr, - function, file, line); - return rv; + + gfs2_lm(sdp, + "fatal: filesystem consistency error\n" + " inode = %llu %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + function, file, line); + gfs2_withdraw(sdp); } /** * gfs2_consist_rgrpd_i - Flag a RG consistency error 
and withdraw - * Returns: -1 if this call withdrew the machine, - * 0 if it was already withdrawn */ -int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, - const char *function, char *file, unsigned int line) +void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, + const char *function, char *file, unsigned int line) { struct gfs2_sbd *sdp = rgd->rd_sbd; char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; - int rv; sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); - rv = gfs2_lm_withdraw(sdp, - "fatal: filesystem consistency error\n" - " RG = %llu\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)rgd->rd_addr, - function, file, line); - return rv; + gfs2_lm(sdp, + "fatal: filesystem consistency error\n" + " RG = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)rgd->rd_addr, + function, file, line); + gfs2_withdraw(sdp); } /** @@ -203,12 +437,14 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, unsigned int line) { int me; - me = gfs2_lm_withdraw(sdp, - "fatal: invalid metadata block\n" - " bh = %llu (%s)\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)bh->b_blocknr, type, - function, file, line); + + gfs2_lm(sdp, + "fatal: invalid metadata block\n" + " bh = %llu (%s)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, + function, file, line); + me = gfs2_withdraw(sdp); return (me) ? -1 : -2; } @@ -223,12 +459,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, char *file, unsigned int line) { int me; - me = gfs2_lm_withdraw(sdp, - "fatal: invalid metadata block\n" - " bh = %llu (type: exp=%u, found=%u)\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)bh->b_blocknr, type, t, - function, file, line); + + gfs2_lm(sdp, + "fatal: invalid metadata block\n" + " bh = %llu (type: exp=%u, found=%u)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, t, + function, file, line); + me = gfs2_withdraw(sdp); return (me) ? 
-1 : -2; } @@ -241,12 +479,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line) { - int rv; - rv = gfs2_lm_withdraw(sdp, - "fatal: I/O error\n" - " function = %s, file = %s, line = %u\n", - function, file, line); - return rv; + gfs2_lm(sdp, + "fatal: I/O error\n" + " function = %s, file = %s, line = %u\n", + function, file, line); + return gfs2_withdraw(sdp); } /** @@ -258,14 +495,14 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (!gfs2_withdrawn(sdp)) - fs_err(sdp, - "fatal: I/O error\n" - " block = %llu\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)bh->b_blocknr, - function, file, line); + if (gfs2_withdrawn(sdp)) + return; + + fs_err(sdp, "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, function, file, line); if (withdraw) - gfs2_lm_withdraw(sdp, NULL); + gfs2_withdraw(sdp); } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index f2702bc9837c..a3542560da6f 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -36,41 +36,59 @@ do { \ } while (0) -int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, - const char *function, char *file, unsigned int line); +void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line, + bool delayed); #define gfs2_assert_withdraw(sdp, assertion) \ -((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ - __func__, __FILE__, __LINE__)) - - -int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, - const char *function, char *file, unsigned int line); + ({ \ + bool _bool = (assertion); \ + if (unlikely(!_bool)) \ + gfs2_assert_withdraw_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__, false); \ + !_bool; \ + }) + +#define gfs2_assert_withdraw_delayed(sdp, assertion) \ + ({ \ + bool _bool = (assertion); \ + if (unlikely(!_bool)) \ + gfs2_assert_withdraw_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__, true); \ + !_bool; \ + }) + +void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); #define gfs2_assert_warn(sdp, assertion) \ -((likely(assertion)) ? 
0 : gfs2_assert_warn_i((sdp), #assertion, \ - __func__, __FILE__, __LINE__)) - + ({ \ + bool _bool = (assertion); \ + if (unlikely(!_bool)) \ + gfs2_assert_warn_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__); \ + !_bool; \ + }) -int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, - const char *function, char *file, unsigned int line); +void gfs2_consist_i(struct gfs2_sbd *sdp, + const char *function, char *file, unsigned int line); #define gfs2_consist(sdp) \ -gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) +gfs2_consist_i((sdp), __func__, __FILE__, __LINE__) -int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, - const char *function, char *file, unsigned int line); +void gfs2_consist_inode_i(struct gfs2_inode *ip, + const char *function, char *file, unsigned int line); #define gfs2_consist_inode(ip) \ -gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) +gfs2_consist_inode_i((ip), __func__, __FILE__, __LINE__) -int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, - const char *function, char *file, unsigned int line); +void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, + const char *function, char *file, unsigned int line); #define gfs2_consist_rgrpd(rgd) \ -gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) +gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__) int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -129,6 +147,9 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); +extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + bool verbose); + #define gfs2_io_error(sdp) \ gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); @@ -165,18 +186,29 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, } /** + * gfs2_withdraw_delayed - withdraw as soon as possible without deadlocks + * @sdp: the superblock + */ +static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp) +{ + set_bit(SDF_WITHDRAWING, &sdp->sd_flags); +} + +/** * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn * @sdp: the superblock */ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || + test_bit(SDF_WITHDRAWING, &sdp->sd_flags); } #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) __printf(2, 3) -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...); +void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...); +int gfs2_withdraw(struct gfs2_sbd *sdp); #endif /* __UTIL_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index bbe593d16bea..9d7667bc4292 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1222,7 +1222,7 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, struct gfs2_holder gh; int ret; - ret = gfs2_rsqa_alloc(ip); + ret = gfs2_qa_get(ip); if (ret) return ret; @@ -1231,15 +1231,19 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) - return ret; + goto out; } else { - if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) - return -EIO; + if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) { + ret = -EIO; + goto out; + } gfs2_holder_mark_uninitialized(&gh); } ret = __gfs2_xattr_set(inode, name, value, size, flags, 
handler->flags); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); +out: + gfs2_qa_put(ip); return ret; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e6b8c49076bb..c070c0d8e3e9 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino) static char *follow_link(char *link) { - int len, n; char *name, *resolved, *end; + int n; name = __getname(); if (!name) { @@ -164,15 +164,13 @@ static char *follow_link(char *link) return name; *(end + 1) = '\0'; - len = strlen(link) + strlen(name) + 1; - resolved = kmalloc(len, GFP_KERNEL); + resolved = kasprintf(GFP_KERNEL, "%s%s", link, name); if (resolved == NULL) { n = -ENOMEM; goto out_free; } - sprintf(resolved, "%s%s", link, name); __putname(name); kfree(link); return resolved; @@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; - /* NULL is printed as <NULL> by sprintf: avoid that. */ + /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) req_root = ""; err = -ENOMEM; sb->s_fs_info = host_root_path = - kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); if (host_root_path == NULL) goto out; - sprintf(host_root_path, "%s/%s", root_ino, req_root); - root_inode = new_inode(sb); if (!root_inode) goto out; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index aff8642f0c2e..991c60c7ffe0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,10 +393,9 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents + * page faults in the truncated range by checking i_size. i_size is + * modified while holding i_mmap_rwsem. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map @@ -436,7 +435,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, index = page->index; hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + if (!truncate_op) { + /* + * Only need to hold the fault mutex in the + * hole punch case. This prevents races with + * page faults. Races are not possible in the + * case of truncation. 
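The hugetlbfs hunk above serializes hole punching against page faults with a table of mutexes indexed by a hash of (mapping, page index) rather than one lock per page, and skips that lock entirely on the truncate path. A minimal user-space sketch of the bucketed-mutex idea using pthreads; the table size, hash function, and names are illustrative assumptions, not the kernel's implementation:

#include <pthread.h>
#include <stdint.h>

#define FAULT_MUTEX_BUCKETS 64

static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_BUCKETS];

static void fault_mutex_table_init(void)
{
	for (int i = 0; i < FAULT_MUTEX_BUCKETS; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);
}

/* Reduce (mapping, index) to one of the buckets; any decent mix works. */
static unsigned int fault_mutex_hash(const void *mapping, uint64_t index)
{
	uint64_t key = (uint64_t)(uintptr_t)mapping ^ (index * 0x9e3779b97f4a7c15ULL);

	return (unsigned int)(key % FAULT_MUTEX_BUCKETS);
}

/*
 * Hole-punch path: hold the bucket lock while removing the page so a
 * concurrent fault on the same index cannot re-instantiate it mid-removal.
 * The truncate path in the hunk above does not need this lock because
 * i_size and i_mmap_rwsem already exclude new faults past the new EOF.
 */
static void punch_hole_index(const void *mapping, uint64_t index)
{
	unsigned int hash = fault_mutex_hash(mapping, index);

	pthread_mutex_lock(&fault_mutex_table[hash]);
	/* ... drop the page backing this index ... */
	pthread_mutex_unlock(&fault_mutex_table[hash]);
}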
+ */ + mutex_lock(&hugetlb_fault_mutex_table[hash]); + } /* * If page is mapped, it was faulted in after being @@ -450,7 +457,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, if (unlikely(page_mapped(page))) { BUG_ON(truncate_op); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_lock_write(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), (index + 1) * pages_per_huge_page(h)); @@ -477,7 +486,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (!truncate_op) + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -515,8 +525,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_size_write(inode, offset); i_mmap_lock_write(mapping); + i_size_write(inode, offset); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); @@ -638,7 +648,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* mutex taken here, fault path and hole punch */ + /* + * fault mutex taken here, protects against fault path + * and hole punch. inode_lock previously taken protects + * against truncation. + */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/fs/internal.h b/fs/internal.h index f3f280b952a3..aa5d45524e87 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -38,7 +38,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) /* * buffer.c */ -extern void guard_bio_eod(struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); @@ -61,7 +60,6 @@ extern int finish_clean_context(struct fs_context *fc); */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); -extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); long do_mknodat(int dfd, const char __user *filename, umode_t mode, diff --git a/fs/io-wq.c b/fs/io-wq.c index 5cef075c0b37..4023c9846860 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> +#include <linux/task_work.h> #include "io-wq.h" @@ -69,6 +70,8 @@ struct io_worker { #define IO_WQ_HASH_ORDER 5 #endif +#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) + struct io_wqe_acct { unsigned nr_workers; unsigned max_workers; @@ -98,6 +101,7 @@ struct io_wqe { struct list_head all_list; struct io_wq *wq; + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; /* @@ -107,8 +111,7 @@ struct io_wq { struct io_wqe **wqes; unsigned long state; - get_work_fn *get_work; - put_work_fn *put_work; + free_work_fn *free_work; struct task_struct *manager; struct user_struct *user; @@ -376,26 +379,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) return __io_worker_unuse(wqe, worker); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) +static inline unsigned int io_get_work_hash(struct 
io_wq_work *work) +{ + return work->flags >> IO_WQ_HASH_SHIFT; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; - struct io_wq_work *work; + struct io_wq_work *work, *tail; + unsigned int hash; wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!(work->flags & IO_WQ_WORK_HASHED)) { - wq_node_del(&wqe->work_list, node, prev); + if (!io_wq_is_hashed(work)) { + wq_list_del(&wqe->work_list, node, prev); return work; } /* hashed, can run if not already running */ - *hash = work->flags >> IO_WQ_HASH_SHIFT; - if (!(wqe->hash_map & BIT_ULL(*hash))) { - wqe->hash_map |= BIT_ULL(*hash); - wq_node_del(&wqe->work_list, node, prev); + hash = io_get_work_hash(work); + if (!(wqe->hash_map & BIT(hash))) { + wqe->hash_map |= BIT(hash); + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = NULL; + wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } } @@ -440,16 +452,49 @@ static void io_wq_switch_creds(struct io_worker *worker, worker->saved_creds = old_creds; } +static void io_impersonate_work(struct io_worker *worker, + struct io_wq_work *work) +{ + if (work->files && current->files != work->files) { + task_lock(current); + current->files = work->files; + task_unlock(current); + } + if (work->fs && current->fs != work->fs) + current->fs = work->fs; + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); +} + +static void io_assign_current_work(struct io_worker *worker, + struct io_wq_work *work) +{ + if (work) { + /* flush pending signals before assigning new work */ + if (signal_pending(current)) + flush_signals(current); + cond_resched(); + } + + spin_lock_irq(&worker->lock); + worker->cur_work = work; + spin_unlock_irq(&worker->lock); +} + +static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { - struct io_wq_work *work, *old_work = NULL, *put_work = NULL; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; do { - unsigned hash = -1U; - + struct io_wq_work *work; + unsigned int hash; +get_next: /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -457,81 +502,60 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. 
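io_get_next_work() above no longer pops one node at a time: for hashed work it looks up the per-bucket hash_tail pointer and detaches the whole contiguous [work, tail] run with wq_list_cut(). A stripped-down user-space model of that cut operation on a singly linked list that tracks first and last pointers; the names are illustrative, not the kernel structures:

#include <stddef.h>

struct work_node {
	struct work_node *next;
};

struct work_list {
	struct work_node *first;
	struct work_node *last;
};

/*
 * Detach the span that ends at @last_in_span.  @prev is the node just
 * before the span, or NULL when the span starts at the head of the list.
 * Everything from the old head (or prev->next) up to @last_in_span now
 * belongs to the caller, mirroring how one worker claims an entire
 * same-hash run in the hunk above.
 */
static void work_list_cut(struct work_list *list,
			  struct work_node *last_in_span,
			  struct work_node *prev)
{
	if (!prev)
		list->first = last_in_span->next;
	else
		prev->next = last_in_span->next;

	if (last_in_span == list->last)
		list->last = prev;

	last_in_span->next = NULL;
}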
*/ - work = io_get_next_work(wqe, &hash); + work = io_get_next_work(wqe); if (work) __io_worker_busy(wqe, worker, work); else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; spin_unlock_irq(&wqe->lock); - if (put_work && wq->put_work) - wq->put_work(old_work); if (!work) break; -next: - /* flush any pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); - - cond_resched(); - - spin_lock_irq(&worker->lock); - worker->cur_work = work; - spin_unlock_irq(&worker->lock); - - if (work->flags & IO_WQ_WORK_CB) - work->func(&work); - - if (work->files && current->files != work->files) { - task_lock(current); - current->files = work->files; - task_unlock(current); - } - if (work->fs && current->fs != work->fs) - current->fs = work->fs; - if (work->mm != worker->mm) - io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) - io_wq_switch_creds(worker, work); - /* - * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, - * the worker function will do the right thing. - */ - if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) - work->flags |= IO_WQ_WORK_CANCEL; - if (worker->mm) - work->flags |= IO_WQ_WORK_HAS_MM; - - if (wq->get_work) { - put_work = work; - wq->get_work(work); - } - - old_work = work; - work->func(&work); - - spin_lock_irq(&worker->lock); - worker->cur_work = NULL; - spin_unlock_irq(&worker->lock); - - spin_lock_irq(&wqe->lock); - - if (hash != -1U) { - wqe->hash_map &= ~BIT_ULL(hash); - wqe->flags &= ~IO_WQE_FLAG_STALLED; - } - if (work && work != old_work) { - spin_unlock_irq(&wqe->lock); - - if (put_work && wq->put_work) { - wq->put_work(put_work); - put_work = NULL; + io_assign_current_work(worker, work); + + /* handle a whole dependent link */ + do { + struct io_wq_work *old_work, *next_hashed, *linked; + + next_hashed = wq_next_work(work); + io_impersonate_work(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable + * work, the worker function will do the right thing. + */ + if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) + work->flags |= IO_WQ_WORK_CANCEL; + + hash = io_get_work_hash(work); + linked = old_work = work; + linked->func(&linked); + linked = (old_work == linked) ? 
NULL : linked; + + work = next_hashed; + if (!work && linked && !io_wq_is_hashed(linked)) { + work = linked; + linked = NULL; + } + io_assign_current_work(worker, work); + wq->free_work(old_work); + + if (linked) + io_wqe_enqueue(wqe, linked); + + if (hash != -1U && !next_hashed) { + spin_lock_irq(&wqe->lock); + wqe->hash_map &= ~BIT_ULL(hash); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + /* dependent work is not hashed */ + hash = -1U; + /* skip unnecessary unlock-lock wqe->lock */ + if (!work) + goto get_next; + spin_unlock_irq(&wqe->lock); } + } while (work); - /* dependent work not hashed */ - hash = -1U; - goto next; - } + spin_lock_irq(&wqe->lock); } while (1); } @@ -693,6 +717,9 @@ static int io_wq_manager(void *data) complete(&wq->done); while (!kthread_should_stop()) { + if (current->task_works) + task_work_run(); + for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; @@ -715,6 +742,9 @@ static int io_wq_manager(void *data) schedule_timeout(HZ); } + if (current->task_works) + task_work_run(); + return 0; err: set_bit(IO_WQ_BIT_ERROR, &wq->state); @@ -747,17 +777,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, return true; } -static void io_run_cancel(struct io_wq_work *work) +static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) { + struct io_wq *wq = wqe->wq; + do { struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; work->func(&work); work = (work == old_work) ? NULL : work; + wq->free_work(old_work); } while (work); } +static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +{ + unsigned int hash; + struct io_wq_work *tail; + + if (!io_wq_is_hashed(work)) { +append: + wq_list_add_tail(&work->list, &wqe->work_list); + return; + } + + hash = io_get_work_hash(work); + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = work; + if (!tail) + goto append; + + wq_list_add_after(&work->list, &tail->list, &wqe->work_list); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -771,13 +824,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * It's close enough to not be an issue, fork() has the same delay. */ if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - io_run_cancel(work); + io_run_cancel(work, wqe); return; } work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); - wq_list_add_tail(&work->list, &wqe->work_list); + io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); @@ -794,19 +847,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } /* - * Enqueue work, hashed by some key. Work items that hash to the same value - * will not be done in parallel. Used to limit concurrent writes, generally - * hashed by inode. + * Work items that hash to the same value will not be done in parallel. + * Used to limit concurrent writes, generally hashed by inode. 
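io_wq_hash_work() above records the serialization key by folding a small bucket number, derived from a pointer such as the inode, into the upper bits of the work flags. A self-contained sketch of that packing; the shift, order, and flag values follow the same shape but should be read as assumptions, not the exact kernel constants:

#include <stdint.h>

#define WORK_F_HASHED		(1u << 2)
#define WORK_HASH_ORDER		5		/* 32 buckets */
#define WORK_HASH_SHIFT		24		/* bucket lives in the top bits */

struct work_item {
	unsigned int flags;
};

/* Multiplicative pointer hash reduced to WORK_HASH_ORDER bits. */
static unsigned int ptr_hash_bits(const void *ptr, unsigned int bits)
{
	uint64_t v = (uint64_t)(uintptr_t)ptr * 0x9e3779b97f4a7c15ULL;

	return (unsigned int)(v >> (64 - bits));
}

/* Items hashed to the same bucket will not run in parallel. */
static void work_hash_by(struct work_item *work, const void *key)
{
	unsigned int bucket = ptr_hash_bits(key, WORK_HASH_ORDER);

	work->flags |= WORK_F_HASHED | (bucket << WORK_HASH_SHIFT);
}

static unsigned int work_get_bucket(const struct work_item *work)
{
	return work->flags >> WORK_HASH_SHIFT;
}

Two writes hashed with the same inode pointer thus land in the same bucket, so the queue hands them out as one run and a single worker executes them in order.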
*/ -void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) +void io_wq_hash_work(struct io_wq_work *work, void *val) { - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - unsigned bit; - + unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); - io_wqe_enqueue(wqe, work); } static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) @@ -856,14 +905,13 @@ void io_wq_cancel_all(struct io_wq *wq) } struct io_cb_cancel_data { - struct io_wqe *wqe; - work_cancel_fn *cancel; - void *caller_data; + work_cancel_fn *fn; + void *data; }; -static bool io_work_cancel(struct io_worker *worker, void *cancel_data) +static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { - struct io_cb_cancel_data *data = cancel_data; + struct io_cb_cancel_data *match = data; unsigned long flags; bool ret = false; @@ -874,83 +922,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && - data->cancel(worker->cur_work, data->caller_data)) { - send_sig(SIGINT, worker->task, 1); - ret = true; - } - spin_unlock_irqrestore(&worker->lock, flags); - - return ret; -} - -static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, - work_cancel_fn *cancel, - void *cancel_data) -{ - struct io_cb_cancel_data data = { - .wqe = wqe, - .cancel = cancel, - .caller_data = cancel_data, - }; - struct io_wq_work_node *node, *prev; - struct io_wq_work *work; - unsigned long flags; - bool found = false; - - spin_lock_irqsave(&wqe->lock, flags); - wq_list_for_each(node, prev, &wqe->work_list) { - work = container_of(node, struct io_wq_work, list); - - if (cancel(work, cancel_data)) { - wq_node_del(&wqe->work_list, node, prev); - found = true; - break; - } - } - spin_unlock_irqrestore(&wqe->lock, flags); - - if (found) { - io_run_cancel(work); - return IO_WQ_CANCEL_OK; - } - - rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_work_cancel, &data); - rcu_read_unlock(); - return found ? 
IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; -} - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data) -{ - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int node; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - ret = io_wqe_cancel_cb_work(wqe, cancel, data); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; - } - - return ret; -} - -struct work_match { - bool (*fn)(struct io_wq_work *, void *data); - void *data; -}; - -static bool io_wq_worker_cancel(struct io_worker *worker, void *data) -{ - struct work_match *match = data; - unsigned long flags; - bool ret = false; - - spin_lock_irqsave(&worker->lock, flags); - if (match->fn(worker->cur_work, match->data) && - !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { + match->fn(worker->cur_work, match->data)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -960,7 +932,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct work_match *match) + struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; @@ -977,7 +949,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (match->fn(work, match->data)) { - wq_node_del(&wqe->work_list, node, prev); + wq_list_del(&wqe->work_list, node, prev); found = true; break; } @@ -985,7 +957,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - io_run_cancel(work); + io_run_cancel(work, wqe); return IO_WQ_CANCEL_OK; } @@ -1001,22 +973,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } -static bool io_wq_work_match(struct io_wq_work *work, void *data) -{ - return work == data; -} - -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data) { - struct work_match match = { - .fn = io_wq_work_match, - .data = cwork + struct io_cb_cancel_data match = { + .fn = cancel, + .data = data, }; enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; - cwork->flags |= IO_WQ_WORK_CANCEL; - for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; @@ -1028,33 +994,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) return ret; } +static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) +{ + return work == data; +} + +enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) +{ + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); +} + static bool io_wq_pid_match(struct io_wq_work *work, void *data) { pid_t pid = (pid_t) (unsigned long) data; - if (work) - return work->task_pid == pid; - return false; + return work->task_pid == pid; } enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) { - struct work_match match = { - .fn = io_wq_pid_match, - .data = (void *) (unsigned long) pid - }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int node; + void *data = (void *) (unsigned long) pid; - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; - } - - return ret; + return io_wq_cancel_cb(wq, io_wq_pid_match, data); } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) @@ -1062,6 +1023,9 @@ 
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret = -ENOMEM, node; struct io_wq *wq; + if (WARN_ON_ONCE(!data->free_work)) + return ERR_PTR(-EINVAL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); @@ -1072,8 +1036,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(-ENOMEM); } - wq->get_work = data->get_work; - wq->put_work = data->put_work; + wq->free_work = data->free_work; /* caller must already hold a reference to this */ wq->user = data->user; @@ -1130,7 +1093,7 @@ err: bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) { - if (data->get_work != wq->get_work || data->put_work != wq->put_work) + if (data->free_work != wq->free_work) return false; return refcount_inc_not_zero(&wq->use_refs); @@ -1168,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq) if (refcount_dec_and_test(&wq->use_refs)) __io_wq_destroy(wq); } + +struct task_struct *io_wq_get_task(struct io_wq *wq) +{ + return wq->manager; +} diff --git a/fs/io-wq.h b/fs/io-wq.h index e5e15f2c93ec..5ba12de7572f 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,10 +5,8 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_CB = 128, IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_CONCURRENT = 512, @@ -30,6 +28,18 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_node_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, struct io_wq_work_node *prev) { - if (node == list->first) - WRITE_ONCE(list->first, node->next); - if (node == list->last) + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) list->last = prev; - if (prev) - prev->next = node->next; - node->next = NULL; + last->next = NULL; +} + +static inline void wq_list_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + wq_list_cut(list, node, prev); } #define wq_list_for_each(pos, prv, head) \ @@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, } while (0) struct io_wq_work { - union { - struct io_wq_work_node list; - void *data; - }; + struct io_wq_work_node list; void (*func)(struct io_wq_work **); struct files_struct *files; struct mm_struct *mm; @@ -83,14 +99,20 @@ struct io_wq_work { *(work) = (struct io_wq_work){ .func = _func }; \ } while (0) \ -typedef void (get_work_fn)(struct io_wq_work *); -typedef void (put_work_fn)(struct io_wq_work *); +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + +typedef void (free_work_fn)(struct io_wq_work *); struct io_wq_data { struct user_struct *user; - get_work_fn *get_work; - put_work_fn *put_work; + free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct 
io_wq_data *data); @@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); -void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); +void io_wq_hash_work(struct io_wq_work *work, void *val); + +static inline bool io_wq_is_hashed(struct io_wq_work *work) +{ + return work->flags & IO_WQ_WORK_HASHED; +} void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); @@ -109,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data); +struct task_struct *io_wq_get_task(struct io_wq *wq); + #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 3affd96a98ba..5190bfb6a665 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/compat.h> +#include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> @@ -76,6 +77,8 @@ #include <linux/fadvise.h> #include <linux/eventpoll.h> #include <linux/fs_struct.h> +#include <linux/splice.h> +#include <linux/task_work.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -183,14 +186,30 @@ struct fixed_file_table { struct file **files; }; +struct fixed_file_ref_node { + struct percpu_ref refs; + struct list_head node; + struct list_head file_list; + struct fixed_file_data *file_data; + struct work_struct work; +}; + struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; + struct percpu_ref *cur_refs; struct percpu_ref refs; - struct llist_head put_llist; - struct work_struct ref_work; struct completion done; + struct list_head ref_list; + spinlock_t lock; +}; + +struct io_buffer { + struct list_head list; + __u64 addr; + __s32 len; + __u16 bid; }; struct io_ring_ctx { @@ -270,6 +289,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr io_buffer_idr; + struct idr personality_idr; struct { @@ -290,7 +311,6 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - struct llist_head poll_llist; /* * ->poll_list is protected by the ctx->uring_lock for @@ -306,6 +326,8 @@ struct io_ring_ctx { spinlock_t inflight_lock; struct list_head inflight_list; } ____cacheline_aligned_in_smp; + + struct work_struct exit_work; }; /* @@ -386,7 +408,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; + int bgid; size_t len; + struct io_buffer *kbuf; }; struct io_open { @@ -430,6 +454,24 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + +struct io_provide_buf { + struct file *file; + __u64 addr; + __s32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -464,6 +506,7 @@ enum { REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, @@ -479,6 +522,11 @@ enum { REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + + /* not a real bit, just 
to check we're not overflowing the space */ + __REQ_F_LAST_BIT, }; enum { @@ -492,6 +540,8 @@ enum { REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* already grabbed next link */ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), @@ -521,6 +571,15 @@ enum { REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* in overflow list */ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_wq_work work; }; /* @@ -546,32 +605,45 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; + struct io_provide_buf pbuf; }; struct io_async_ctx *io; - /* - * llist_node is only used for poll deferred completions - */ - struct llist_node llist_node; - bool in_async; + int cflags; bool needs_fixed_file; u8 opcode; struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; + struct list_head list; unsigned int flags; refcount_t refs; + struct task_struct *task; + unsigned long fsize; u64 user_data; u32 result; u32 sequence; + struct list_head link_list; + struct list_head inflight_entry; - struct io_wq_work work; + struct percpu_ref *fixed_file_refs; + + union { + /* + * Only commands that never go async can use the below fields, + * obviously. Right now only IORING_OP_POLL_ADD uses them, and + * async armed poll handlers for regular commands. The latter + * restore the work, if needed. 
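The union added to struct io_kiocb above overlays the poll-arming state (task_work, hash_node, apoll) with the io-wq work item, since a request is on at most one of those paths at a time. A small standalone illustration of that overlay pattern with an explicit flag as the discriminant; the field and type names here are invented for the example:

#include <stdbool.h>

struct poll_arm {
	void (*wake)(void *arg);
	void *arg;
};

struct wq_work {
	void (*func)(struct wq_work *work);
};

struct request {
	bool armed_for_poll;		/* which union member is currently live */
	union {
		struct poll_arm poll;	/* used only while waiting for readiness */
		struct wq_work work;	/* used only while queued to a worker */
	};
};

static void request_run(struct request *req)
{
	if (req->armed_for_poll)
		req->poll.wake(req->poll.arg);
	else
		req->work.func(&req->work);
}

The saving is the same as in the hunk: two states that never overlap in time share storage instead of growing the per-request footprint.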
+ */ + struct { + struct callback_head task_work; + struct hlist_node hash_node; + struct async_poll *apoll; + }; + struct io_wq_work work; + }; }; #define IO_PLUG_THRESHOLD 2 @@ -615,6 +687,11 @@ struct io_op_def { unsigned file_table : 1; /* needs ->fs */ unsigned needs_fs : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; }; static const struct io_op_def io_op_defs[] = { @@ -624,6 +701,8 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITEV] = { .async_ctx = 1, @@ -631,6 +710,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -638,11 +718,13 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -658,6 +740,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollout = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, @@ -665,6 +748,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -676,6 +761,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .file_table = 1, + .pollin = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -687,6 +773,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -715,11 +802,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -731,11 +821,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_OPENAT2] = { .needs_file = 1, @@ -747,6 +840,13 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_PROVIDE_BUFFERS] = {}, + [IORING_OP_REMOVE_BUFFERS] = {}, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -759,8 +859,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); -static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, + const struct io_uring_sqe *sqe); static struct kmem_cache *req_cachep; @@ -827,11 +930,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct 
io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->io_buffer_idr); idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -952,15 +1055,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline bool io_prep_async_work(struct io_kiocb *req, +static inline void io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { const struct io_op_def *def = &io_op_defs[req->opcode]; - bool do_hashed = false; if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file) - do_hashed = true; + io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; @@ -969,25 +1071,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req, io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); - return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool do_hashed; - do_hashed = io_prep_async_work(req, &link); + io_prep_async_work(req, &link); - trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, - req->flags); - if (!do_hashed) { - io_wq_enqueue(ctx->io_wq, &req->work); - } else { - io_wq_enqueue_hashed(ctx->io_wq, &req->work, - file_inode(req->file)); - } + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); + io_wq_enqueue(ctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1054,24 +1149,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return false; if (!ctx->eventfd_async) return true; - return io_wq_current_is_worker() || in_interrupt(); + return io_wq_current_is_worker(); } -static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (trigger_ev) + if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1108,7 +1198,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, req->cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1132,7 +1222,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return cqe != NULL; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1148,7 +1238,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, cflags); } else if 
(ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1160,23 +1250,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + __io_cqring_fill_event(req, res, 0); +} + +static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res); + __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static void io_cqring_add_event(struct io_kiocb *req, long res) +{ + __io_cqring_add_event(req, res, 0); +} + static inline bool io_is_fallback_req(struct io_kiocb *req) { return req == (struct io_kiocb *) @@ -1194,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) return NULL; } -static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, - struct io_submit_state *state) +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, + struct io_submit_state *state) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; @@ -1228,46 +1329,30 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req = state->reqs[state->free_reqs]; } -got_it: - req->io = NULL; - req->file = NULL; - req->ctx = ctx; - req->flags = 0; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->result = 0; - INIT_IO_WORK(&req->work, io_wq_submit_work); return req; fallback: - req = io_get_fallback_req(ctx); - if (req) - goto got_it; - percpu_ref_put(&ctx->refs); - return NULL; + return io_get_fallback_req(ctx); } -static void __io_req_do_free(struct io_kiocb *req) +static inline void io_put_file(struct io_kiocb *req, struct file *file, + bool fixed) { - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); + if (fixed) + percpu_ref_put(req->fixed_file_refs); else - clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); + fput(file); } static void __io_req_aux_free(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); kfree(req->io); - if (req->file) { - if (req->flags & REQ_F_FIXED_FILE) - percpu_ref_put(&ctx->file_data->refs); - else - fput(req->file); - } + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); + if (req->task) + put_task_struct(req->task); io_req_work_drop_env(req); } @@ -1288,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req) } percpu_ref_put(&req->ctx->refs); - __io_req_do_free(req); + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); } struct req_batch { @@ -1299,21 +1387,18 @@ struct req_batch { static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { - int fixed_refs = rb->to_free; - if (!rb->to_free) return; if (rb->need_iter) { int i, inflight = 0; unsigned long flags; - fixed_refs = 0; for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; if (req->flags & REQ_F_FIXED_FILE) { req->file = NULL; - fixed_refs++; + percpu_ref_put(req->fixed_file_refs); 
} if (req->flags & REQ_F_INFLIGHT) inflight++; @@ -1339,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) } do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - if (fixed_refs) - percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); percpu_ref_put_many(&ctx->refs, rb->to_free); rb->to_free = rb->need_iter = 0; } @@ -1474,6 +1557,30 @@ static void io_free_req(struct io_kiocb *req) io_queue_async_work(nxt); } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *link; + + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + io_queue_linked_timeout(link); + io_wq_submit_work(workptr); +} + +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + const struct io_op_def *def = &io_op_defs[nxt->opcode]; + + if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + *workptr = &nxt->work; + link = io_prep_linked_timeout(nxt); + if (link) + nxt->work.func = io_link_work_cb; +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1493,6 +1600,26 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_steal_work(struct io_kiocb *req, + struct io_wq_work **workptr) +{ + /* + * It's in an io-wq worker, so there always should be at least + * one reference, which will be dropped in io_put_work() just + * after the current handler returns. + * + * It also means, that if the counter dropped to 1, then there is + * no asynchronous users left, so it's safe to steal the next work. + */ + if (refcount_read(&req->refs) == 1) { + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); + } +} + /* * Must only be used if we don't need to care about links, usually from * within the completion handling itself. 
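io_steal_work() above relies on a simple ownership argument: inside the worker there is always at least one counted reference, so observing a reference count of exactly 1 means no other context can still complete or retry the request, and its next linked work may be handed straight to the worker. A user-space sketch of that "sole owner" check with C11 atomics; the types and helper are illustrative:

#include <stdatomic.h>
#include <stddef.h>

struct request {
	atomic_int refs;	/* holders that may still touch the request */
	struct request *next;	/* next work in a dependent chain, if any */
};

/*
 * Safe only while the caller itself holds one of the counted references:
 * if the count is 1, that reference is ours, nobody else can create a new
 * one from under us, and the chain may be taken without racing.
 */
static struct request *steal_next_if_sole_owner(struct request *req)
{
	if (atomic_load(&req->refs) != 1)
		return NULL;
	return req->next;
}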
@@ -1554,6 +1681,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) return true; } +static int io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + int cflags; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->rw.addr = 0; + kfree(kbuf); + return cflags; +} + /* * Find and free completed poll iocbs */ @@ -1565,10 +1705,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { + int cflags = 0; + req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + + __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; if (refcount_dec_and_test(&req->refs) && @@ -1577,14 +1722,29 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } io_commit_cqring(ctx); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); + LIST_HEAD(again); bool spin; int ret; @@ -1599,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, struct kiocb *kiocb = &req->rw.kiocb; /* - * Move completed entries to our local list. If we find a - * request that requires polling, break out and complete - * the done list first, if we have entries there. + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. 
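io_put_kbuf() above tells user space which provided buffer satisfied the request by packing the buffer ID into the upper bits of the CQE flags next to the IORING_CQE_F_BUFFER marker. A tiny sketch of that encoding and of how a consumer would decode it; the constants follow the uapi naming but treat the exact values here as assumptions:

#include <stdint.h>

#define CQE_F_BUFFER		(1u << 0)	/* a provided buffer was consumed */
#define CQE_BUFFER_SHIFT	16		/* buffer ID lives above this bit */

/* Producer side: fold buffer ID @bid into the completion flags. */
static inline uint32_t cqe_pack_buffer(uint16_t bid)
{
	return CQE_F_BUFFER | ((uint32_t)bid << CQE_BUFFER_SHIFT);
}

/* Consumer side: recover the buffer ID, or -1 if no buffer was used. */
static inline int cqe_unpack_buffer(uint32_t cqe_flags)
{
	if (!(cqe_flags & CQE_F_BUFFER))
		return -1;
	return (int)(cqe_flags >> CQE_BUFFER_SHIFT);
}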
*/ if (req->flags & REQ_F_IOPOLL_COMPLETED) { list_move_tail(&req->list, &done); @@ -1610,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; + if (req->result == -EAGAIN) { + list_move_tail(&req->list, &again); + continue; + } + if (!list_empty(&again)) + break; + ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1622,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); + if (!list_empty(&again)) + io_iopoll_queue(&again); + return ret; } @@ -1743,13 +1913,16 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + int cflags = 0; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) req_set_fail_links(req); - io_cqring_add_event(req, res); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + __io_cqring_add_event(req, res, cflags); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -1760,17 +1933,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_put_req(req); } -static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - struct io_kiocb *nxt = NULL; - - io_complete_rw_common(kiocb, res); - io_put_req_find_next(req, &nxt); - - return nxt; -} - static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -1841,7 +2003,7 @@ static void io_file_put(struct io_submit_state *state) * assuming most submissions are for one file, or at least that each file * has more than one submission. */ -static struct file *io_file_get(struct io_submit_state *state, int fd) +static struct file *__io_file_get(struct io_submit_state *state, int fd) { if (!state) return fget(fd); @@ -1938,7 +2100,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index */ + /* we own ->private, reuse it for the buffer index / buffer ID */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(sqe->buf_index); return 0; @@ -1965,15 +2127,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, - bool in_async) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) - *nxt = __io_complete_rw(kiocb, ret); + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + io_complete_rw(kiocb, ret, 0); else io_rw_done(kiocb, ret); } @@ -2052,11 +2213,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, return len; } +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. 
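io_ring_submit_lock()/io_ring_submit_unlock() above take the ring mutex only when the caller did not already enter holding it, for example when the request is issued from an async worker rather than the submission path. A minimal sketch of that conditional-lock helper shape with pthreads; illustrative only, and the kernel additionally pairs it with lockdep_assert_held() to catch mistakes:

#include <pthread.h>
#include <stdbool.h>

static void ring_submit_lock(pthread_mutex_t *uring_lock, bool needs_lock)
{
	/* Submission-path callers already hold the lock; workers do not. */
	if (needs_lock)
		pthread_mutex_lock(uring_lock);
}

static void ring_submit_unlock(pthread_mutex_t *uring_lock, bool needs_lock)
{
	if (needs_lock)
		pthread_mutex_unlock(uring_lock);
}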
+ * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, struct io_buffer *kbuf, + bool needs_lock) +{ + struct io_buffer *head; + + if (req->flags & REQ_F_BUFFER_SELECTED) + return kbuf; + + io_ring_submit_lock(req->ctx, needs_lock); + + lockdep_assert_held(&req->ctx->uring_lock); + + head = idr_find(&req->ctx->io_buffer_idr, bgid); + if (head) { + if (!list_empty(&head->list)) { + kbuf = list_last_entry(&head->list, struct io_buffer, + list); + list_del(&kbuf->list); + } else { + kbuf = head; + idr_remove(&req->ctx->io_buffer_idr, bgid); + } + if (*len > kbuf->len) + *len = kbuf->len; + } else { + kbuf = ERR_PTR(-ENOBUFS); + } + + io_ring_submit_unlock(req->ctx, needs_lock); + + return kbuf; +} + +static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, + bool needs_lock) +{ + struct io_buffer *kbuf; + int bgid; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + bgid = (int) (unsigned long) req->rw.kiocb.private; + kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + req->rw.addr = (u64) (unsigned long) kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + return u64_to_user_ptr(kbuf->addr); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + ssize_t len; + + uiov = u64_to_user_ptr(req->rw.addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) + return 0; + if (!req->rw.len) + return 0; + else if (req->rw.len > 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, needs_lock); +#endif + + return __io_iov_buffer_select(req, iov, needs_lock); +} + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter) + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; + ssize_t ret; u8 opcode; opcode = req->opcode; @@ -2065,12 +2362,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req, rw, iter); } - /* buffer index only valid with fixed read/write */ - if (req->rw.kiocb.private) + /* buffer index only valid with fixed read/write, or buffer select */ + if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) 
return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - ssize_t ret; + if (req->flags & REQ_F_BUFFER_SELECT) { + buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + if (IS_ERR(buf)) { + *iovec = NULL; + return PTR_ERR(buf); + } + req->rw.len = sqe_len; + } + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; return ret < 0 ? ret : sqe_len; @@ -2086,6 +2391,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iorw->size; } + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, *iovec, needs_lock); + if (!ret) { + ret = (*iovec)->iov_len; + iov_iter_init(iter, rw, *iovec, 1, ret); + } + *iovec = NULL; + return ret; + } + #ifdef CONFIG_COMPAT if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, @@ -2162,19 +2477,26 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, req->io->rw.iov = iovec; if (!req->io->rw.iov) { req->io->rw.iov = req->io->rw.fast_iov; - memcpy(req->io->rw.iov, fast_iov, - sizeof(struct iovec) * iter->nr_segs); + if (req->io->rw.iov != fast_iov) + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; } } +static inline int __io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + return req->io == NULL; +} + static int io_alloc_async_ctx(struct io_kiocb *req) { if (!io_op_defs[req->opcode].async_ctx) return 0; - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + + return __io_alloc_async_ctx(req); } static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, @@ -2184,7 +2506,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) + if (__io_alloc_async_ctx(req)) return -ENOMEM; io_req_map_rw(req, io_size, iovec, fast_iov, iter); @@ -2213,7 +2535,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2222,8 +2544,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2231,13 +2552,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - ret = io_import_iovec(READ, req, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; req->result = 0; io_size = ret; @@ -2248,10 +2569,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } iov_count = iov_iter_count(&iter); ret = 
rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); @@ -2265,13 +2584,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + if (!(req->flags & REQ_F_NOWAIT)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2295,6 +2617,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; + req->fsize = rlimit(RLIMIT_FSIZE); + /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2302,7 +2626,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2311,8 +2635,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2320,7 +2643,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - ret = io_import_iovec(WRITE, req, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; @@ -2337,10 +2660,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } /* file path doesn't support NOWAIT for non-direct_IO */ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && @@ -2367,24 +2688,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, } kiocb->ki_flags |= IOCB_WRITE; + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* - * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. 
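For reference, the buffer-select import earlier in this hunk (io_rw_buffer_select() feeding io_import_iovec()) is driven from userspace by first handing the kernel a group of buffers with IORING_OP_PROVIDE_BUFFERS and then setting IOSQE_BUFFER_SELECT plus a group id on the read SQE. A minimal sketch follows; the io_uring_prep_provide_buffers()/io_uring_prep_read() helper names are liburing assumptions, not part of this kernel patch, and error handling is omitted:

#include <liburing.h>
#include <stdlib.h>

#define GROUP_ID 1
#define BUF_SIZE 4096
#define NR_BUFS  8

/* sketch: provide one group of buffers, then do a buffer-select read */
static int read_with_provided_buffers(struct io_uring *ring, int fd)
{
	void *pool = malloc(NR_BUFS * BUF_SIZE);
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int bid = -1;

	/* hand the kernel NR_BUFS buffers, ids 0..NR_BUFS-1, in group GROUP_ID */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_provide_buffers(sqe, pool, BUF_SIZE, NR_BUFS, GROUP_ID, 0);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	io_uring_cqe_seen(ring, cqe);

	/* no buffer address given: the kernel picks one from GROUP_ID */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, NULL, BUF_SIZE, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = GROUP_ID;
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);

	/* the chosen buffer id is reported in the CQE flags */
	if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER))
		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	io_uring_cqe_seen(ring, cqe);
	return bid;
}

A selected buffer is removed from the group (io_buffer_select() does the list_del/idr_remove above), so the application has to re-provide it before it can be handed out again.
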
*/ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2394,6 +2724,76 @@ out_free: return ret; } +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + return 0; +} + +static bool io_splice_punt(struct file *file) +{ + if (get_pipe_info(file)) + return false; + if (!io_file_supports_async(file)) + return true; + return !(file->f_mode & O_NONBLOCK); +} + +static int io_splice(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret; + + if (force_nonblock) { + if (io_splice_punt(in) || io_splice_punt(out)) + return -EAGAIN; + flags |= SPLICE_F_NONBLOCK; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req(req); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ @@ -2442,85 +2842,63 @@ static bool io_req_cancelled(struct io_kiocb *req) return false; } -static void io_link_work_cb(struct io_wq_work **workptr) +static void __io_fsync(struct io_kiocb *req) { - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } -} - -static void io_fsync_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); loff_t end = req->sync.off + req->sync.len; - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? 
end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + io_put_req(req); } -static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static void io_fsync_finish(struct io_wq_work **workptr) { - struct io_wq_work *work, *old_work; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + if (io_req_cancelled(req)) + return; + __io_fsync(req); + io_steal_work(req, workptr); +} + +static int io_fsync(struct io_kiocb *req, bool force_nonblock) +{ /* fsync always requires a blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_fsync_finish; return -EAGAIN; } - - work = old_work = &req->work; - io_fsync_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_fsync(req); return 0; } -static void io_fallocate_finish(struct io_wq_work **workptr) +static void __io_fallocate(struct io_kiocb *req) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + io_put_req(req); +} + +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + + if (io_req_cancelled(req)) + return; + __io_fallocate(req); + io_steal_work(req, workptr); } static int io_fallocate_prep(struct io_kiocb *req, @@ -2532,26 +2910,19 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); + req->fsize = rlimit(RLIMIT_FSIZE); return 0; } -static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fallocate(struct io_kiocb *req, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* fallocate always requiring blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_fallocate_finish; return -EAGAIN; } - work = old_work = &req->work; - io_fallocate_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - + __io_fallocate(req); return 0; } @@ -2562,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2571,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.how.mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.how.flags = READ_ONCE(sqe->open_flags); + if (force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { @@ -2593,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & 
REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2626,8 +2999,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2(struct io_kiocb *req, bool force_nonblock) { struct open_flags op; struct file *file; @@ -2658,15 +3030,171 @@ err: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat(struct io_kiocb *req, bool force_nonblock) { req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); - return io_openat2(req, nxt, force_nonblock); + return io_openat2(req, force_nonblock); +} + +static int io_remove_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, + int bgid, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + /* the head kbuf is the list itself */ + while (!list_empty(&buf->list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&buf->list, struct io_buffer, list); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) + return i; + } + i++; + kfree(buf); + idr_remove(&ctx->io_buffer_idr, bgid); + + return i; +} + +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + ret = -ENOENT; + head = idr_find(&ctx->io_buffer_idr, p->bgid); + if (head) + ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + + io_ring_submit_lock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + +static int io_provide_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (!access_ok(u64_to_user_ptr(p->addr), p->len)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + break; + + buf->addr = addr; + buf->len = pbuf->len; + buf->bid = bid; + addr += pbuf->len; + bid++; + if (!*head) { + INIT_LIST_HEAD(&buf->list); + *head = buf; + } else { + list_add_tail(&buf->list, &(*head)->list); + } + } + + return i ? 
i : -ENOMEM; +} + +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head, *list; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + + ret = io_add_buffers(p, &head); + if (ret < 0) + goto out; + + if (!list) { + ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, + GFP_KERNEL); + if (ret < 0) { + __io_remove_buffers(ctx, head, p->bgid, -1U); + goto out; + } + } +out: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -2694,8 +3222,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -2708,7 +3235,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; #else return -EOPNOTSUPP; @@ -2730,8 +3257,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #endif } -static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_madvise(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = &req->madvise; @@ -2744,7 +3270,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; #else return -EOPNOTSUPP; @@ -2762,8 +3288,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fadvise(struct io_kiocb *req, bool force_nonblock) { struct io_fadvise *fa = &req->fadvise; int ret; @@ -2783,7 +3308,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } @@ -2795,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2820,8 +3345,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_statx(struct io_kiocb *req, bool force_nonblock) { struct io_open *ctx = &req->open; unsigned lookup_flags; @@ -2858,7 +3382,7 @@ err: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } @@ -2873,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; - if (sqe->flags & 
IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); @@ -2885,7 +3409,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } /* only called when __close_fd_get_file() is done */ -static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) +static void __io_close_finish(struct io_kiocb *req) { int ret; @@ -2894,22 +3418,19 @@ static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) req_set_fail_links(req); io_cqring_add_event(req, ret); fput(req->close.put_file); - io_put_req_find_next(req, nxt); + io_put_req(req); } static void io_close_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; /* not cancellable, don't do io_req_cancelled() */ - __io_close_finish(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + __io_close_finish(req); + io_steal_work(req, workptr); } -static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock) { int ret; @@ -2919,23 +3440,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, return ret; /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) - goto eagain; + if (req->close.put_file->f_op->flush && force_nonblock) { + /* submission ref will be dropped, take it for async */ + refcount_inc(&req->refs); + + req->work.func = io_close_finish; + /* + * Do manual async queue here to avoid grabbing files - we don't + * need the files, and it'll cause io_close_finish() to close + * the file again and cause a double CQE entry for this request + */ + io_queue_async_work(req); + return 0; + } /* * No ->flush(), safely close from here and just punt the * fput() to async context. 
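The IORING_OP_SPLICE opcode added earlier in this diff takes the input side from sqe->splice_fd_in/splice_off_in, the output side from sqe->fd/off, the byte count from sqe->len and the SPLICE_F_* bits from sqe->splice_flags, with SPLICE_F_FD_IN_FIXED selecting a registered file for the input (mirroring the io_file_get() call in io_splice_prep()). A hedged userspace sketch, assuming a liburing that exposes io_uring_prep_splice() with this argument order; an offset of -1 means "use the file/pipe position", matching the off_in/off_out == -1 handling in io_splice():

#include <liburing.h>

/* sketch: splice 'len' bytes from a pipe read end into a regular file */
static int splice_pipe_to_file(struct io_uring *ring, int pipe_rd, int out_fd,
			       unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	/* off_in = -1: consume from the pipe; off_out = 0: write at file start */
	io_uring_prep_splice(sqe, pipe_rd, -1, out_fd, 0, len, 0);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;		/* bytes spliced, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
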
*/ - __io_close_finish(req, nxt); - return 0; -eagain: - req->work.func = io_close_finish; - /* - * Do manual async queue here to avoid grabbing files - we don't - * need the files, and it'll cause io_close_finish() to close - * the file again and cause a double CQE entry for this request - */ - io_queue_async_work(req); + __io_close_finish(req); return 0; } @@ -2957,47 +3480,59 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_sync_file_range_finish(struct io_wq_work **workptr) +static void __io_sync_file_range(struct io_kiocb *req) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + io_put_req(req); } -static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) + +static void io_sync_file_range_finish(struct io_wq_work **workptr) { - struct io_wq_work *work, *old_work; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + + if (io_req_cancelled(req)) + return; + __io_sync_file_range(req); + io_put_req(req); /* put submission ref */ +} +static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) +{ /* sync_file_range always requires a blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_sync_file_range_finish; return -EAGAIN; } - work = old_work = &req->work; - io_sync_file_range_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_sync_file_range(req); return 0; } +#if defined(CONFIG_NET) +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { + if (kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + return -ENOMEM; + } + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + return -EAGAIN; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; @@ -3023,15 +3558,10 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -3071,18 +3601,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, flags |= MSG_DONTWAIT; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } @@ -3093,17 +3613,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, io_cqring_add_event(req, ret); if (ret < 0) 
req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct socket *sock; int ret; @@ -3144,17 +3659,120 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; +} + +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, + &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + return -EINVAL; + if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = io->msg.iov[0].iov_len; + iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len); + io->msg.iov = NULL; + } else { + ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &io->msg.iov, &io->msg.msg.msg_iter); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_ctx *io) +{ + struct compat_msghdr __user *msg_compat; + struct io_sr_msg *sr = &req->sr_msg; + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + msg_compat = (struct compat_msghdr __user *) sr->msg; + ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = io->msg.iov[0].iov_len; + io->msg.iov = NULL; + } else { + ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, + &io->msg.iov, + &io->msg.msg.msg_iter); + if (ret < 0) + return ret; + } + + return 0; +} #endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +{ + io->msg.iov = io->msg.fast_iov; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, io); +#endif + + return __io_recvmsg_copy_hdr(req, io); +} + +static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, + int *cflags, bool needs_lock) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct io_buffer *kbuf; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return NULL; + + kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + + sr->kbuf = kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + + *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + *cflags |= IORING_CQE_F_BUFFER; + return kbuf; } static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; @@ -3162,6 +3780,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->bgid = READ_ONCE(sqe->buf_group); #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -3174,30 +3793,24 @@ 
static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.iov = io->msg.fast_iov; - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.uaddr, &io->msg.iov); + ret = io_recvmsg_copy_hdr(req, io); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_msghdr *kmsg = NULL; struct socket *sock; - int ret; + int ret, cflags = 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; sock = sock_from_file(req->file, &ret); if (sock) { + struct io_buffer *kbuf; struct io_async_ctx io; unsigned flags; @@ -3209,19 +3822,23 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - struct io_sr_msg *sr = &req->sr_msg; - kmsg = &io.msg; kmsg->msg.msg_name = &io.msg.addr; - io.msg.iov = io.msg.fast_iov; - ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.uaddr, - &io.msg.iov); + ret = io_recvmsg_copy_hdr(req, &io); if (ret) return ret; } + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) { + return PTR_ERR(kbuf); + } else if (kbuf) { + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } + flags = req->sr_msg.msg_flags; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -3230,18 +3847,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->flags |= REQ_F_NEED_CLEANUP; - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } @@ -3249,22 +3856,18 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); + __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) + struct io_buffer *kbuf = NULL; struct socket *sock; - int ret; + int ret, cflags = 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3272,15 +3875,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_sr_msg *sr = &req->sr_msg; + void __user *buf = sr->buf; struct msghdr msg; struct iovec iov; unsigned flags; - ret = import_single_range(READ, sr->buf, sr->len, &iov, + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); + + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); - if (ret) + if (ret) { + kfree(kbuf); return ret; 
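io_recv()/io_recvmsg() above honour the same provided-buffer machinery: with IOSQE_BUFFER_SELECT set and sqe->buf_group naming a group, the kernel picks a buffer via io_recv_buffer_select() and reports its id back through IORING_CQE_F_BUFFER and IORING_CQE_BUFFER_SHIFT in cqe->flags. A minimal sketch (liburing helper names assumed; buffers already provided as in the read example earlier in this section):

#include <liburing.h>

/* sketch: receive into a kernel-selected buffer from the given group */
static int recv_selected(struct io_uring *ring, int sockfd, unsigned short group)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int bid = -1;

	io_uring_prep_recv(sqe, sockfd, NULL, 4096, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = group;

	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER))
		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	io_uring_cqe_seen(ring, cqe);
	return bid;	/* buffer id holding the received data, or -1 */
}
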
+ } + req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -3301,20 +3914,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } - io_cqring_add_event(req, ret); + kfree(kbuf); + req->flags &= ~REQ_F_NEED_CLEANUP; + __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } - static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_accept *accept = &req->accept; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -3327,14 +3937,9 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); return 0; -#else - return -EOPNOTSUPP; -#endif } -#if defined(CONFIG_NET) -static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int __io_accept(struct io_kiocb *req, bool force_nonblock) { struct io_accept *accept = &req->accept; unsigned file_flags; @@ -3351,44 +3956,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } static void io_accept_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; if (io_req_cancelled(req)) return; - __io_accept(req, &nxt, false); - if (nxt) - io_wq_assign_next(workptr, nxt); + __io_accept(req, false); + io_steal_work(req, workptr); } -#endif -static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) int ret; - ret = __io_accept(req, nxt, force_nonblock); + ret = __io_accept(req, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - io_put_req(req); return -EAGAIN; } return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_connect *conn = &req->connect; struct io_async_ctx *io = req->io; @@ -3405,15 +4000,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return move_addr_to_kernel(conn->addr, conn->addr_len, &io->connect.address); -#else - return -EOPNOTSUPP; -#endif } -static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_ctx __io, *io; unsigned file_flags; int ret; @@ -3449,25 +4039,303 @@ out: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else +} +#else /* !CONFIG_NET */ +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ return -EOPNOTSUPP; -#endif } -static void io_poll_remove_one(struct io_kiocb *req) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) { - struct io_poll_iocb *poll = &req->poll; + return -EOPNOTSUPP; +} + +static int io_send(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return 
-EOPNOTSUPP; +} + +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_recv(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_accept(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_connect(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head) +{ + if (unlikely(poll->head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + poll->head = head; + add_wait_queue(head, &poll->wait); +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->apoll->poll, pt, head); +} + +static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, + __poll_t mask, task_work_func_t func) +{ + struct task_struct *tsk; + int ret; + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + + list_del_init(&poll->wait.entry); + + tsk = req->task; + req->result = mask; + init_task_work(&req->task_work, func); + /* + * If this fails, then the task is exiting. Punt to one of the io-wq + * threads to ensure the work gets run, we can't always rely on exit + * cancelation taking care of this. 
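The __io_async_wake()/io_arm_poll_handler() machinery introduced here hangs a wait queue entry off the file's poll head through a poll_table callback and completes or retries the request from task_work once the wake function fires. A stripped-down sketch of that arming pattern, using only core helpers that also appear above (vfs_poll, init_waitqueue_func_entry, add_wait_queue, key_to_poll); the demo_* names are hypothetical and all locking and request bookkeeping is omitted:

#include <linux/poll.h>
#include <linux/wait.h>

struct demo_poll {
	struct wait_queue_entry	wait;
	poll_table		pt;
	wait_queue_head_t	*head;
};

/* wake callback: runs when the file signals an event we armed for */
static int demo_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		     void *key)
{
	__poll_t mask = key_to_poll(key);

	/* ignore events we did not ask for; real code would kick task_work here */
	if (mask && !(mask & (EPOLLIN | EPOLLRDNORM)))
		return 0;
	list_del_init(&wait->entry);
	return 1;
}

/* poll_table callback: remember the head and queue our wait entry on it */
static void demo_queue_proc(struct file *file, wait_queue_head_t *head,
			    struct poll_table_struct *p)
{
	struct demo_poll *dp = container_of(p, struct demo_poll, pt);

	dp->head = head;
	add_wait_queue(head, &dp->wait);
}

static __poll_t demo_arm(struct demo_poll *dp, struct file *file)
{
	init_waitqueue_func_entry(&dp->wait, demo_wake);
	init_poll_funcptr(&dp->pt, demo_queue_proc);
	/* returns events already pending; 0 means "wait for demo_wake()" */
	return vfs_poll(file, &dp->pt) & (EPOLLIN | EPOLLRDNORM);
}
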
+ */ + ret = task_work_add(tsk, &req->task_work, true); + if (unlikely(ret)) { + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, true); + } + wake_up_process(tsk); + return 1; +} + +static void io_async_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct async_poll *apoll = req->apoll; + struct io_ring_ctx *ctx = req->ctx; + + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + + WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry)); + + if (hash_hashed(&req->hash_node)) { + spin_lock_irq(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock_irq(&ctx->completion_lock); + } + + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + + kfree(apoll); +} + +static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->apoll->poll; + + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, + key_to_poll(key)); + + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static __poll_t __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask, + wait_queue_func_t wake_func) + __acquires(&ctx->completion_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool cancel = false; + + poll->file = req->file; + poll->head = NULL; + poll->done = poll->canceled = false; + poll->events = mask; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = -EINVAL; + + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); + poll->wait.private = req; + + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt->error) + cancel = true; + ipt->error = 0; + mask = 0; + } + if (mask || ipt->error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + io_poll_req_insert(req); + spin_unlock(&poll->head->lock); + } + + return mask; +} + +static bool io_arm_poll_handler(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask, ret; + + if (!req->file || !file_can_poll(req->file)) + return false; + if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + return false; + if (!def->pollin && !def->pollout) + return false; + + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return false; + + req->flags |= REQ_F_POLLED; + memcpy(&apoll->work, &req->work, sizeof(req->work)); + + get_task_struct(current); + req->task = current; + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + + mask = 0; + if (def->pollin) + mask |= POLLIN | POLLRDNORM; + if (def->pollout) + mask |= POLLOUT | POLLWRNORM; + mask |= POLLERR | POLLPRI; + + ipt.pt._qproc = 
io_async_queue_proc; + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, + io_async_wake); + if (ret) { + ipt.error = 0; + apoll->poll.done = true; + spin_unlock_irq(&ctx->completion_lock); + memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); + return false; + } + spin_unlock_irq(&ctx->completion_lock); + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, + apoll->poll.events); + return true; +} + +static bool __io_poll_remove_one(struct io_kiocb *req, + struct io_poll_iocb *poll) +{ + bool do_complete = false; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - io_queue_async_work(req); + do_complete = true; } spin_unlock(&poll->head->lock); + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + if (req->opcode == IORING_OP_POLL_ADD) { + do_complete = __io_poll_remove_one(req, &req->poll); + } else { + /* non-poll requests have submit ref still */ + do_complete = __io_poll_remove_one(req, &req->apoll->poll); + if (do_complete) + io_put_req(req); + } + hash_del(&req->hash_node); + + if (do_complete) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(req->ctx); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } + + return do_complete; } static void io_poll_remove_all(struct io_ring_ctx *ctx) @@ -3485,6 +4353,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -3494,10 +4364,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr == req->user_data) { - io_poll_remove_one(req); + if (sqe_addr != req->user_data) + continue; + if (io_poll_remove_one(req)) return 0; - } + return -EALREADY; } return -ENOENT; @@ -3543,186 +4414,66 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - if (error) - io_cqring_fill_event(req, error); - else - io_cqring_fill_event(req, mangle_poll(mask)); + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); } -static void io_poll_complete_work(struct io_wq_work **workptr) +static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_wq_work *work = *workptr; - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_poll_iocb *poll = &req->poll; - struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; - __poll_t mask = 0; - int ret = 0; + struct io_poll_iocb *poll = &req->poll; - if (work->flags & IO_WQ_WORK_CANCEL) { - WRITE_ONCE(poll->canceled, true); - ret = -ECANCELED; - } else if (READ_ONCE(poll->canceled)) { - ret = -ECANCELED; - } + if (!req->result && !READ_ONCE(poll->canceled)) { + struct poll_table_struct pt = { ._key = poll->events }; - if (ret != -ECANCELED) - mask = vfs_poll(poll->file, &pt) & poll->events; + req->result = vfs_poll(req->file, &pt) & poll->events; + } - /* - * Note that ->ki_cancel callers also delete iocb from active_reqs after - * calling ->ki_cancel. We need the ctx_lock roundtrip here to - * synchronize with them. 
In the cancellation case the list_del_init - * itself is not actually needed, but harmless so we keep it in to - * avoid further branches in the fast path. - */ spin_lock_irq(&ctx->completion_lock); - if (!mask && ret != -ECANCELED) { + if (!req->result && !READ_ONCE(poll->canceled)) { add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } hash_del(&req->hash_node); - io_poll_complete(req, mask, ret); + io_poll_complete(req, req->result, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req_find_next(req, nxt); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail_links(req); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); } -static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +static void io_poll_task_func(struct callback_head *cb) { - struct io_kiocb *req, *tmp; - struct req_batch rb; + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_kiocb *nxt = NULL; - rb.to_free = rb.need_iter = 0; - spin_lock_irq(&ctx->completion_lock); - llist_for_each_entry_safe(req, tmp, nodes, llist_node) { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); + io_poll_task_handler(req, &nxt); + if (nxt) { + struct io_ring_ctx *ctx = nxt->ctx; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) { - req->flags |= REQ_F_COMP_LOCKED; - io_free_req(req); - } + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(nxt, NULL); + mutex_unlock(&ctx->uring_lock); } - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); -} - -static void io_poll_flush(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct llist_node *nodes; - - nodes = llist_del_all(&req->ctx->poll_llist); - if (nodes) - __io_poll_flush(req->ctx, nodes); -} - -static void io_poll_trigger_evfd(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - eventfd_signal(req->ctx->cq_ev_fd, 1); - io_put_req(req); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = wait->private; - struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); - struct io_ring_ctx *ctx = req->ctx; - __poll_t mask = key_to_poll(key); - - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - list_del_init(&poll->wait.entry); - - /* - * Run completion inline if we can. We're using trylock here because - * we are violating the completion_lock -> poll wq lock ordering. - * If we have a link timeout we're going to need the completion_lock - * for finalizing the request, mark us as having grabbed that already. 
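The rework below moves poll completion out of the io-wq/llist flushing path and into task_work run by the submitting task (io_poll_task_func), but the userspace contract for IORING_OP_POLL_ADD is unchanged: the CQE result is the mangled poll mask, or -ECANCELED if the poll was removed. A minimal sketch, assuming the liburing io_uring_prep_poll_add() helper:

#include <liburing.h>
#include <poll.h>

/* sketch: one-shot poll on a descriptor; cqe->res carries the revents mask */
static int poll_once(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int revents;

	io_uring_prep_poll_add(sqe, fd, POLLIN);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	revents = cqe->res;	/* e.g. POLLIN, or -ECANCELED if removed */
	io_uring_cqe_seen(ring, cqe);
	return revents;
}
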
- */ - if (mask) { - unsigned long flags; - - if (llist_empty(&ctx->poll_llist) && - spin_trylock_irqsave(&ctx->completion_lock, flags)) { - bool trigger_ev; - - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - - trigger_ev = io_should_trigger_evfd(ctx); - if (trigger_ev && eventfd_signal_count()) { - trigger_ev = false; - req->work.func = io_poll_trigger_evfd; - } else { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - req = NULL; - } - spin_unlock_irqrestore(&ctx->completion_lock, flags); - __io_cqring_ev_posted(ctx, trigger_ev); - } else { - req->result = mask; - req->llist_node.next = NULL; - /* if the list wasn't empty, we're done */ - if (!llist_add(&req->llist_node, &ctx->poll_llist)) - req = NULL; - else - req->work.func = io_poll_flush; - } - } - if (req) - io_queue_async_work(req); + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->poll; - return 1; + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int error; -}; - static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - if (unlikely(pt->req->poll.head)) { - pt->error = -EINVAL; - return; - } - - pt->error = 0; - pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + __io_queue_proc(&pt->req->poll, pt, head); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3739,55 +4490,26 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + get_task_struct(current); + req->task = current; return 0; } -static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_poll_add(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - bool cancel = false; __poll_t mask; - INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); - - poll->head = NULL; - poll->done = false; - poll->canceled = false; - - ipt.pt._qproc = io_poll_queue_proc; - ipt.pt._key = poll->events; - ipt.req = req; - ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ - - /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); - poll->wait.private = poll; - INIT_LIST_HEAD(&req->list); + ipt.pt._qproc = io_poll_queue_proc; - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, + io_poll_wake); - spin_lock_irq(&ctx->completion_lock); - if (likely(poll->head)) { - spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt.error) - cancel = true; - ipt.error = 0; - mask = 0; - } - if (mask || ipt.error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock(&poll->head->lock); - } if (mask) { /* no async, we'd stolen it */ 
ipt.error = 0; io_poll_complete(req, mask, 0); @@ -3796,7 +4518,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) if (mask) { io_cqring_ev_posted(ctx); - io_put_req_find_next(req, nxt); + io_put_req(req); } return ipt.error; } @@ -4045,7 +4767,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt, int success_ret) + int success_ret) { unsigned long flags; int ret; @@ -4071,7 +4793,7 @@ done: if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -4087,11 +4809,11 @@ static int io_async_cancel_prep(struct io_kiocb *req, return 0; } -static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_async_cancel(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); + io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); return 0; } @@ -4226,6 +4948,15 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_EPOLL_CTL: ret = io_epoll_ctl_prep(req, sqe); break; + case IORING_OP_SPLICE: + ret = io_splice_prep(req, sqe); + break; + case IORING_OP_PROVIDE_BUFFERS: + ret = io_provide_buffers_prep(req, sqe); + break; + case IORING_OP_REMOVE_BUFFERS: + ret = io_remove_buffers_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4272,29 +5003,43 @@ static void io_cleanup_req(struct io_kiocb *req) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree((void *)(unsigned long)req->rw.addr); + /* fallthrough */ case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: if (io->rw.iov != io->rw.fast_iov) kfree(io->rw.iov); break; - case IORING_OP_SENDMSG: case IORING_OP_RECVMSG: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree(req->sr_msg.kbuf); + /* fallthrough */ + case IORING_OP_SENDMSG: if (io->msg.iov != io->msg.fast_iov) kfree(io->msg.iov); break; + case IORING_OP_RECV: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree(req->sr_msg.kbuf); + break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: case IORING_OP_STATX: putname(req->open.filename); break; + case IORING_OP_SPLICE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) + bool force_nonblock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -4311,7 +5056,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, nxt, force_nonblock); + ret = io_read(req, force_nonblock); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -4321,7 +5066,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, nxt, force_nonblock); + ret = io_write(req, force_nonblock); break; case IORING_OP_FSYNC: if (sqe) { @@ -4329,7 +5074,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_fsync(req, nxt, force_nonblock); + ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: if (sqe) { @@ -4337,7 +5082,7 @@ static int io_issue_sqe(struct io_kiocb 
*req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_poll_add(req, nxt); + ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: if (sqe) { @@ -4353,7 +5098,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_sync_file_range(req, nxt, force_nonblock); + ret = io_sync_file_range(req, force_nonblock); break; case IORING_OP_SENDMSG: case IORING_OP_SEND: @@ -4363,9 +5108,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, nxt, force_nonblock); + ret = io_sendmsg(req, force_nonblock); else - ret = io_send(req, nxt, force_nonblock); + ret = io_send(req, force_nonblock); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -4375,9 +5120,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, nxt, force_nonblock); + ret = io_recvmsg(req, force_nonblock); else - ret = io_recv(req, nxt, force_nonblock); + ret = io_recv(req, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -4401,7 +5146,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, nxt, force_nonblock); + ret = io_accept(req, force_nonblock); break; case IORING_OP_CONNECT: if (sqe) { @@ -4409,7 +5154,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, nxt, force_nonblock); + ret = io_connect(req, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -4417,7 +5162,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_async_cancel(req, nxt); + ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: if (sqe) { @@ -4425,7 +5170,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fallocate(req, nxt, force_nonblock); + ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: if (sqe) { @@ -4433,7 +5178,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat(req, nxt, force_nonblock); + ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: if (sqe) { @@ -4441,7 +5186,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, nxt, force_nonblock); + ret = io_close(req, force_nonblock); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -4457,7 +5202,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_statx(req, nxt, force_nonblock); + ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: if (sqe) { @@ -4465,7 +5210,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fadvise(req, nxt, force_nonblock); + ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: if (sqe) { @@ -4473,7 +5218,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_madvise(req, nxt, force_nonblock); + ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: if (sqe) { @@ -4481,7 +5226,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat2(req, nxt, force_nonblock); + ret = 
io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: if (sqe) { @@ -4489,7 +5234,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, nxt, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock); + break; + case IORING_OP_SPLICE: + if (sqe) { + ret = io_splice_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_splice(req, force_nonblock); + break; + case IORING_OP_PROVIDE_BUFFERS: + if (sqe) { + ret = io_provide_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_provide_buffers(req, force_nonblock); + break; + case IORING_OP_REMOVE_BUFFERS: + if (sqe) { + ret = io_remove_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_remove_buffers(req, force_nonblock); break; default: ret = -EINVAL; @@ -4522,7 +5291,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret = 0; /* if NO_CANCEL is set, we must still run the work */ @@ -4532,9 +5300,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->in_async = true; do { - ret = io_issue_sqe(req, NULL, &nxt, false); + ret = io_issue_sqe(req, NULL, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -4546,18 +5313,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } while (1); } - /* drop submission reference */ - io_put_req(req); - if (ret) { req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } - /* if a dependent link is ready, pass it back */ - if (!ret && nxt) - io_wq_assign_next(workptr, nxt); + io_steal_work(req, workptr); } static int io_req_needs_file(struct io_kiocb *req, int fd) @@ -4578,41 +5340,48 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK];; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; - int fd; - - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - - if (!io_req_needs_file(req, fd)) - return 0; + struct file *file; - if (flags & IOSQE_FIXED_FILE) { + if (fixed) { if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - req->file = io_file_from_index(ctx, fd); - if (!req->file) + file = io_file_from_index(ctx, fd); + if (!file) return -EBADF; - req->flags |= REQ_F_FIXED_FILE; - percpu_ref_get(&ctx->file_data->refs); + req->fixed_file_refs = ctx->file_data->cur_refs; + percpu_ref_get(req->fixed_file_refs); } else { - if (req->needs_fixed_file) - return -EBADF; trace_io_uring_file_get(ctx, fd); - req->file = io_file_get(state, fd); - if (unlikely(!req->file)) + file = __io_file_get(state, fd); + if (unlikely(!file)) return -EBADF; } + *out_file = file; return 0; } +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + int fd, unsigned int flags) +{ + bool fixed; + + if (!io_req_needs_file(req, fd)) + return 0; + + fixed = (flags & IOSQE_FIXED_FILE); + if (unlikely(!fixed && req->needs_fixed_file)) + return -EBADF; + + return io_file_get(state, req, fd, &req->file, fixed); +} + static int io_grab_files(struct 
io_kiocb *req) { int ret = -EBADF; @@ -4672,8 +5441,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (prev) { req_set_fail_links(prev); - io_async_find_and_cancel(ctx, req, prev->user_data, NULL, - -ETIME); + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); @@ -4710,6 +5478,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; + /* for polled retry, if flag is set, we already went through here */ + if (req->flags & REQ_F_POLLED) + return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); @@ -4723,7 +5494,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt; const struct cred *old_creds = NULL; int ret; @@ -4739,7 +5510,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, &nxt, true); + ret = io_issue_sqe(req, sqe, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -4747,6 +5518,11 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { + if (io_arm_poll_handler(req)) { + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + goto exit; + } punt: if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); @@ -4759,10 +5535,11 @@ punt: * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - goto done_req; + goto exit; } err: + nxt = NULL; /* drop submission reference */ io_put_req_find_next(req, &nxt); @@ -4779,15 +5556,14 @@ err: req_set_fail_links(req); io_put_req(req); } -done_req: if (nxt) { req = nxt; - nxt = NULL; if (req->flags & REQ_F_FORCE_ASYNC) goto punt; goto again; } +exit: if (old_creds) revert_creds(old_creds); } @@ -4829,14 +5605,15 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; - int ret, id; + int ret, id, fd; sqe_flags = READ_ONCE(sqe->flags); @@ -4846,6 +5623,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, goto err_req; } + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) { + ret = -EOPNOTSUPP; + goto err_req; + } + id = READ_ONCE(sqe->personality); if (id) { req->work.creds = idr_find(&ctx->personality_idr, id); @@ -4857,10 +5640,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| - IOSQE_ASYNC); + req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | + IOSQE_ASYNC | IOSQE_FIXED_FILE | + IOSQE_BUFFER_SELECT); - ret = io_req_set_file(state, req, sqe); + fd = READ_ONCE(sqe->fd); + ret = io_req_set_file(state, req, fd, sqe_flags); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -4976,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are 
done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe **sqe_ptr) +static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) { u32 *sq_array = ctx->sq_array; unsigned head; @@ -4991,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * though the application is the one updating it. */ head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); - if (likely(head < ctx->sq_entries)) { - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head; - *sqe_ptr = &ctx->sq_sqes[head]; - req->opcode = READ_ONCE((*sqe_ptr)->opcode); - req->user_data = READ_ONCE((*sqe_ptr)->user_data); - ctx->cached_sq_head++; - return true; - } + if (likely(head < ctx->sq_entries)) + return &ctx->sq_sqes[head]; /* drop invalid entries */ - ctx->cached_sq_head++; ctx->cached_sq_dropped++; WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); - return false; + return NULL; +} + +static inline void io_consume_sqe(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head++; +} + +static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = ctx->cached_sq_head; + req->opcode = READ_ONCE(sqe->opcode); + req->user_data = READ_ONCE(sqe->user_data); + req->io = NULL; + req->file = NULL; + req->ctx = ctx; + req->flags = 0; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = NULL; + req->result = 0; + INIT_IO_WORK(&req->work, io_wq_submit_work); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, @@ -5047,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct io_kiocb *req; int err; - req = io_get_req(ctx, statep); + sqe = io_get_sqe(ctx); + if (unlikely(!sqe)) { + io_consume_sqe(ctx); + break; + } + req = io_alloc_req(ctx, statep); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req, &sqe)) { - __io_req_do_free(req); - break; - } + io_init_req(ctx, req, sqe); + io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5079,7 +5881,6 @@ fail_req: *mm = ctx->sqo_mm; } - req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, async); @@ -5163,6 +5964,8 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { + if (current->task_works) + task_work_run(); cond_resched(); continue; } @@ -5194,6 +5997,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } + if (current->task_works) { + task_work_run(); + finish_wait(&ctx->sqo_wait, &wait); + continue; + } if (signal_pending(current)) flush_signals(current); schedule(); @@ -5213,6 +6021,9 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } + if (current->task_works) + task_work_run(); + set_fs(old_fs); if (cur_mm) { unuse_mm(cur_mm); @@ -5277,8 +6088,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret 
= 0; - if (io_cqring_events(ctx, false) >= min_events) - return 0; + do { + if (io_cqring_events(ctx, false) >= min_events) + return 0; + if (!current->task_works) + break; + task_work_run(); + } while (1); if (sig) { #ifdef CONFIG_COMPAT @@ -5298,6 +6114,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); + if (current->task_works) + task_work_run(); if (io_should_wake(&iowq, false)) break; schedule(); @@ -5344,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } -static void io_file_ref_exit_and_free(struct work_struct *work) -{ - struct fixed_file_data *data; - - data = container_of(work, struct fixed_file_data, ref_work); - - /* - * Ensure any percpu-ref atomic switch callback has run, it could have - * been in progress when the files were being unregistered. Once - * that's done, we can safely exit and free the ref and containing - * data structure. - */ - rcu_barrier(); - percpu_ref_exit(&data->refs); - kfree(data); -} - static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; + struct fixed_file_ref_node *ref_node = NULL; unsigned nr_tables, i; + unsigned long flags; if (!data) return -ENXIO; - percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); - flush_work(&data->ref_work); + spin_lock_irqsave(&data->lock, flags); + if (!list_empty(&data->ref_list)) + ref_node = list_first_entry(&data->ref_list, + struct fixed_file_ref_node, node); + spin_unlock_irqrestore(&data->lock, flags); + if (ref_node) + percpu_ref_kill(&ref_node->refs); + + percpu_ref_kill(&data->refs); + + /* wait for all refs nodes to complete */ wait_for_completion(&data->done); - io_ring_file_ref_flush(data); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - INIT_WORK(&data->ref_work, io_file_ref_exit_and_free); - queue_work(system_wq, &data->ref_work); + percpu_ref_exit(&data->refs); + kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -5424,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) struct sk_buff *skb; int i, nr_files; - if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - unsigned long inflight = ctx->user->unix_inflight + nr; - - if (inflight > task_rlimit(current, RLIMIT_NOFILE)) - return -EMFILE; - } - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); if (!fpl) return -ENOMEM; @@ -5605,50 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) } struct io_file_put { - struct llist_node llist; + struct list_head list; struct file *file; - struct completion *done; }; -static void io_ring_file_ref_flush(struct fixed_file_data *data) +static void io_file_put_work(struct work_struct *work) { + struct fixed_file_ref_node *ref_node; + struct fixed_file_data *file_data; + struct io_ring_ctx *ctx; struct io_file_put *pfile, *tmp; - struct llist_node *node; + unsigned long flags; - while ((node = llist_del_all(&data->put_llist)) != NULL) { - llist_for_each_entry_safe(pfile, tmp, node, llist) { - io_ring_file_put(data->ctx, pfile->file); - if (pfile->done) - complete(pfile->done); - else - kfree(pfile); - } + ref_node = container_of(work, struct fixed_file_ref_node, work); + file_data = ref_node->file_data; + ctx = file_data->ctx; + + list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) { + 
list_del_init(&pfile->list); + io_ring_file_put(ctx, pfile->file); + kfree(pfile); } + + spin_lock_irqsave(&file_data->lock, flags); + list_del_init(&ref_node->node); + spin_unlock_irqrestore(&file_data->lock, flags); + + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); + percpu_ref_put(&file_data->refs); } -static void io_ring_file_ref_switch(struct work_struct *work) +static void io_file_data_ref_zero(struct percpu_ref *ref) { - struct fixed_file_data *data; + struct fixed_file_ref_node *ref_node; + + ref_node = container_of(ref, struct fixed_file_ref_node, refs); - data = container_of(work, struct fixed_file_data, ref_work); - io_ring_file_ref_flush(data); - percpu_ref_switch_to_percpu(&data->refs); + queue_work(system_wq, &ref_node->work); } -static void io_file_data_ref_zero(struct percpu_ref *ref) +static struct fixed_file_ref_node *alloc_fixed_file_ref_node( + struct io_ring_ctx *ctx) { - struct fixed_file_data *data; + struct fixed_file_ref_node *ref_node; - data = container_of(ref, struct fixed_file_data, refs); + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return ERR_PTR(-ENOMEM); - /* - * We can't safely switch from inside this context, punt to wq. If - * the table ref is going away, the table is being unregistered. - * Don't queue up the async work for that case, the caller will - * handle it. - */ - if (!percpu_ref_is_dying(&data->refs)) - queue_work(system_wq, &data->ref_work); + if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->file_list); + INIT_WORK(&ref_node->work, io_file_put_work); + ref_node->file_data = ctx->file_data; + return ref_node; + +} + +static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -5659,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, struct file *file; int fd, ret = 0; unsigned i; + struct fixed_file_ref_node *ref_node; + unsigned long flags; if (ctx->file_data) return -EBUSY; @@ -5672,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; ctx->file_data->ctx = ctx; init_completion(&ctx->file_data->done); + INIT_LIST_HEAD(&ctx->file_data->ref_list); + spin_lock_init(&ctx->file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); ctx->file_data->table = kcalloc(nr_tables, @@ -5683,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; } - if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } - ctx->file_data->put_llist.first = NULL; - INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { percpu_ref_exit(&ctx->file_data->refs); @@ -5754,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } ret = io_sqe_files_scm(ctx); - if (ret) + if (ret) { io_sqe_files_unregister(ctx); + return ret; + } + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) { + io_sqe_files_unregister(ctx); + return PTR_ERR(ref_node); + } + + ctx->file_data->cur_refs = 
&ref_node->refs; + spin_lock_irqsave(&ctx->file_data->lock, flags); + list_add(&ref_node->node, &ctx->file_data->ref_list); + spin_unlock_irqrestore(&ctx->file_data->lock, flags); + percpu_ref_get(&ctx->file_data->refs); return ret; } @@ -5803,46 +6644,22 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static void io_atomic_switch(struct percpu_ref *ref) +static int io_queue_file_removal(struct fixed_file_data *data, + struct file *file) { - struct fixed_file_data *data; + struct io_file_put *pfile; + struct percpu_ref *refs = data->cur_refs; + struct fixed_file_ref_node *ref_node; - /* - * Juggle reference to ensure we hit zero, if needed, so we can - * switch back to percpu mode - */ - data = container_of(ref, struct fixed_file_data, refs); - percpu_ref_put(&data->refs); - percpu_ref_get(&data->refs); -} - -static bool io_queue_file_removal(struct fixed_file_data *data, - struct file *file) -{ - struct io_file_put *pfile, pfile_stack; - DECLARE_COMPLETION_ONSTACK(done); - - /* - * If we fail allocating the struct we need for doing async reomval - * of this file, just punt to sync and wait for it. - */ pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); - if (!pfile) { - pfile = &pfile_stack; - pfile->done = &done; - } + if (!pfile) + return -ENOMEM; + ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; - llist_add(&pfile->llist, &data->put_llist); + list_add(&pfile->list, &ref_node->file_list); - if (pfile == &pfile_stack) { - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); - wait_for_completion(&done); - flush_work(&data->ref_work); - return false; - } - - return true; + return 0; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -5850,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args) { struct fixed_file_data *data = ctx->file_data; - bool ref_switch = false; + struct fixed_file_ref_node *ref_node; struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; + unsigned long flags; + bool needs_switch = false; if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) + return PTR_ERR(ref_node); + done = 0; fds = u64_to_user_ptr(up->fds); while (nr_args) { @@ -5877,9 +6700,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { file = io_file_from_index(ctx, index); + err = io_queue_file_removal(data, file); + if (err) + break; table->files[index] = NULL; - if (io_queue_file_removal(data, file)) - ref_switch = true; + needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -5910,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, up->offset++; } - if (ref_switch) - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); + if (needs_switch) { + percpu_ref_kill(data->cur_refs); + spin_lock_irqsave(&data->lock, flags); + list_add(&ref_node->node, &data->ref_list); + data->cur_refs = &ref_node->refs; + spin_unlock_irqrestore(&data->lock, flags); + percpu_ref_get(&ctx->file_data->refs); + } else + destroy_fixed_file_ref_node(ref_node); return done ? 
done : err; } + static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -5932,20 +6765,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return __io_sqe_files_update(ctx, &up, nr_args); } -static void io_put_work(struct io_wq_work *work) +static void io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + /* Consider that io_steal_work() relies on this ref */ io_put_req(req); } -static void io_get_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - refcount_inc(&req->refs); -} - static int io_init_wq_offload(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -5956,8 +6783,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, int ret = 0; data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; + data.free_work = io_free_work; if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { /* Do QD, or 4 * CPUS, whatever is smallest */ @@ -6359,6 +7185,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } +static int __io_destroy_buffers(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + struct io_buffer *buf = p; + + __io_remove_buffers(ctx, buf, id, -1U); + return 0; +} + +static void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); + idr_destroy(&ctx->io_buffer_idr); +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); @@ -6369,6 +7210,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); + io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) @@ -6431,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static void io_ring_exit_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + + ctx = container_of(work, struct io_ring_ctx, exit_work); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + + wait_for_completion(&ctx->completions[0]); + io_ring_ctx_free(ctx); +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -6458,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_cqring_overflow_flush(ctx, true); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); - wait_for_completion(&ctx->completions[0]); - io_ring_ctx_free(ctx); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); + queue_work(system_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -6623,6 +7477,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; + if (current->task_works) + task_work_run(); + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; @@ -6669,7 +7526,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, min_complete = min(min_complete, ctx->cq_entries); - if (ctx->flags & IORING_SETUP_IOPOLL) { + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. 
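Worth noting for context: when a ring is created with both IORING_SETUP_SQPOLL and IORING_SETUP_IOPOLL, the sq thread now does the completion polling, so userspace only needs io_uring_enter() for wakeups. A minimal userspace sketch of requesting that combination, purely illustrative and not part of this diff (it assumes the uapi header and the raw io_uring_setup(2) syscall number are available, and SQPOLL typically needs privilege):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        struct io_uring_params p;
        int ring_fd;

        memset(&p, 0, sizeof(p));
        /* kernel sq thread submits *and* reaps polled completions */
        p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL;
        p.sq_thread_idle = 2000;        /* ms before the sq thread idles */

        ring_fd = syscall(__NR_io_uring_setup, 8, &p);
        if (ring_fd < 0) {
                perror("io_uring_setup");
                return 1;
        }
        printf("features: 0x%x\n", p.features); /* IORING_FEAT_FAST_POLL etc. */
        close(ring_fd);
        return 0;
}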
+ */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, &nr_events, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); @@ -6745,6 +7609,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } + seq_printf(m, "PollList:\n"); + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + req->task->task_works != NULL); + } + spin_unlock_irq(&ctx->completion_lock); mutex_unlock(&ctx->uring_lock); } @@ -6961,7 +7836,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -7239,6 +8114,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); @@ -7253,11 +8129,14 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7c84c4c027c4..89e21961d1ad 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; if (ctx->bio) @@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->is_readahead) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_readpage does. 
+ */ + if (!ctx->bio) + ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; if (ctx->is_readahead) ctx->bio->bi_opf |= REQ_RAHEAD; @@ -503,7 +511,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); int iomap_releasepage(struct page *page, gfp_t gfp_mask) { - trace_iomap_releasepage(page->mapping->host, page, 0, 0); + trace_iomap_releasepage(page->mapping->host, page_offset(page), + PAGE_SIZE); /* * mm accommodates an old ext3 case where clean pages might not have had @@ -520,7 +529,7 @@ EXPORT_SYMBOL_GPL(iomap_releasepage); void iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) { - trace_iomap_invalidatepage(page->mapping->host, page, offset, len); + trace_iomap_invalidatepage(page->mapping->host, offset, len); /* * If we are invalidating the entire page, clear the dirty state from it @@ -974,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, - struct iomap *iomap) -{ - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, - iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); -} - static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, void *data, struct iomap *iomap, struct iomap *srcmap) @@ -1000,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) - status = iomap_dax_zero(pos, offset, bytes, iomap); + status = dax_iomap_zero(pos, offset, bytes, iomap); else status = iomap_zero(inode, pos, offset, bytes, iomap, srcmap); @@ -1519,7 +1521,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) u64 end_offset; loff_t offset; - trace_iomap_writepage(inode, page, 0, 0); + trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* * Refuse to write the page out if we are called from reclaim context. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 23837926c0c5..20dde5aadcdd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -534,8 +534,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, /* * We are about to drop our additional submission reference, which - * might be the last reference to the dio. There are three three - * different ways we can progress here: + * might be the last reference to the dio. There are three different + * ways we can progress here: * * (a) If this is the last reference we will always complete and free * the dio ourselves. 
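The iomap_readpage_actor() hunk above falls back to a single-page bio_alloc() when the larger allocation fails, so a read never has to handle a partially built bio. A rough userspace analogue of that try-big-then-fall-back-to-the-minimum pattern; the function and sizes below are invented for illustration:

#include <stdlib.h>
#include <stdio.h>

/* Try to grab a buffer big enough for the whole range; if that fails,
 * fall back to the smallest unit that still lets the caller make
 * progress, mirroring the second bio_alloc() above. */
static void *alloc_span(size_t want, size_t unit, size_t *got)
{
        void *p = malloc(want);

        if (!p) {
                p = malloc(unit);
                *got = p ? unit : 0;
        } else {
                *got = want;
        }
        return p;
}

int main(void)
{
        size_t got;
        void *buf = alloc_span(1 << 20, 4096, &got);

        printf("got %zu bytes\n", got);
        free(buf);
        return 0;
}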
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6dc227b8c47e..4df19c66f597 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -41,14 +41,12 @@ DEFINE_EVENT(iomap_readpage_class, name, \ DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readpages); -DECLARE_EVENT_CLASS(iomap_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), +DECLARE_EVENT_CLASS(iomap_range_class, + TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), + TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) - __field(pgoff_t, pgoff) __field(loff_t, size) __field(unsigned long, offset) __field(unsigned int, length) @@ -56,29 +54,26 @@ DECLARE_EVENT_CLASS(iomap_page_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " "length %x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->pgoff, __entry->size, __entry->offset, __entry->length) ) -#define DEFINE_PAGE_EVENT(name) \ -DEFINE_EVENT(iomap_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ - unsigned int len), \ - TP_ARGS(inode, page, off, len)) -DEFINE_PAGE_EVENT(iomap_writepage); -DEFINE_PAGE_EVENT(iomap_releasepage); -DEFINE_PAGE_EVENT(iomap_invalidatepage); +#define DEFINE_RANGE_EVENT(name) \ +DEFINE_EVENT(iomap_range_class, name, \ + TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ + TP_ARGS(inode, off, len)) +DEFINE_RANGE_EVENT(iomap_writepage); +DEFINE_RANGE_EVENT(iomap_releasepage); +DEFINE_RANGE_EVENT(iomap_invalidatepage); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 27373f5792a4..e855d8260433 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -997,9 +997,10 @@ restart_loop: * journalled data) we need to unmap buffer and clear * more bits. We also need to be careful about the check * because the data page mapping can get cleared under - * out hands, which alse need not to clear more bits - * because the page and buffers will be freed and can - * never be reused once we are done with them. + * our hands. Note that if mapping == NULL, we don't + * need to make buffer unmapped because the page is + * already detached from the mapping and buffers cannot + * get reused. 
*/ mapping = READ_ONCE(bh->b_page->mapping); if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index d0f7a5abd9a9..fc2469a20fed 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); + atomic_set(&kn->iattr->nr_user_xattrs, 0); + atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); @@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags); + return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, return kernfs_xattr_set(kn, name, value, size, flags); } +static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { + ret = -ENOSPC; + goto dec_count_out; + } + + if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { + ret = -ENOSPC; + goto dec_size_out; + } + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (!ret && removed_size >= 0) + size = removed_size; + else if (!ret) + return 0; +dec_size_out: + atomic_sub(size, sz); +dec_count_out: + atomic_dec(nr); + return ret; +} + +static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (removed_size >= 0) { + atomic_sub(removed_size, sz); + atomic_dec(nr); + } + + return ret; +} + +static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *full_name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) + return -EOPNOTSUPP; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (value) + return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, + value, size, flags); + else + return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, + value, size, flags); + +} + static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, @@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = { .set = kernfs_vfs_xattr_set, }; +static const struct xattr_handler kernfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_user_xattr_set, +}; + const struct xattr_handler *kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, + &kernfs_user_xattr_handler, NULL }; 
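kernfs_vfs_user_xattr_add() above charges the per-node counters up front and unwinds the charge if either limit is exceeded or the store fails. A small userspace analogue of that charge-then-roll-back accounting with C11 atomics; the limits and names are made up and are not the KERNFS_* constants:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MAX_ENTRIES     128
#define MAX_BYTES       (128 * 1024)

static atomic_int  nr_entries;
static atomic_long total_bytes;

/* Charge the quota first; undo the charge on failure (mirror of -ENOSPC). */
static bool charge_xattr(size_t size)
{
        if (atomic_fetch_add(&nr_entries, 1) + 1 > MAX_ENTRIES)
                goto undo_count;
        if (atomic_fetch_add(&total_bytes, (long)size) + (long)size > MAX_BYTES)
                goto undo_size;
        return true;            /* caller may now store the attribute */

undo_size:
        atomic_fetch_sub(&total_bytes, (long)size);
undo_count:
        atomic_fetch_sub(&nr_entries, 1);
        return false;
}

int main(void)
{
        return charge_xattr(4096) ? 0 : 1;
}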
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2f3c51d55261..7ee97ef59184 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,6 +26,8 @@ struct kernfs_iattrs { struct timespec64 ia_ctime; struct simple_xattrs xattrs; + atomic_t nr_user_xattrs; + atomic_t user_xattr_size; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/libfs.c b/fs/libfs.c index c686bd9caac6..3759fbacf522 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -891,7 +891,7 @@ int simple_attr_open(struct inode *inode, struct file *file, { struct simple_attr *attr; - attr = kmalloc(sizeof(*attr), GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; @@ -931,9 +931,11 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, if (ret) return ret; - if (*ppos) { /* continued read */ + if (*ppos && attr->get_buf[0]) { + /* continued read */ size = strlen(attr->get_buf); - } else { /* first read */ + } else { + /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) diff --git a/fs/namei.c b/fs/namei.c index db6565c99825..a320371899cf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -503,9 +503,10 @@ struct nameidata { } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; - struct inode *link_inode; unsigned root_seq; int dfd; + kuid_t dir_uid; + umode_t dir_mode; } __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) @@ -530,52 +531,34 @@ static void restore_nameidata(void) kfree(now->stack); } -static int __nd_alloc_stack(struct nameidata *nd) +static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; - if (nd->flags & LOOKUP_RCU) { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_ATOMIC); - if (unlikely(!p)) - return -ECHILD; - } else { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_KERNEL); - if (unlikely(!p)) - return -ENOMEM; - } + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), + nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); + if (unlikely(!p)) + return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; - return 0; + return true; } /** - * path_connected - Verify that a path->dentry is below path->mnt.mnt_root - * @path: nameidate to verify + * path_connected - Verify that a dentry is below mnt.mnt_root * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. 
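The fs/libfs.c hunk above makes simple_attr_read() treat a non-zero offset as a continued read only when the cached buffer has actually been filled. A userspace sketch of that format-once, serve-from-the-cached-buffer read pattern; it is illustrative only, not the kernel function:

#include <stdio.h>
#include <string.h>

struct attr {
        char buf[24];
        long value;
};

static size_t attr_read(struct attr *a, char *dst, size_t len, size_t *pos)
{
        size_t size, n;

        if (*pos && a->buf[0])          /* continued read: buffer is valid */
                size = strlen(a->buf);
        else                            /* first read: format the value */
                size = snprintf(a->buf, sizeof(a->buf), "%ld\n", a->value);

        if (*pos >= size)
                return 0;
        n = size - *pos;
        if (n > len)
                n = len;
        memcpy(dst, a->buf + *pos, n);
        *pos += n;
        return n;
}

int main(void)
{
        struct attr a = { .value = 42 };
        char out[4];
        size_t pos = 0, n;

        while ((n = attr_read(&a, out, sizeof(out), &pos)) > 0)
                fwrite(out, 1, n, stdout);
        return 0;
}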
*/ -static bool path_connected(const struct path *path) +static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { - struct vfsmount *mnt = path->mnt; struct super_block *sb = mnt->mnt_sb; /* Bind mounts and multi-root filesystems can have disconnected paths */ if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; - return is_subdir(path->dentry, mnt->mnt_root); -} - -static inline int nd_alloc_stack(struct nameidata *nd) -{ - if (likely(nd->depth != EMBEDDED_LEVELS)) - return 0; - if (likely(nd->stack != nd->internal)) - return 0; - return __nd_alloc_stack(nd); + return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) @@ -608,10 +591,9 @@ static void terminate_walk(struct nameidata *nd) } /* path_put is needed afterwards regardless of success or failure */ -static bool legitimize_path(struct nameidata *nd, - struct path *path, unsigned seq) +static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { - int res = __legitimize_mnt(path->mnt, nd->m_seq); + int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; @@ -625,6 +607,12 @@ static bool legitimize_path(struct nameidata *nd, return !read_seqcount_retry(&path->dentry->d_seq, seq); } +static inline bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + return __legitimize_path(path, seq, nd->m_seq); +} + static bool legitimize_links(struct nameidata *nd) { int i; @@ -858,25 +846,6 @@ static int set_root(struct nameidata *nd) return 0; } -static void path_put_conditional(struct path *path, struct nameidata *nd) -{ - dput(path->dentry); - if (path->mnt != nd->path.mnt) - mntput(path->mnt); -} - -static inline void path_to_nameidata(const struct path *path, - struct nameidata *nd) -{ - if (!(nd->flags & LOOKUP_RCU)) { - dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) - mntput(nd->path.mnt); - } - nd->path.mnt = path->mnt; - nd->path.dentry = path->dentry; -} - static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) @@ -969,28 +938,21 @@ int sysctl_protected_regular __read_mostly; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - const struct inode *inode; - const struct inode *parent; - kuid_t puid; - if (!sysctl_protected_symlinks) return 0; /* Allowed if owner and follower match. */ - inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->inode; - if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. 
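may_follow_link() now takes the parent's uid and mode from nd->dir_uid and nd->dir_mode rather than chasing nd->inode. The protected_symlinks policy it implements, condensed into a standalone userspace predicate; a sketch that assumes the caller already knows the link and directory ownership, not kernel code:

#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Following is refused only for a symlink in a sticky, world-writable
 * directory whose owner matches neither the follower nor the directory
 * owner (the kernel then audits and returns -EACCES). */
static bool may_follow(uid_t fsuid, uid_t link_uid,
                       uid_t dir_uid, mode_t dir_mode)
{
        if (fsuid == link_uid)                          /* owner follows   */
                return true;
        if ((dir_mode & (S_ISVTX | S_IWOTH)) != (S_ISVTX | S_IWOTH))
                return true;                            /* dir not sticky+o+w */
        if (dir_uid == link_uid)                        /* dir owner == link owner */
                return true;
        return false;
}

int main(void)
{
        /* link owned by uid 1001 in a sticky, world-writable dir owned by root */
        return may_follow(1000, 1001, 0, 01777) ? 0 : 1;
}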
*/ - puid = parent->i_uid; - if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) + if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1113,63 +1075,6 @@ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, return 0; } -static __always_inline -const char *get_link(struct nameidata *nd) -{ - struct saved *last = nd->stack + nd->depth - 1; - struct dentry *dentry = last->link.dentry; - struct inode *inode = nd->link_inode; - int error; - const char *res; - - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) - return ERR_PTR(-ELOOP); - - if (!(nd->flags & LOOKUP_RCU)) { - touch_atime(&last->link); - cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - touch_atime(&last->link); - } - - error = security_inode_follow_link(dentry, inode, - nd->flags & LOOKUP_RCU); - if (unlikely(error)) - return ERR_PTR(error); - - nd->last_type = LAST_BIND; - res = READ_ONCE(inode->i_link); - if (!res) { - const char * (*get)(struct dentry *, struct inode *, - struct delayed_call *); - get = inode->i_op->get_link; - if (nd->flags & LOOKUP_RCU) { - res = get(NULL, inode, &last->done); - if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - res = get(dentry, inode, &last->done); - } - } else { - res = get(dentry, inode, &last->done); - } - if (IS_ERR_OR_NULL(res)) - return res; - } - if (*res == '/') { - error = nd_jump_root(nd); - if (unlikely(error)) - return ERR_PTR(error); - while (unlikely(*++res == '/')) - ; - } - if (!*res) - res = NULL; - return res; -} - /* * follow_up - Find the mountpoint of path's vfsmount * @@ -1203,19 +1108,59 @@ int follow_up(struct path *path) } EXPORT_SYMBOL(follow_up); +static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, + struct path *path, unsigned *seqp) +{ + while (mnt_has_parent(m)) { + struct dentry *mountpoint = m->mnt_mountpoint; + + m = m->mnt_parent; + if (unlikely(root->dentry == mountpoint && + root->mnt == &m->mnt)) + break; + if (mountpoint != m->mnt.mnt_root) { + path->mnt = &m->mnt; + path->dentry = mountpoint; + *seqp = read_seqcount_begin(&mountpoint->d_seq); + return true; + } + } + return false; +} + +static bool choose_mountpoint(struct mount *m, const struct path *root, + struct path *path) +{ + bool found; + + rcu_read_lock(); + while (1) { + unsigned seq, mseq = read_seqbegin(&mount_lock); + + found = choose_mountpoint_rcu(m, root, path, &seq); + if (unlikely(!found)) { + if (!read_seqretry(&mount_lock, mseq)) + break; + } else { + if (likely(__legitimize_path(path, seq, mseq))) + break; + rcu_read_unlock(); + path_put(path); + rcu_read_lock(); + } + } + rcu_read_unlock(); + return found; +} + /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, struct nameidata *nd, - bool *need_mntput) +static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { - struct vfsmount *mnt; - int err; - - if (!path->dentry->d_op || !path->dentry->d_op->d_automount) - return -EREMOTE; + struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to @@ -1228,138 +1173,91 @@ static int follow_automount(struct path *path, struct nameidata *nd, * as being automount points. 
These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) + dentry->d_inode) return -EISDIR; - nd->total_link_count++; - if (nd->total_link_count >= 40) + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; - mnt = path->dentry->d_op->d_automount(path); - if (IS_ERR(mnt)) { - /* - * The filesystem is allowed to return -EISDIR here to indicate - * it doesn't want to automount. For instance, autofs would do - * this so that its userspace daemon can mount on this dentry. - * - * However, we can only permit this if it's a terminal point in - * the path being looked up; if it wasn't then the remainder of - * the path is inaccessible and we should say so. - */ - if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) - return -EREMOTE; - return PTR_ERR(mnt); - } - - if (!mnt) /* mount collision */ - return 0; - - if (!*need_mntput) { - /* lock_mount() may release path->mnt on error */ - mntget(path->mnt); - *need_mntput = true; - } - err = finish_automount(mnt, path); - - switch (err) { - case -EBUSY: - /* Someone else made a mount here whilst we were busy */ - return 0; - case 0: - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - return 0; - default: - return err; - } - + return finish_automount(dentry->d_op->d_automount(path), path); } /* - * Handle a dentry that is managed in some way. - * - Flagged for transit management (autofs) - * - Flagged as mountpoint - * - Flagged as automount point - * - * This may only be called in refwalk mode. - * On success path->dentry is known positive. - * - * Serialization is taken care of in namespace.c + * mount traversal - out-of-line part. One note on ->d_flags accesses - + * dentries are pinned but not locked here, so negative dentry can go + * positive right under us. Use of smp_load_acquire() provides a barrier + * sufficient for ->d_inode and ->d_flags consistency. */ -static int follow_managed(struct path *path, struct nameidata *nd) +static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, + int *count, unsigned lookup_flags) { - struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned flags; + struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; - /* Given that we're not holding a lock here, we retain the value in a - * local variable for each dentry as we look at it so that we don't see - * the components of that value change under us */ - while (flags = smp_load_acquire(&path->dentry->d_flags), - unlikely(flags & DCACHE_MANAGED_DENTRY)) { + while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } - /* Transit to a mounted filesystem. */ - if (flags & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); - if (mounted) { + if (mounted) { // ... 
in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); + // here we know it's positive + flags = path->dentry->d_flags; need_mntput = true; continue; } - - /* Something is mounted on this dentry in another - * namespace and/or whatever was mounted there in this - * namespace got unmounted before lookup_mnt() could - * get it */ } - /* Handle an automount point */ - if (flags & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, nd, &need_mntput); - if (ret < 0) - break; - continue; - } + if (!(flags & DCACHE_NEED_AUTOMOUNT)) + break; - /* We didn't change the current path point */ - break; + // uncovered automount point + ret = follow_automount(path, count, lookup_flags); + flags = smp_load_acquire(&path->dentry->d_flags); + if (ret < 0) + break; } - if (need_mntput) { - if (path->mnt == mnt) - mntput(path->mnt); - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->flags |= LOOKUP_JUMPED; - } - if (ret == -EISDIR || !ret) - ret = 1; - if (ret > 0 && unlikely(d_flags_negative(flags))) + if (ret == -EISDIR) + ret = 0; + // possible if you race with several mount --move + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; - if (unlikely(ret < 0)) - path_put_conditional(path, nd); + *jumped = need_mntput; return ret; } +static inline int traverse_mounts(struct path *path, bool *jumped, + int *count, unsigned lookup_flags) +{ + unsigned flags = smp_load_acquire(&path->dentry->d_flags); + + /* fastpath */ + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { + *jumped = false; + if (unlikely(d_flags_negative(flags))) + return -ENOENT; + return 0; + } + return __traverse_mounts(path, flags, jumped, count, lookup_flags); +} + int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -1376,11 +1274,22 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(const struct path *path) +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) { - return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - path->dentry->d_op->d_manage(path, true) : 0; + struct vfsmount *mnt = path->mnt; + bool jumped; + int ret = traverse_mounts(path, &jumped, NULL, 0); + + if (path->mnt != mnt) + mntput(mnt); + return ret; } +EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if @@ -1389,204 +1298,88 @@ static inline int managed_dentry_rcu(const struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { + struct dentry *dentry = path->dentry; + unsigned int flags = dentry->d_flags; + + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) + return true; + + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; + for (;;) { - struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. 
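traverse_mounts() above keeps the common case inline: a single smp_load_acquire() of ->d_flags and an early return, with only managed dentries dropping into the out-of-line __traverse_mounts() loop. A userspace sketch of that acquire-load fastpath/slow-path split; the flag values and the stub slow path are invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_MANAGED    0x1     /* stand-in for DCACHE_MANAGED_DENTRY */
#define FLAG_NEGATIVE   0x2     /* stand-in for d_flags_negative()    */

struct node {
        _Atomic unsigned flags;
        /* ... payload published before the flags store ... */
};

/* out-of-line slow path: would walk mounts, automounts, etc. */
static int traverse_slow(struct node *n, unsigned flags, bool *jumped)
{
        (void)n; (void)flags;
        *jumped = true;
        return 0;
}

static inline int traverse(struct node *n, bool *jumped)
{
        /* acquire pairs with the writer publishing the node's payload */
        unsigned flags = atomic_load_explicit(&n->flags, memory_order_acquire);

        if (!(flags & FLAG_MANAGED)) {          /* fastpath: nothing mounted */
                *jumped = false;
                return (flags & FLAG_NEGATIVE) ? -1 /* -ENOENT in the kernel */ : 0;
        }
        return traverse_slow(n, flags, jumped);
}

int main(void)
{
        struct node n = { .flags = 0 };
        bool jumped;

        return traverse(&n, &jumped);
}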
*/ - switch (managed_dentry_rcu(path)) { - case -ECHILD: - default: - return false; - case -EISDIR: - return true; - case 0: - break; - } - - if (!d_mountpoint(path->dentry)) - return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); - - mounted = __lookup_mnt(path->mnt, path->dentry); - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return false; - path->mnt = &mounted->mnt; - path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; - *seqp = read_seqcount_begin(&path->dentry->d_seq); - /* - * Update the inode too. We don't need to re-check the - * dentry sequence number here after this d_inode read, - * because a mount-point is always pinned. - */ - *inode = path->dentry->d_inode; - } - return !read_seqretry(&mount_lock, nd->m_seq) && - !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); -} - -static int follow_dotdot_rcu(struct nameidata *nd) -{ - struct inode *inode = nd->inode; - - while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -ECHILD; - break; + if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { + int res = dentry->d_op->d_manage(path, true); + if (res) + return res == -EISDIR; + flags = dentry->d_flags; } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - struct dentry *old = nd->path.dentry; - struct dentry *parent = old->d_parent; - unsigned seq; - inode = parent->d_inode; - seq = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) - return -ECHILD; - nd->path.dentry = parent; - nd->seq = seq; - if (unlikely(!path_connected(&nd->path))) - return -ECHILD; - break; - } else { - struct mount *mnt = real_mount(nd->path.mnt); - struct mount *mparent = mnt->mnt_parent; - struct dentry *mountpoint = mnt->mnt_mountpoint; - struct inode *inode2 = mountpoint->d_inode; - unsigned seq = read_seqcount_begin(&mountpoint->d_seq); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (&mparent->mnt == nd->path.mnt) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - /* we know that mountpoint was pinned */ - nd->path.dentry = mountpoint; - nd->path.mnt = &mparent->mnt; - inode = inode2; - nd->seq = seq; + if (flags & DCACHE_MOUNTED) { + struct mount *mounted = __lookup_mnt(path->mnt, dentry); + if (mounted) { + path->mnt = &mounted->mnt; + dentry = path->dentry = mounted->mnt.mnt_root; + nd->flags |= LOOKUP_JUMPED; + *seqp = read_seqcount_begin(&dentry->d_seq); + *inode = dentry->d_inode; + /* + * We don't need to re-check ->d_seq after this + * ->d_inode read - there will be an RCU delay + * between mount hash removal and ->mnt_root + * becoming unpinned. + */ + flags = dentry->d_flags; + continue; + } + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; } + return !(flags & DCACHE_NEED_AUTOMOUNT); } - while (unlikely(d_mountpoint(nd->path.dentry))) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } - nd->inode = inode; - return 0; } -/* - * Follow down to the covering mount currently visible to userspace. At each - * point, the filesystem owning that dentry may be queried as to whether the - * caller is permitted to proceed or not. 
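choose_mountpoint_rcu(), __follow_mount_rcu() and the rcu-walk '..' handling all rely on the same discipline: sample the data under a sequence count and retry (or bail to ref-walk) when read_seqcount_retry()/read_seqretry() reports a concurrent writer. A simplified userspace sketch of that read-side retry loop; the kernel primitives handle additional ordering and preemption details not shown here:

#include <stdatomic.h>

struct seq_protected {
        _Atomic unsigned seq;   /* writers make this odd while updating */
        _Atomic int data;
};

static int read_stable(struct seq_protected *p)
{
        unsigned start;
        int v;

        do {
                /* wait for an even (quiescent) sequence value */
                do {
                        start = atomic_load_explicit(&p->seq,
                                                      memory_order_acquire);
                } while (start & 1);

                v = atomic_load_explicit(&p->data, memory_order_relaxed);

                /* order the data read before re-checking the sequence */
                atomic_thread_fence(memory_order_acquire);
        } while (start != atomic_load_explicit(&p->seq, memory_order_relaxed));

        return v;
}

int main(void)
{
        struct seq_protected p = { .seq = 0, .data = 42 };

        return read_stable(&p) == 42 ? 0 : 1;
}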
- */ -int follow_down(struct path *path) +static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, + struct path *path, struct inode **inode, + unsigned int *seqp) { - unsigned managed; + bool jumped; int ret; - while (managed = READ_ONCE(path->dentry->d_flags), - unlikely(managed & DCACHE_MANAGED_DENTRY)) { - /* Allow the filesystem to manage the transit without i_mutex - * being held. - * - * We indicate to the filesystem if someone is trying to mount - * something here. This gives autofs the chance to deny anyone - * other than its daemon the right to mount on its - * superstructure. - * - * The filesystem may sleep at this point. - */ - if (managed & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path, false); - if (ret < 0) - return ret == -EISDIR ? 0 : ret; - } - - /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - continue; - } - - /* Don't handle automount points here */ - break; - } - return 0; -} -EXPORT_SYMBOL(follow_down); - -/* - * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() - */ -static void follow_mount(struct path *path) -{ - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + path->mnt = nd->path.mnt; + path->dentry = dentry; + if (nd->flags & LOOKUP_RCU) { + unsigned int seq = *seqp; + if (unlikely(!*inode)) + return -ENOENT; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 0; + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + // *path might've been clobbered by __follow_mount_rcu() + path->mnt = nd->path.mnt; + path->dentry = dentry; } -} - -static int path_parent_directory(struct path *path) -{ - struct dentry *old = path->dentry; - /* rare case of legitimate dget_parent()... 
*/ - path->dentry = dget_parent(path->dentry); - dput(old); - if (unlikely(!path_connected(path))) - return -ENOENT; - return 0; -} - -static int follow_dotdot(struct nameidata *nd) -{ - while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -EXDEV; - break; - } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - int ret = path_parent_directory(&nd->path); - if (ret) - return ret; - break; - } - if (!follow_up(&nd->path)) - break; + ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); + if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -EXDEV; + ret = -EXDEV; + else + nd->flags |= LOOKUP_JUMPED; } - follow_mount(&nd->path); - nd->inode = nd->path.dentry->d_inode; - return 0; + if (unlikely(ret)) { + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); + } else { + *inode = d_backing_inode(path->dentry); + *seqp = 0; /* out of RCU mode, so the value doesn't matter */ + } + return ret; } /* @@ -1643,14 +1436,12 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd, + struct inode **inode, + unsigned *seqp) { - struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int status = 1; - int err; /* * Rename seqlock is not required here because in the off chance @@ -1659,12 +1450,11 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { if (unlazy_walk(nd)) - return -ECHILD; - return 0; + return ERR_PTR(-ECHILD); + return NULL; } /* @@ -1672,9 +1462,8 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); - negative = d_is_negative(dentry); if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); /* * This sequence count validates that the parent had no @@ -1684,46 +1473,30 @@ static int lookup_fast(struct nameidata *nd, * enough, we can use __read_seqcount_retry here. */ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); *seqp = seq; status = d_revalidate(dentry, nd->flags); - if (likely(status > 0)) { - /* - * Note: do negative dentry check after revalidation in - * case that drops it. 
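lookup_fast() above now returns the dentry itself, using the kernel's ERR_PTR()/IS_ERR() convention to fold "found", "not found" and "error" into a single pointer instead of an int plus output parameters. A standalone userspace rendering of that convention, with a toy lookup() standing in for the real dcache walk:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error)      { return (void *)(intptr_t)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)(intptr_t)ptr; }
static inline int   IS_ERR(const void *ptr)
{
        return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct entry { const char *name; };

static struct entry the_entry = { "hello" };

/* one return value encodes: found / not found (NULL) / error */
static struct entry *lookup(const char *name)
{
        if (!name)
                return ERR_PTR(-EINVAL);        /* hard error             */
        if (name[0] == '\0')
                return NULL;                    /* negative: nothing here */
        return &the_entry;                      /* positive result        */
}

int main(void)
{
        struct entry *e = lookup(NULL);

        if (IS_ERR(e))
                printf("lookup failed: %ld\n", PTR_ERR(e));
        return 0;
}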
- */ - if (unlikely(negative)) - return -ENOENT; - path->mnt = mnt; - path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 1; - } + if (likely(status > 0)) + return dentry; if (unlazy_child(nd, dentry, seq)) - return -ECHILD; + return ERR_PTR(-ECHILD); if (unlikely(status == -ECHILD)) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) - return 0; + return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); - return status; + return ERR_PTR(status); } - - path->mnt = mnt; - path->dentry = dentry; - err = follow_managed(path, nd); - if (likely(err > 0)) - *inode = d_backing_inode(path->dentry); - return err; + return dentry; } /* Fast lookup failed, do it the slow way */ @@ -1788,81 +1561,107 @@ static inline int may_lookup(struct nameidata *nd) return inode_permission(nd->inode, MAY_EXEC); } -static inline int handle_dots(struct nameidata *nd, int type) +static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) { - if (type == LAST_DOTDOT) { - int error = 0; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) + return -ELOOP; - if (!nd->root.mnt) { - error = set_root(nd); - if (error) - return error; - } - if (nd->flags & LOOKUP_RCU) - error = follow_dotdot_rcu(nd); - else - error = follow_dotdot(nd); - if (error) - return error; + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + if (likely(nd_alloc_stack(nd))) + return 0; - if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { - /* - * If there was a racing rename or mount along our - * path, then we can't be sure that ".." hasn't jumped - * above nd->root (and so userspace should retry or use - * some fallback). - */ - smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) - return -EAGAIN; - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) - return -EAGAIN; - } + if (nd->flags & LOOKUP_RCU) { + // we need to grab link before we do unlazy. 
And we can't skip + // unlazy even if we fail to grab the link - cleanup needs it + bool grabbed_link = legitimize_path(nd, link, seq); + + if (unlazy_walk(nd) != 0 || !grabbed_link) + return -ECHILD; + + if (nd_alloc_stack(nd)) + return 0; } - return 0; + return -ENOMEM; } -static int pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq) +enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; + +static const char *pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, unsigned seq, int flags) { - int error; struct saved *last; - if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { - path_to_nameidata(link, nd); - return -ELOOP; - } - if (!(nd->flags & LOOKUP_RCU)) { - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - } - error = nd_alloc_stack(nd); + const char *res; + int error = reserve_stack(nd, link, seq); + if (unlikely(error)) { - if (error == -ECHILD) { - if (unlikely(!legitimize_path(nd, link, seq))) { - drop_links(nd); - nd->depth = 0; - nd->flags &= ~LOOKUP_RCU; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - rcu_read_unlock(); - } else if (likely(unlazy_walk(nd)) == 0) - error = nd_alloc_stack(nd); - } - if (error) { + if (!(nd->flags & LOOKUP_RCU)) path_put(link); - return error; - } + return ERR_PTR(error); } - last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - nd->link_inode = inode; last->seq = seq; - return 1; -} -enum {WALK_FOLLOW = 1, WALK_MORE = 2}; + if (flags & WALK_TRAILING) { + error = may_follow_link(nd, inode); + if (unlikely(error)) + return ERR_PTR(error); + } + + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + return ERR_PTR(-ELOOP); + + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } + + error = security_inode_follow_link(link->dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); + + res = READ_ONCE(inode->i_link); + if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; + if (nd->flags & LOOKUP_RCU) { + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + res = get(link->dentry, inode, &last->done); + } + } else { + res = get(link->dentry, inode, &last->done); + } + if (!res) + goto all_done; + if (IS_ERR(res)) + return res; + } + if (*res == '/') { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + while (unlikely(*++res == '/')) + ; + } + if (*res) + return res; +all_done: // pure jump + put_link(nd); + return NULL; +} /* * Do we need to follow links? We _really_ want to be able @@ -1870,63 +1669,187 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2}; * so we keep a cache of "no, this doesn't need follow_link" * for the common case. 
*/ -static inline int step_into(struct nameidata *nd, struct path *path, - int flags, struct inode *inode, unsigned seq) +static const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry, struct inode *inode, unsigned seq) { - if (!(flags & WALK_MORE) && nd->depth) - put_link(nd); - if (likely(!d_is_symlink(path->dentry)) || - !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { + struct path path; + int err = handle_mounts(nd, dentry, &path, &inode, &seq); + + if (err < 0) + return ERR_PTR(err); + if (likely(!d_is_symlink(path.dentry)) || + ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || + (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - path_to_nameidata(path, nd); + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path.mnt) + mntput(nd->path.mnt); + } + nd->path = path; nd->inode = inode; nd->seq = seq; - return 0; + return NULL; } - /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { - if (read_seqcount_retry(&path->dentry->d_seq, seq)) - return -ECHILD; + /* make sure that d_is_symlink above matches inode */ + if (read_seqcount_retry(&path.dentry->d_seq, seq)) + return ERR_PTR(-ECHILD); + } else { + if (path.mnt == nd->path.mnt) + mntget(path.mnt); } - return pick_link(nd, path, inode, seq); + return pick_link(nd, &path, inode, seq, flags); } -static int walk_component(struct nameidata *nd, int flags) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) { - struct path path; + struct dentry *parent, *old; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; + unsigned seq; + if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), + &nd->root, &path, &seq)) + goto in_root; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-ECHILD); + nd->path = path; + nd->inode = path.dentry->d_inode; + nd->seq = seq; + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); + /* we know that mountpoint was pinned */ + } + old = nd->path.dentry; + parent = old->d_parent; + *inodep = parent->d_inode; + *seqp = read_seqcount_begin(&parent->d_seq); + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + if (unlikely(!path_connected(nd->path.mnt, parent))) + return ERR_PTR(-ECHILD); + return parent; +in_root: + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-ECHILD); + return NULL; +} + +static struct dentry *follow_dotdot(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) +{ + struct dentry *parent; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; + + if (!choose_mountpoint(real_mount(nd->path.mnt), + &nd->root, &path)) + goto in_root; + path_put(&nd->path); + nd->path = path; + nd->inode = path.dentry->d_inode; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-EXDEV); + } + /* rare case of legitimate dget_parent()... 
*/ + parent = dget_parent(nd->path.dentry); + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); + return ERR_PTR(-ENOENT); + } + *seqp = 0; + *inodep = parent->d_inode; + return parent; + +in_root: + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-EXDEV); + dget(nd->path.dentry); + return NULL; +} + +static const char *handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + const char *error = NULL; + struct dentry *parent; + struct inode *inode; + unsigned seq; + + if (!nd->root.mnt) { + error = ERR_PTR(set_root(nd)); + if (error) + return error; + } + if (nd->flags & LOOKUP_RCU) + parent = follow_dotdot_rcu(nd, &inode, &seq); + else + parent = follow_dotdot(nd, &inode, &seq); + if (IS_ERR(parent)) + return ERR_CAST(parent); + if (unlikely(!parent)) + error = step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq); + else + error = step_into(nd, WALK_NOFOLLOW, + parent, inode, seq); + if (unlikely(error)) + return error; + + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + return ERR_PTR(-EAGAIN); + if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + return ERR_PTR(-EAGAIN); + } + } + return NULL; +} + +static const char *walk_component(struct nameidata *nd, int flags) +{ + struct dentry *dentry; struct inode *inode; unsigned seq; - int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - err = handle_dots(nd, nd->last_type); if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return err; + return handle_dots(nd, nd->last_type); } - err = lookup_fast(nd, &path, &inode, &seq); - if (unlikely(err <= 0)) { - if (err < 0) - return err; - path.dentry = lookup_slow(&nd->last, nd->path.dentry, - nd->flags); - if (IS_ERR(path.dentry)) - return PTR_ERR(path.dentry); - - path.mnt = nd->path.mnt; - err = follow_managed(&path, nd); - if (unlikely(err < 0)) - return err; - - seq = 0; /* we are already out of RCU mode */ - inode = d_backing_inode(path.dentry); + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (unlikely(!dentry)) { + dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); } - - return step_into(nd, &path, flags, inode, seq); + if (!(flags & WALK_MORE) && nd->depth) + put_link(nd); + return step_into(nd, flags, dentry, inode, seq); } /* @@ -2167,8 +2090,11 @@ static inline u64 hash_name(const void *salt, const char *name) */ static int link_path_walk(const char *name, struct nameidata *nd) { + int depth = 0; // depth <= nd->depth int err; + nd->last_type = LAST_ROOT; + nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') @@ -2178,6 +2104,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. 
*/ for(;;) { + const char *link; u64 hash_len; int type; @@ -2227,36 +2154,27 @@ static int link_path_walk(const char *name, struct nameidata *nd) } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: - /* pathname body, done */ - if (!nd->depth) - return 0; - name = nd->stack[nd->depth - 1].name; - /* trailing symlink, done */ - if (!name) + /* pathname or trailing symlink, done */ + if (!depth) { + nd->dir_uid = nd->inode->i_uid; + nd->dir_mode = nd->inode->i_mode; + nd->flags &= ~LOOKUP_PARENT; return 0; + } /* last component of nested symlink */ - err = walk_component(nd, WALK_FOLLOW); + name = nd->stack[--depth].name; + link = walk_component(nd, 0); } else { /* not the last component */ - err = walk_component(nd, WALK_FOLLOW | WALK_MORE); + link = walk_component(nd, WALK_MORE); } - if (err < 0) - return err; - - if (err) { - const char *s = get_link(nd); - - if (IS_ERR(s)) - return PTR_ERR(s); - err = 0; - if (unlikely(!s)) { - /* jumped */ - put_link(nd); - } else { - nd->stack[nd->depth - 1].name = name; - name = s; - continue; - } + if (unlikely(link)) { + if (IS_ERR(link)) + return PTR_ERR(link); + /* a symlink to follow */ + nd->stack[depth++].name = name; + name = link; + continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { @@ -2279,8 +2197,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); @@ -2370,54 +2287,20 @@ static const char *path_init(struct nameidata *nd, unsigned flags) return s; } -static const char *trailing_symlink(struct nameidata *nd) -{ - const char *s; - int error = may_follow_link(nd); - if (unlikely(error)) - return ERR_PTR(error); - nd->flags |= LOOKUP_PARENT; - nd->stack[0].name = NULL; - s = get_link(nd); - return s ? s : ""; -} - -static inline int lookup_last(struct nameidata *nd) +static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, 0); + return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { - struct path path = nd->path; - struct inode *inode = nd->inode; - unsigned seq = nd->seq; - int err; - - if (nd->flags & LOOKUP_RCU) { - /* - * don't bother with unlazy_walk on failure - we are - * at the very beginning of walk, so we lose nothing - * if we simply redo everything in non-RCU mode - */ - if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) - return -ECHILD; - } else { - dget(path.dentry); - err = follow_managed(&path, nd); - if (unlikely(err < 0)) - return err; - inode = d_backing_inode(path.dentry); - seq = 0; - } - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - return 0; + if (!(nd->flags & LOOKUP_RCU)) + dget(nd->path.dentry); + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. 
*/ @@ -2432,16 +2315,19 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path s = ERR_PTR(err); } - while (!(err = link_path_walk(s, nd)) - && ((err = lookup_last(nd)) > 0)) { - s = trailing_symlink(nd); - } + while (!(err = link_path_walk(s, nd)) && + (s = lookup_last(nd)) != NULL) + ; if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) { *path = nd->path; nd->path.mnt = NULL; @@ -2470,7 +2356,8 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, path->dentry, 0); + audit_inode(name, path->dentry, + flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); putname(name); return retval; @@ -2718,24 +2605,23 @@ int path_pts(struct path *path) /* Find something mounted on "pts" in the same directory as * the input path. */ - struct dentry *child, *parent; - struct qstr this; - int ret; - - ret = path_parent_directory(path); - if (ret) - return ret; + struct dentry *parent = dget_parent(path->dentry); + struct dentry *child; + struct qstr this = QSTR_INIT("pts", 3); - parent = path->dentry; - this.name = "pts"; - this.len = 3; + if (unlikely(!path_connected(path->mnt, parent))) { + dput(parent); + return -ENOENT; + } + dput(path->dentry); + path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (!child) return -ENOENT; path->dentry = child; dput(parent); - follow_mount(path); + follow_down(path); return 0; } #endif @@ -2748,88 +2634,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -/** - * path_mountpoint - look up a path to be umounted - * @nd: lookup context - * @flags: lookup flags - * @path: pointer to container for result - * - * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Returns error otherwise. 
- */ -static int -path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) -{ - const char *s = path_init(nd, flags); - int err; - - while (!(err = link_path_walk(s, nd)) && - (err = lookup_last(nd)) > 0) { - s = trailing_symlink(nd); - } - if (!err && (nd->flags & LOOKUP_RCU)) - err = unlazy_walk(nd); - if (!err) - err = handle_lookup_down(nd); - if (!err) { - *path = nd->path; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - } - terminate_walk(nd); - return err; -} - -static int -filename_mountpoint(int dfd, struct filename *name, struct path *path, - unsigned int flags) -{ - struct nameidata nd; - int error; - if (IS_ERR(name)) - return PTR_ERR(name); - set_nameidata(&nd, dfd, name); - error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); - if (unlikely(error == -ECHILD)) - error = path_mountpoint(&nd, flags, path); - if (unlikely(error == -ESTALE)) - error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); - if (likely(!error)) - audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); - restore_nameidata(); - putname(name); - return error; -} - -/** - * user_path_mountpoint_at - lookup a path from userland in order to umount it - * @dfd: directory file descriptor - * @name: pathname from userland - * @flags: lookup flags - * @path: pointer to container to hold result - * - * A umount is a special case for path walking. We're not actually interested - * in the inode in this situation, and ESTALE errors can be a problem. We - * simply want track down the dentry and vfsmount attached at the mountpoint - * and avoid revalidating the last component. - * - * Returns 0 and populates "path" on success. - */ -int -user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, - struct path *path) -{ - return filename_mountpoint(dfd, getname(name), path, flags); -} - -int -kern_path_mountpoint(int dfd, const char *name, struct path *path, - unsigned int flags) -{ - return filename_mountpoint(dfd, getname_kernel(name), path, flags); -} -EXPORT_SYMBOL(kern_path_mountpoint); - int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); @@ -3127,18 +2931,14 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m * * Returns an error code otherwise. */ -static int atomic_open(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct file *file, - const struct open_flags *op, - int open_flag, umode_t mode) +static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, + struct file *file, + int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; - if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */ - open_flag &= ~O_TRUNC; - if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; @@ -3149,19 +2949,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { - /* - * We didn't have the inode before the open, so check open - * permission here. 
- */ - int acc_mode = op->acc_mode; - if (file->f_mode & FMODE_CREATED) { - WARN_ON(!(open_flag & O_CREAT)); - fsnotify_create(dir, dentry); - acc_mode = 0; + if (unlikely(dentry != file->f_path.dentry)) { + dput(dentry); + dentry = dget(file->f_path.dentry); } - error = may_open(&file->f_path, acc_mode, open_flag); - if (WARN_ON(error > 0)) - error = -EINVAL; } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { @@ -3169,19 +2960,15 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } - if (file->f_mode & FMODE_CREATED) - fsnotify_create(dir, dentry); - if (unlikely(d_is_negative(dentry))) { + if (unlikely(d_is_negative(dentry))) error = -ENOENT; - } else { - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; - } } } - dput(dentry); - return error; + if (error) { + dput(dentry); + dentry = ERR_PTR(error); + } + return dentry; } /* @@ -3199,10 +2986,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * * An error code is returned on failure. */ -static int lookup_open(struct nameidata *nd, struct path *path, - struct file *file, - const struct open_flags *op, - bool got_write) +static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + const struct open_flags *op, + bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -3213,7 +2999,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) - return -ENOENT; + return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); @@ -3221,7 +3007,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return dentry; } if (d_in_lookup(dentry)) break; @@ -3237,7 +3023,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ - goto out_no_open; + return dentry; } /* @@ -3249,41 +3035,27 @@ static int lookup_open(struct nameidata *nd, struct path *path, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). 
*/ + if (unlikely(!got_write)) + open_flag &= ~O_TRUNC; if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); - if (unlikely(!got_write)) { - create_error = -EROFS; - open_flag &= ~O_CREAT; - if (open_flag & (O_EXCL | O_TRUNC)) - goto no_open; - /* No side effects, safe to clear O_CREAT */ - } else { + if (likely(got_write)) create_error = may_o_create(&nd->path, dentry, mode); - if (create_error) { - open_flag &= ~O_CREAT; - if (open_flag & O_EXCL) - goto no_open; - } - } - } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && - unlikely(!got_write)) { - /* - * No O_CREATE -> atomicity not a requirement -> fall - * back to lookup + open - */ - goto no_open; + else + create_error = -EROFS; } - + if (create_error) + open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { - error = atomic_open(nd, dentry, path, file, op, open_flag, - mode); - if (unlikely(error == -ENOENT) && create_error) - error = create_error; - return error; + dentry = atomic_open(nd, dentry, file, open_flag, mode); + if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) + dentry = ERR_PTR(create_error); + return dentry; } -no_open: if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); @@ -3310,78 +3082,60 @@ no_open: open_flag & O_EXCL); if (error) goto out_dput; - fsnotify_create(dir_inode, dentry); } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } -out_no_open: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; + return dentry; out_dput: dput(dentry); - return error; + return ERR_PTR(error); } -/* - * Handle the last step of open() - */ -static int do_last(struct nameidata *nd, +static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; - kuid_t dir_uid = nd->inode->i_uid; - umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; - bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; - int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path path; + struct dentry *dentry; + const char *res; int error; - nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { - error = handle_dots(nd, nd->last_type); - if (unlikely(error)) - return error; - goto finish_open; + if (nd->depth) + put_link(nd); + return handle_dots(nd, nd->last_type); } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, &path, &inode, &seq); - if (likely(error > 0)) + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (likely(dentry)) goto finish_lookup; - if (error < 0) - return error; - - BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ - /* - * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED - * has been cleared when we got to the last component we are - * about to look up - */ - error = complete_walk(nd); - if (error) - return error; - + if (nd->flags & LOOKUP_RCU) { + error = unlazy_walk(nd); + if (unlikely(error)) + return ERR_PTR(error); + } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? 
*/ if (unlikely(nd->last.name[nd->last.len])) - return -EISDIR; + return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { @@ -3398,108 +3152,90 @@ static int do_last(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - error = lookup_open(nd, &path, file, op, got_write); + dentry = lookup_open(nd, file, op, got_write); + if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) + fsnotify_create(dir->d_inode, dentry); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); - if (error) - goto out; - - if (file->f_mode & FMODE_OPENED) { - if ((file->f_mode & FMODE_CREATED) || - !S_ISREG(file_inode(file)->i_mode)) - will_truncate = false; - - audit_inode(nd->name, file->f_path.dentry, 0); - goto opened; - } + if (got_write) + mnt_drop_write(nd->path.mnt); - if (file->f_mode & FMODE_CREATED) { - /* Don't check for write permission, don't truncate */ - open_flag &= ~O_TRUNC; - will_truncate = false; - acc_mode = 0; - path_to_nameidata(&path, nd); - goto finish_open_created; - } + if (IS_ERR(dentry)) + return ERR_CAST(dentry); - /* - * If atomic_open() acquired write access it is dropped now due to - * possible mount and symlink following (this might be optimized away if - * necessary...) - */ - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; + if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { + dput(nd->path.dentry); + nd->path.dentry = dentry; + return NULL; } - error = follow_managed(&path, nd); - if (unlikely(error < 0)) - return error; +finish_lookup: + if (nd->depth) + put_link(nd); + res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + if (unlikely(res)) + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + return res; +} - /* - * create/update audit record if it already exists. - */ - audit_inode(nd->name, path.dentry, 0); +/* + * Handle the last step of open() + */ +static int do_open(struct nameidata *nd, + struct file *file, const struct open_flags *op) +{ + int open_flag = op->open_flag; + bool do_truncate; + int acc_mode; + int error; - if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { - path_to_nameidata(&path, nd); - return -EEXIST; + if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { + error = complete_walk(nd); + if (error) + return error; } - - seq = 0; /* out of RCU mode, so the value doesn't matter */ - inode = d_backing_inode(path.dentry); -finish_lookup: - error = step_into(nd, &path, 0, inode, seq); - if (unlikely(error)) - return error; -finish_open: - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... 
*/ - error = complete_walk(nd); - if (error) - return error; - audit_inode(nd->name, nd->path.dentry, 0); + if (!(file->f_mode & FMODE_CREATED)) + audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { - error = -EISDIR; + if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) + return -EEXIST; if (d_is_dir(nd->path.dentry)) - goto out; - error = may_create_in_sticky(dir_mode, dir_uid, + return -EISDIR; + error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) - goto out; + return error; } - error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) - goto out; - if (!d_is_reg(nd->path.dentry)) - will_truncate = false; + return -ENOTDIR; - if (will_truncate) { + do_truncate = false; + acc_mode = op->acc_mode; + if (file->f_mode & FMODE_CREATED) { + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + acc_mode = 0; + } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) - goto out; - got_write = true; + return error; + do_truncate = true; } -finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - error = vfs_open(&nd->path, file); - if (error) - goto out; -opened: - error = ima_file_check(file, op->acc_mode); - if (!error && will_truncate) + if (!error && !(file->f_mode & FMODE_OPENED)) + error = vfs_open(&nd->path, file); + if (!error) + error = ima_file_check(file, op->acc_mode); + if (!error && do_truncate) error = handle_truncate(file); -out: if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } - if (got_write) + if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } @@ -3604,10 +3340,10 @@ static struct file *path_openat(struct nameidata *nd, } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && - (error = do_last(nd, file, op)) > 0) { - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - s = trailing_symlink(nd); - } + (s = open_last_lookups(nd, file, op)) != NULL) + ; + if (!error) + error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { diff --git a/fs/namespace.c b/fs/namespace.c index 85b5f7bea82e..a28e4db075ed 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1669,7 +1669,7 @@ int ksys_umount(char __user *name, int flags) struct path path; struct mount *mnt; int retval; - int lookup_flags = 0; + int lookup_flags = LOOKUP_MOUNTPOINT; if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; @@ -1680,7 +1680,7 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); @@ -2697,45 +2697,32 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, + struct path *path, int mnt_flags) { - struct mountpoint *mp; - struct mount *parent; - int err; + struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; - mp = lock_mount(path); - if (IS_ERR(mp)) - return PTR_ERR(mp); - - parent = real_mount(path->mnt); - err = 
-EINVAL; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) - goto unlock; + return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) - goto unlock; + return -EINVAL; } /* Refuse the same filesystem on the same mount point */ - err = -EBUSY; if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) - goto unlock; + return -EBUSY; - err = -EINVAL; if (d_is_symlink(newmnt->mnt.mnt_root)) - goto unlock; + return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, parent, mp); - -unlock: - unlock_mount(mp); - return err; + return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -2748,6 +2735,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; + struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; @@ -2768,7 +2756,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + mp = lock_mount(mountpoint); + if (IS_ERR(mp)) { + mntput(mnt); + return PTR_ERR(mp); + } + error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); + unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -2829,23 +2823,63 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, struct path *path) { - struct mount *mnt = real_mount(m); + struct dentry *dentry = path->dentry; + struct mountpoint *mp; + struct mount *mnt; int err; + + if (!m) + return 0; + if (IS_ERR(m)) + return PTR_ERR(m); + + mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == path->dentry) { + m->mnt_root == dentry) { err = -ELOOP; - goto fail; + goto discard; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - if (!err) - return 0; -fail: + /* + * we don't want to use lock_mount() - in this case finding something + * that overmounts our mountpoint to be means "quitely drop what we've + * got", not "try to mount it on top". + */ + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) { + err = -ENOENT; + goto discard_locked; + } + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, dentry))) { + rcu_read_unlock(); + err = 0; + goto discard_locked; + } + rcu_read_unlock(); + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto discard_locked; + } + + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(mp); + if (unlikely(err)) + goto discard; + mntput(m); + return 0; + +discard_locked: + namespace_unlock(); + inode_unlock(dentry->d_inode); +discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 40b6c5ac46c0..88e1763e02f3 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -164,7 +164,7 @@ config ROOT_NFS If you want your system to mount its root file system via NFS, choose Y here. This is common practice for managing systems without local permanent storage. 
For details, read - <file:Documentation/filesystems/nfs/nfsroot.txt>. + <file:Documentation/admin-guide/nfs/nfsroot.rst>. Most people say N here. diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 690221747b47..d1a0e2c8b1b4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) err = ext_tree_remove(bl, true, 0, LLONG_MAX); WARN_ON(err); - kfree(bl); + kfree_rcu(bl, bl_layout.plh_rcu); } static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode, diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 549350259840..6a2033131c06 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp, #define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 -#define RCA4_TYPE_MASK_ALL 0xf31f +#define PNFS_FF_RCA4_TYPE_MASK_READ 16 +#define PNFS_FF_RCA4_TYPE_MASK_RW 17 +#define RCA4_TYPE_MASK_ALL 0x3f31f struct cb_recallanyargs { uint32_t craa_objs_to_keep; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index cd4c6bc81cae..e61dbc9b86ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -121,31 +121,31 @@ out: */ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, const nfs4_stateid *stateid) + __must_hold(RCU) { struct nfs_server *server; struct inode *inode; struct pnfs_layout_hdr *lo; + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + if (!pnfs_layout_is_valid(lo)) + continue; if (stateid != NULL && !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) continue; + if (!nfs_sb_active(server->super)) + continue; inode = igrab(lo->plh_inode); - if (!inode) - return ERR_PTR(-EAGAIN); - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - return ERR_PTR(-EAGAIN); - } - return inode; + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - + rcu_read_unlock(); return ERR_PTR(-ENOENT); } @@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, struct inode *inode; struct pnfs_layout_hdr *lo; + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { nfsi = NFS_I(lo->plh_inode); if (nfs_compare_fh(fh, &nfsi->fh)) continue; if (nfsi->layout != lo) continue; + if (!nfs_sb_active(server->super)) + continue; inode = igrab(lo->plh_inode); - if (!inode) - return ERR_PTR(-EAGAIN); - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - return ERR_PTR(-EAGAIN); - } - return inode; + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - + rcu_read_unlock(); return ERR_PTR(-ENOENT); } @@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, { struct inode *inode; - spin_lock(&clp->cl_lock); - rcu_read_lock(); inode = nfs_layout_find_inode_by_stateid(clp, stateid); if (inode == 
ERR_PTR(-ENOENT)) inode = nfs_layout_find_inode_by_fh(clp, fh); - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - return inode; } @@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true); switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list, &args->cbl_range, be32_to_cpu(args->cbl_stateid.seqid))) { @@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, struct cb_recallanyargs *args = argp; __be32 status; fmode_t flags = 0; + bool schedule_manager = false; status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); if (!cps->clp) /* set in cb_sequence */ @@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) pnfs_recall_all_layouts(cps->clp); + + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); + schedule_manager = true; + } + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state); + schedule_manager = true; + } + if (schedule_manager) + nfs4_schedule_state_manager(cps->clp); + out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 1865322de142..816e1427f17e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode) } static void +nfs_update_delegation_cred(struct nfs_delegation *delegation, + const struct cred *cred) +{ + const struct cred *old; + + if (cred_fscmp(delegation->cred, cred) != 0) { + old = xchg(&delegation->cred, get_cred(cred)); + put_cred(old); + } +} + +static void nfs_update_inplace_delegation(struct nfs_delegation *delegation, const struct nfs_delegation *update) { @@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, delegation->stateid.seqid = update->stateid.seqid; smp_wmb(); delegation->type = update->type; - if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + delegation->pagemod_limit = update->pagemod_limit; + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + delegation->change_attr = update->change_attr; + nfs_update_delegation_cred(delegation, update->cred); + /* smp_mb__before_atomic() is implicit due to xchg() */ + clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); atomic_long_inc(&nfs_active_delegations); + } } } @@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) return ret; } -/** - * nfs_client_return_marked_delegations - return previously marked delegations - * @clp: nfs_client to process - * - * Note that this function is designed to be called by the state - * manager thread. For this reason, it cannot flush the dirty data, - * since that could deadlock in case of a state recovery error. - * - * Returns zero on success, or a negative errno value. 
- */ -int nfs_client_return_marked_delegations(struct nfs_client *clp) +static int nfs_server_return_marked_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; struct nfs_delegation *prev; - struct nfs_server *server; struct inode *inode; struct inode *place_holder = NULL; struct nfs_delegation *place_holder_deleg = NULL; @@ -569,78 +577,79 @@ restart: /* * To avoid quadratic looping we hold a reference * to an inode place_holder. Each time we restart, we - * list nfs_servers from the server of that inode, and - * delegation in the server from the delegations of that - * inode. + * list delegation in the server from the delegations + * of that inode. * prev is an RCU-protected pointer to a delegation which * wasn't marked for return and might be a good choice for * the next place_holder. */ - rcu_read_lock(); prev = NULL; + delegation = NULL; + rcu_read_lock(); if (place_holder) - server = NFS_SERVER(place_holder); - else - server = list_entry_rcu(clp->cl_superblocks.next, - struct nfs_server, client_link); - list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) { - delegation = NULL; - if (place_holder && server == NFS_SERVER(place_holder)) - delegation = rcu_dereference(NFS_I(place_holder)->delegation); - if (!delegation || delegation != place_holder_deleg) - delegation = list_entry_rcu(server->delegations.next, - struct nfs_delegation, super_list); - list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { - struct inode *to_put = NULL; - - if (!nfs_delegation_need_return(delegation)) { + delegation = rcu_dereference(NFS_I(place_holder)->delegation); + if (!delegation || delegation != place_holder_deleg) + delegation = list_entry_rcu(server->delegations.next, + struct nfs_delegation, super_list); + list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { + struct inode *to_put = NULL; + + if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags)) + continue; + if (!nfs_delegation_need_return(delegation)) { + if (nfs4_is_valid_delegation(delegation, 0)) prev = delegation; - continue; - } - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - - if (prev) { - struct inode *tmp; - - tmp = nfs_delegation_grab_inode(prev); - if (tmp) { - to_put = place_holder; - place_holder = tmp; - place_holder_deleg = prev; - } - } + continue; + } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - if (to_put) - iput(to_put); - nfs_sb_deactive(server->super); - goto restart; + if (prev) { + struct inode *tmp = nfs_delegation_grab_inode(prev); + if (tmp) { + to_put = place_holder; + place_holder = tmp; + place_holder_deleg = prev; } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + } + + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { rcu_read_unlock(); + iput(to_put); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); - if (to_put) - iput(to_put); + iput(to_put); - err = nfs_end_delegation_return(inode, delegation, 0); - iput(inode); - nfs_sb_deactive(server->super); - cond_resched(); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); - if (place_holder) - iput(place_holder); - return err; - } + err = nfs_end_delegation_return(inode, delegation, 0); + iput(inode); + cond_resched(); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + goto out; } rcu_read_unlock(); - if 
(place_holder) - iput(place_holder); - return 0; +out: + iput(place_holder); + return err; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + return nfs_client_for_each_server(clp, + nfs_server_return_marked_delegations, NULL); } /** @@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) rcu_read_unlock(); } -/** - * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done - * @clp: nfs_client to process - * - */ -void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; - restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_INODE_FREEING, - &delegation->flags) || - test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags) || - test_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); - rcu_read_unlock(); - if (delegation != NULL) { - if (nfs_detach_delegation(NFS_I(inode), delegation, - server) != NULL) - nfs_free_delegation(delegation); - /* Match nfs_start_delegation_return_locked */ - nfs_put_delegation(delegation); - } - iput(inode); - nfs_sb_deactive(server->super); - cond_resched(); - goto restart; +restart_locked: + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + goto restart_locked; + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + if (delegation != NULL) { + if (nfs_detach_delegation(NFS_I(inode), delegation, + server) != NULL) + nfs_free_delegation(delegation); + /* Match nfs_start_delegation_return_locked */ + nfs_put_delegation(delegation); } + iput(inode); + cond_resched(); + goto restart; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations, + NULL); } static inline bool nfs4_server_rebooted(const struct nfs_client *clp) @@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode, nfs_remove_bad_delegation(inode, stateid); } -/** - * nfs_reap_expired_delegations - reap expired delegations - * @clp: nfs_client to process - * - * Iterates through all the delegations 
associated with this server and - * checks if they have may have been revoked. This function is usually - * expected to be called in cases where the server may have lost its - * lease. - */ -void nfs_reap_expired_delegations(struct nfs_client *clp) +static int nfs_server_reap_expired_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; const struct cred *cred; nfs4_stateid stateid; - restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_INODE_FREEING, - &delegation->flags) || - test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags) || - test_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - cred = get_cred_rcu(delegation->cred); - nfs4_stateid_copy(&stateid, &delegation->stateid); - clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); - rcu_read_unlock(); - nfs_delegation_test_free_expired(inode, &stateid, cred); - put_cred(cred); - if (nfs4_server_rebooted(clp)) { - nfs_inode_mark_test_expired_delegation(server,inode); - iput(inode); - nfs_sb_deactive(server->super); - return; - } +restart_locked: + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + goto restart_locked; + spin_lock(&delegation->lock); + cred = get_cred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + spin_unlock(&delegation->lock); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + nfs_delegation_test_free_expired(inode, &stateid, cred); + put_cred(cred); + if (!nfs4_server_rebooted(server->nfs_client)) { iput(inode); - nfs_sb_deactive(server->super); cond_resched(); goto restart; } + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + return -EAGAIN; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. 
+ */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations, + NULL); } void nfs_inode_find_delegation_state_and_recover(struct inode *inode, @@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - bool ret; + bool ret = false; flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); + if (!delegation) + goto out; + spin_lock(&delegation->lock); ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); @@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, if (cred) *cred = get_cred(delegation->cred); } + spin_unlock(&delegation->lock); +out: rcu_read_unlock(); return ret; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 193d6fb363b7..5a331da5f55a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -141,10 +141,9 @@ struct nfs_cache_array { int size; int eof_index; u64 last_cookie; - struct nfs_cache_array_entry array[0]; + struct nfs_cache_array_entry array[]; }; -typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; @@ -153,7 +152,7 @@ typedef struct { u64 *dir_cookie; u64 last_cookie; loff_t current_index; - decode_dirent_t decode; + loff_t prev_index; unsigned long dir_verifier; unsigned long timestamp; @@ -240,6 +239,25 @@ out: return ret; } +static inline +int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return in_compat_syscall(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +static +bool nfs_readdir_use_cookie(const struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return false; + return true; +} + static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { @@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des !nfs_readdir_inode_mapping_valid(nfsi)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->ctx->pos) { + } else if (new_pos < desc->prev_index) { if (ctx->duped > 0 && ctx->dup_cookie == *desc->dir_cookie) { if (printk_ratelimit()) { @@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des ctx->dup_cookie = *desc->dir_cookie; ctx->duped = -1; } - desc->ctx->pos = new_pos; + if (nfs_readdir_use_cookie(desc->file)) + desc->ctx->pos = *desc->dir_cookie; + else + desc->ctx->pos = new_pos; + desc->prev_index = new_pos; desc->cache_entry_index = i; return 0; } @@ -376,9 +398,10 @@ error: static int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { + struct inode *inode = file_inode(desc->file); int error; - error = desc->decode(xdr, entry, desc->plus); + error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; @@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) if (desc->page_index == 0) { desc->current_index = 0; + desc->prev_index = 0; desc->last_cookie = 0; } do { @@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) desc->eof = true; break; } - desc->ctx->pos++; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = 
array->last_cookie; + if (nfs_readdir_use_cookie(file)) + desc->ctx->pos = *desc->dir_cookie; + else + desc->ctx->pos++; if (ctx->duped != 0) ctx->duped = 1; } @@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - nfs_readdir_descriptor_t my_desc, - *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; + nfs_readdir_descriptor_t my_desc = { + .file = file, + .ctx = ctx, + .dir_cookie = &dir_ctx->dir_cookie, + .plus = nfs_use_readdirplus(inode, ctx), + }, + *desc = &my_desc; int res = 0; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - memset(desc, 0, sizeof(*desc)); - - desc->file = file; - desc->ctx = ctx; - desc->dir_cookie = &dir_ctx->dir_cookie; - desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = nfs_use_readdirplus(inode, ctx); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) } if (offset != filp->f_pos) { filp->f_pos = offset; - dir_ctx->dir_cookie = 0; + if (nfs_readdir_use_cookie(filp)) + dir_ctx->dir_cookie = offset; + else + dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } inode_unlock(inode); @@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; -static unsigned long nfs_access_max_cachesize = ULONG_MAX; +static unsigned long nfs_access_max_cachesize = 4*1024*1024; module_param(nfs_access_max_cachesize, ulong, 0644); MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); @@ -2489,7 +2516,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; - lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || cred_fscmp(cred, cache->cred) != 0) @@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto out; } @@ -2732,14 +2760,7 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - /* Always try fast lookups first */ - rcu_read_lock(); - res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK); - rcu_read_unlock(); - if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) { - /* Fast lookup failed, try the slow way */ - res = nfs_do_access(inode, cred, mask); - } + res = nfs_do_access(inode, cred, mask); out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b768a0b42e82..a57e7c72c7f4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -94,7 +94,7 @@ struct nfs_direct_req { #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ /* for read */ #define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space 
page after read */ - struct nfs_writeverf verf; /* unstable write verifier */ +#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ }; static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; @@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } -/* - * nfs_direct_select_verf - select the right verifier - * @dreq - direct request possibly spanning multiple servers - * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs - * @commit_idx - commit bucket index for the DS - * - * returns the correct verifier to use given the role of the server - */ -static struct nfs_writeverf * -nfs_direct_select_verf(struct nfs_direct_req *dreq, - struct nfs_client *ds_clp, - int commit_idx) -{ - struct nfs_writeverf *verfp = &dreq->verf; - -#ifdef CONFIG_NFS_V4_1 - /* - * pNFS is in use, use the DS verf except commit_through_mds is set - * for layout segment where nbuckets is zero. - */ - if (ds_clp && dreq->ds_cinfo.nbuckets > 0) { - if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) - verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; - else - WARN_ON_ONCE(1); - } -#endif - return verfp; -} - - -/* - * nfs_direct_set_hdr_verf - set the write/commit verifier - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verfs - * - * Set the server's (MDS or DS) "seen" verifier - */ -static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - WARN_ON_ONCE(verfp->committed >= 0); - memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); - WARN_ON_ONCE(verfp->committed < 0); -} - -static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, - const struct nfs_writeverf *v2) -{ - return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); -} - -/* - * nfs_direct_cmp_hdr_verf - compare verifier for pgio header - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verf - * - * set the server's "seen" verf if not initialized. 
- * returns result of comparison between @hdr->verf and the "seen" - * verf of the server used by @hdr (DS or MDS) - */ -static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - if (verfp->committed < 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - return 0; - } - return nfs_direct_cmp_verf(verfp, &hdr->verf); -} - -/* - * nfs_direct_cmp_commit_data_verf - compare verifier for commit data - * @dreq - direct request possibly spanning multiple servers - * @data - commit data to validate against previously seen verf - * - * returns result of comparison between @data->verf and the verf of - * the server used by @data (DS or MDS) - */ -static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, - struct nfs_commit_data *data) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, data->ds_clp, - data->ds_commit_index); - - /* verifier not set so always fail */ - if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE) - return 1; - - return nfs_direct_cmp_verf(verfp, data->res.verf); -} - /** * nfs_direct_IO - NFS address space operation for direct I/O * @iocb: target I/O control block @@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); - dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ + pnfs_init_ds_commit_info(&dreq->ds_cinfo); INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); - nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); + pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) @@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; @@ -605,15 +506,30 @@ out: } static void +nfs_direct_join_group(struct list_head *list, struct inode *inode) +{ + struct nfs_page *req, *next; + + list_for_each_entry(req, list, wb_list) { + if (req->wb_head != req || req->wb_this_page == req) + continue; + for (next = req->wb_this_page; + next != req->wb_head; + next = next->wb_this_page) { + nfs_list_remove_request(next); + nfs_release_request(next); + } + nfs_join_page_group(req, inode); + } +} + +static void nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); -#ifdef CONFIG_NFS_V4_1 - if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) - NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); -#endif + pnfs_recover_commit_reqs(list, cinfo); nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } @@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + nfs_direct_join_group(&reqs, dreq->inode); + dreq->count = 0; dreq->max_count = 0; list_for_each_entry(req, &reqs, wb_list) dreq->max_count += req->wb_bytes; - 
dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) static void nfs_direct_commit_complete(struct nfs_commit_data *data) { + const struct nfs_writeverf *verf = data->res.verf; struct nfs_direct_req *dreq = data->dreq; struct nfs_commit_info cinfo; struct nfs_page *req; int status = data->task.tk_status; + if (status < 0) { + /* Errors in commit are fatal */ + dreq->error = status; + dreq->max_count = 0; + dreq->count = 0; + dreq->flags = NFS_ODIRECT_DONE; + } else if (dreq->flags == NFS_ODIRECT_DONE) + status = dreq->error; + nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + if (status >= 0 && !nfs_write_match_verf(verf, req)) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, * so reset wb_nio. */ req->wb_nio = 0; - /* Note the rewrite will go through mds */ nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else + } else /* Error or match */ nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq = cinfo->dreq; spin_lock(&dreq->lock); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (dreq->flags != NFS_ODIRECT_DONE) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; spin_unlock(&dreq->lock); nfs_mark_request_commit(req, NULL, cinfo, 0); } @@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) nfs_direct_write_reschedule(dreq); } +static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) +{ + struct nfs_commit_info cinfo; + struct nfs_page *req; + LIST_HEAD(reqs); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); + nfs_list_remove_request(req); + nfs_release_request(req); + nfs_unlock_and_release_request(req); + } +} + static void nfs_direct_write_schedule_work(struct work_struct *work) { struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); @@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: + nfs_direct_write_clear_reqs(dreq); nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); nfs_direct_complete(dreq); } @@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (hdr->good_bytes != 0) { - if (nfs_write_need_commit(hdr)) { - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) - request_commit = true; - else if (dreq->flags == 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - request_commit = true; - dreq->flags = NFS_ODIRECT_DO_COMMIT; - } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - request_commit = true; - if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) - dreq->flags = - NFS_ODIRECT_RESCHED_WRITES; - } + if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { + switch (dreq->flags) { + case 0: + dreq->flags = NFS_ODIRECT_DO_COMMIT; + request_commit = true; + break; + case NFS_ODIRECT_RESCHED_WRITES: + case NFS_ODIRECT_DO_COMMIT: + request_commit = true; } } 
spin_unlock(&dreq->lock); @@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); nfs_start_io_direct(inode); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 89bd5581f317..963800037609 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -152,12 +152,13 @@ static int nfs_dns_upcall(struct cache_detail *cd, struct cache_head *ch) { struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); - int ret; - ret = nfs_cache_upcall(cd, key->hostname); - if (ret) - ret = sunrpc_cache_pipe_upcall(cd, ch); - return ret; + if (test_and_set_bit(CACHE_PENDING, &ch->flags)) + return 0; + if (!nfs_cache_upcall(cd, key->hostname)) + return 0; + clear_bit(CACHE_PENDING, &ch->flags); + return sunrpc_cache_pipe_upcall_timeout(cd, ch); } static int nfs_dns_match(struct cache_head *ca, diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index c9b605f6c9cb..a13e69009f19 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); MODULE_DESCRIPTION("The NFSv4 file layout driver"); #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) +static const struct pnfs_commit_ops filelayout_commit_ops; static loff_t filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, @@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; + struct inode *inode; flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); - flo->commit_info.nbuckets = 0; - kfree(flo->commit_info.buckets); - flo->commit_info.buckets = NULL; + inode = flo->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg); + spin_unlock(&inode->i_lock); } _filelayout_free_lseg(fl); } -static int -filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size, i; - - if (fl->commit_through_mds) - return 0; - - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - if (cinfo->ds->nbuckets >= size) { - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of <multipath_list4, fh> keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). 
- * */ - return 0; - } - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; - } - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets >= size) - goto out; - for (i = 0; i < cinfo->ds->nbuckets; i++) { - list_splice(&cinfo->ds->buckets[i].written, - &buckets[i].written); - list_splice(&cinfo->ds->buckets[i].committing, - &buckets[i].committing); - buckets[i].direct_verf.committed = - cinfo->ds->buckets[i].direct_verf.committed; - buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; - buckets[i].clseg = cinfo->ds->buckets[i].clseg; - } - swap(cinfo->ds->buckets, buckets); - cinfo->ds->nbuckets = size; -out: - spin_unlock(&cinfo->inode->i_lock); - kfree(buckets); - return 0; -} - static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -938,9 +884,6 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - struct nfs_commit_info cinfo; - int status; - pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, @@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - goto out_mds; - } - return; -out_mds: - nfs_pageio_reset_write_mds(pgio); + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { @@ -1078,36 +1011,6 @@ out_err: return -EAGAIN; } -/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest - * for @page - * @cinfo - commit info for current inode - * @page - page to search for matching head request - * - * Returns a the head request if one is found, otherwise returns NULL. - */ -static struct nfs_page * -filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) -{ - struct nfs_page *freq, *t; - struct pnfs_commit_bucket *b; - int i; - - /* Linearly search the commit lists for each bucket until a matching - * request is found */ - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - list_for_each_entry_safe(freq, t, &b->written, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - list_for_each_entry_safe(freq, t, &b->committing, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - } - - return NULL; -} - static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) @@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return flo != NULL ? 
&flo->generic_hdr : NULL; + if (flo == NULL) + return NULL; + pnfs_init_ds_commit_info(&flo->commit_info); + flo->commit_info.ops = &filelayout_commit_ops; + return &flo->generic_hdr; } static void filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) { - kfree(FILELAYOUT_FROM_HDR(lo)); + kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu); } static struct pnfs_ds_commit_info * @@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode) return &FILELAYOUT_FROM_HDR(layout)->commit_info; } +static void +filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + new = pnfs_alloc_commit_array(size, GFP_NOIO); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static const struct pnfs_commit_ops filelayout_commit_ops = { + .setup_ds_info = filelayout_setup_ds_info, + .release_ds_info = filelayout_release_ds_info, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .search_commit_reqs = pnfs_generic_search_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, - .mark_request_commit = filelayout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = filelayout_search_commit_reqs, - .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index bb9148b83166..7d399f72ebbb 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -32,6 +32,7 @@ static unsigned short io_maxretrans; +static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, @@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { + pnfs_init_ds_commit_info(&ffl->commit_info); INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); ffl->last_report_time = ktime_get(); + ffl->commit_info.ops = &ff_layout_commit_ops; return &ffl->generic_hdr; } else return NULL; @@ -59,14 +62,14 @@ 
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) static void ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) { + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_ds_err *err, *n; - list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, - list) { + list_for_each_entry_safe(err, n, &ffl->error_list, list) { list_del(&err->list); kfree(err); } - kfree(FF_LAYOUT_FROM_HDR(lo)); + kfree_rcu(ffl, generic_hdr.plh_rcu); } static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) @@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { - int i; - - if (fls->mirror_array) { - for (i = 0; i < fls->mirror_array_cnt; i++) { - /* normally mirror_ds is freed in - * .free_deviceid_node but we still do it here - * for .alloc_lseg error path */ - ff_layout_put_mirror(fls->mirror_array[i]); - } - kfree(fls->mirror_array); - fls->mirror_array = NULL; - } -} - -static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) -{ - int ret = 0; + u32 i; - dprintk("--> %s\n", __func__); - - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. Use MDS i/o\n", - __func__); - ret = -EINVAL; - } - - dprintk("--> %s returns %d\n", __func__, ret); - return ret; + for (i = 0; i < fls->mirror_array_cnt; i++) + ff_layout_put_mirror(fls->mirror_array[i]); } static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) @@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } static bool +ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, + struct pnfs_layout_segment *l2) +{ + const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); + u32 i; + + if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) + return false; + for (i = 0; i < fl1->mirror_array_cnt; i++) { + if (fl1->mirror_array[i] != fl2->mirror_array[i]) + return false; + } + return true; +} + +static bool ff_lseg_range_is_after(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { @@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new->pls_range.length); if (new_end < old->pls_range.offset) return false; + if (!ff_lseg_match_mirrors(new, old)) + return false; /* Mergeable: copy info from 'old' to 'new' */ if (new_end < old_end) @@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; rc = -ENOMEM; - fls = kzalloc(sizeof(*fls), gfp_flags); + fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt), + gfp_flags); if (!fls) goto out_err_free; fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; - fls->mirror_array = kcalloc(fls->mirror_array_cnt, - sizeof(fls->mirror_array[0]), gfp_flags); - if (fls->mirror_array == NULL) - goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; @@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, out_sort_mirrors: ff_layout_sort_mirrors(fls); - rc = ff_layout_check_layout(lgr); - if (rc) - goto out_err_free; ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: @@ -560,17 +550,6 @@ out_err_free: goto out_free_page; } -static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) -{ - struct
pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return true; - - return false; -} - static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { @@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); inode = ffl->generic_hdr.plh_inode; spin_lock(&inode->i_lock); - if (!ff_layout_has_rw_segments(lseg->pls_layout)) { - ffl->commit_info.nbuckets = 0; - kfree(ffl->commit_info.buckets); - ffl->commit_info.buckets = NULL; - } + pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg); spin_unlock(&inode->i_lock); } _ff_layout_free_lseg(fls); } -/* Return 1 until we have multiple lsegs support */ -static int -ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) -{ - return 1; -} - static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_unlock(&mirror->lock); } -static int -ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size; - - if (cinfo->ds->nbuckets != 0) { - /* This assumes there is only one RW lseg per file. - * To support multiple lseg per file, we need to - * change struct pnfs_commit_bucket to allow dynamic - * increasing nbuckets. - */ - return 0; - } - - size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - else { - int i; - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = - NFS_INVALID_STABLE_HOW; - } - } - spin_unlock(&cinfo->inode->i_lock); - return 0; - } -} - static void ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) { @@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_READ, strict_iomode, GFP_KERNEL); @@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } static void +ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: - pnfs_generic_pg_check_layout(pgio); + ff_layout_pg_check_layout(pgio, req); /* Use full layout for now */ if (!pgio->pg_lseg) { ff_layout_pg_get_read(pgio, req, false); @@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, { struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; - struct nfs_commit_info cinfo; struct nfs4_pnfs_ds *ds; int i; - int status; retry: - pnfs_generic_pg_check_layout(pgio); + ff_layout_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 
nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_RW, false, GFP_NOFS); @@ -978,11 +906,6 @@ retry: if (pgio->pg_lseg == NULL) goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) - goto out_mds; - /* Use a direct mapping of ds_idx to pgio mirror_idx */ if (WARN_ON_ONCE(pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) @@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, offset, length, status, opnum, + GFP_NOIO); + switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: - return; - default: break; + case NFS4ERR_NXIO: + ff_layout_mark_ds_unreachable(lseg, idx); + /* Fallthrough */ + default: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); } - mirror = FF_LAYOUT_COMP(lseg, idx); - err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, - GFP_NOIO); - if (status == NFS4ERR_NXIO) - ff_layout_mark_ds_unreachable(lseg, idx); - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode) } static void +ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + + new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static void ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, @@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server, return 0; } +static const struct pnfs_commit_ops ff_layout_commit_ops = { + .setup_ds_info = ff_layout_setup_ds_info, + .release_ds_info = ff_layout_release_ds_info, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, .free_deviceid_node = ff_layout_free_deviceid_node, - .mark_request_commit = pnfs_layout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .commit_pagelist = ff_layout_commit_pagelist, .read_pagelist = ff_layout_read_pagelist, .write_pagelist = 
ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 2f369966abf7..354a031c69b1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment { u64 stripe_unit; u32 flags; u32 mirror_array_cnt; - struct nfs4_ff_layout_mirror **mirror_array; + struct nfs4_ff_layout_mirror *mirror_array[]; }; struct nfs4_flexfile_layout { diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e113fcb4bb4c..ccc88be88d6a 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = { { "4.0", Opt_vers_4_0 }, { "4.1", Opt_vers_4_1 }, { "4.2", Opt_vers_4_2 }, + {} }; enum { @@ -202,13 +203,14 @@ enum { nr__Opt_xprt }; -static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = { +static const struct constant_table nfs_xprt_protocol_tokens[] = { { "rdma", Opt_xprt_rdma }, { "rdma6", Opt_xprt_rdma6 }, { "tcp", Opt_xprt_tcp }, { "tcp6", Opt_xprt_tcp6 }, { "udp", Opt_xprt_udp }, { "udp6", Opt_xprt_udp6 }, + {} }; enum { @@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = { { "spkm3i", Opt_sec_spkmi }, { "spkm3p", Opt_sec_spkmp }, { "sys", Opt_sec_sys }, + {} }; /* @@ -1135,7 +1138,7 @@ out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); out_invalid_transport_udp: - return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp"); + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } #endif @@ -1257,7 +1260,7 @@ out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; out_invalid_transport_udp: - return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp"); + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b012c2668a1f..aaeeb4659bff 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -73,6 +73,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) struct inode *inode; char *name; int error = -ENOMEM; + unsigned long kflags = 0, kflags_out = 0; name = kstrdup(fc->source, GFP_KERNEL); if (!name) @@ -83,11 +84,14 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) if (fsinfo.fattr == NULL) goto out_name; + fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(fsinfo.fattr->label)) + goto out_fattr; error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); nfs_errorf(fc, "NFS: Couldn't getattr on root"); - goto out_fattr; + goto out_label; } inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL); @@ -95,12 +99,12 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) dprintk("nfs_get_root: get root inode failed\n"); error = PTR_ERR(inode); nfs_errorf(fc, "NFS: Couldn't get root inode"); - goto out_fattr; + goto out_label; } error = nfs_superblock_set_dummy_root(s, inode); if (error != 0) - goto out_fattr; + goto out_label; /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already @@ -111,7 +115,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) dprintk("nfs_get_root: 
get root dentry failed\n"); error = PTR_ERR(root); nfs_errorf(fc, "NFS: Couldn't get root dentry"); - goto out_fattr; + goto out_label; } security_d_instantiate(root, inode); @@ -123,12 +127,39 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) } spin_unlock(&root->d_lock); fc->root = root; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + if (ctx->clone_data.sb) { + if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { + error = -ESTALE; + goto error_splat_root; + } + /* clone lsm security options from the parent to the new sb */ + error = security_sb_clone_mnt_opts(ctx->clone_data.sb, + s, kflags, &kflags_out); + } else { + error = security_sb_set_mnt_opts(s, fc->security, + kflags, &kflags_out); + } + if (error) + goto error_splat_root; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + + nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label); error = 0; +out_label: + nfs4_label_free(fsinfo.fattr->label); out_fattr: nfs_free_fattr(fsinfo.fattr); out_name: kfree(name); out: return error; +error_splat_root: + dput(fc->root); + fc->root = NULL; + goto out_label; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 11bf15800ac9..b9d0921cb4fe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -62,7 +62,6 @@ /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; -static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; @@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime); * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ -static void nfs_invalidate_inode(struct inode *inode) +static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); + trace_nfs_set_inode_stale(inode); +} + +void nfs_set_inode_stale(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_inode_stale_locked(inode); + spin_unlock(&inode->i_lock); } struct nfs_find_desc { @@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct file *filp) { struct nfs_open_context *ctx; - const struct cred *cred = get_current_cred(); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - put_cred(cred); + if (!ctx) return ERR_PTR(-ENOMEM); - } nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); - ctx->cred = cred; + if (filp) + ctx->cred = get_cred(filp->f_cred); + else + ctx->cred = get_current_cred(); ctx->ll_cred = NULL; ctx->state = NULL; ctx->mode = f_mode; @@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = 0; break; case -ESTALE: - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto err_out; } @@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) 
*/ - nfs_invalidate_inode(inode); + nfs_set_inode_stale_locked(inode); return -ESTALE; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f80c47d5ff27..1f32a9fbfdaf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) -{ - WARN_ON_ONCE(desc->pg_mirror_count < 1); - return desc->pg_mirror_count > 1; -} - static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); - +extern int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data); /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); @@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 +static inline void +pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets) +{ + unsigned int i; + + for (i = 0; i < nbuckets; i++) + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { - int i; + struct pnfs_commit_array *array; - for (i = 0; i < cinfo->nbuckets; i++) - cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + rcu_read_lock(); + list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) + pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, + array->nbuckets); + rcu_read_unlock(); } #else static inline @@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, return memcmp(v1->data, v2->data, sizeof(v1->data)); } +static inline bool +nfs_write_match_verf(const struct nfs_writeverf *verf, + struct nfs_page *req) +{ + return verf->committed > NFS_UNSTABLE && + !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f3ece8ed3203..6b063227e34e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); struct nfs_client *client = server->nfs_client; + int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; if (IS_ROOT(path->dentry)) @@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - if (nfs_mountpoint_expiry_timeout < 0) + mntget(mnt); /* prevent immediate expiration */ + if (timeout <= 0) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + schedule_delayed_work(&nfs_automount_task, timeout); out_fc: put_fs_context(fc); @@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = { static void nfs_expire_automounts(struct work_struct *work) { struct list_head *list = &nfs_automount_list; + int timeout = 
READ_ONCE(nfs_mountpoint_expiry_timeout); mark_mounts_for_expiry(list); - if (!list_empty(list)) - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + if (!list_empty(list) && timeout > 0) + schedule_delayed_work(&nfs_automount_task, timeout); } void nfs_release_automount_timer(void) @@ -247,10 +249,7 @@ void nfs_release_automount_timer(void) /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry: parent directory - * @fh: filehandle for new root dentry - * @fattr: attributes for new root inode - * @authflavor: security flavor to use when performing the mount + * @fc: pointer to struct nfs_fs_context * */ int nfs_do_submount(struct fs_context *fc) @@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) return nfs_do_submount(fc); } EXPORT_SYMBOL_GPL(nfs_submount); + +static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) +{ + long num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtol(val, 0, &num); + if (ret) + return -EINVAL; + if (num > 0) { + if (num >= INT_MAX / HZ) + num = INT_MAX; + else + num *= HZ; + *((int *)kp->arg) = num; + if (!list_empty(&nfs_automount_list)) + mod_delayed_work(system_wq, &nfs_automount_task, num); + } else { + *((int *)kp->arg) = -1*HZ; + cancel_delayed_work(&nfs_automount_task); + } + return 0; +} + +static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) +{ + long num = *((int *)kp->arg); + + if (num > 0) { + if (num >= INT_MAX - (HZ - 1)) + num = INT_MAX / HZ; + else + num = (num + (HZ - 1)) / HZ; + } else + num = -1; + return scnprintf(buffer, PAGE_SIZE, "%li\n", num); +} + +static const struct kernel_param_ops param_ops_nfs_timeout = { + .set = param_set_nfs_timeout, + .get = param_get_nfs_timeout, +}; +#define param_check_nfs_timeout(name, p) __param_check(name, p, int); + +module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644); +MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout, + "Set the NFS automounted mountpoint timeout value (seconds)." + "Values <= 0 turn expiration off."); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8be1ba7c62bb..2b7f6dcd2eb8 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -42,7 +42,9 @@ enum nfs4_client_state { NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, NFS4CLNT_RUN_MANAGER, - NFS4CLNT_DELEGRETURN_RUNNING, + NFS4CLNT_RECALL_RUNNING, + NFS4CLNT_RECALL_ANY_LAYOUT_READ, + NFS4CLNT_RECALL_ANY_LAYOUT_RW, }; #define NFS4_RENEW_TIMEOUT 0x01 diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1297919e0fce..8e5d6223ddd3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, if (remap_flags & ~REMAP_FILE_ADVISORY) return -EINVAL; + if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode)) + return -ETXTBSY; + /* check alignment w.r.t. 
clone_blksize */ ret = -EINVAL; if (bs) { diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 84026e7b8a5f..a3ab6e219061 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry: parent directory + * @fc: pointer to struct nfs_fs_context * @locations: array of NFSv4 server location information * */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 69b7ab7a5815..512afb1c7867 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; @@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, .callback_ops = &nfs4_open_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; @@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state) return NFS_OK; } + spin_lock(&delegation->lock); nfs4_stateid_copy(&stateid, &delegation->stateid); if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + spin_unlock(&delegation->lock); rcu_read_unlock(); return NFS_OK; } if (delegation->cred) cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); @@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) .rpc_message = &msg, .callback_ops = &nfs4_close_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status = -ENOMEM; @@ -4002,7 +4005,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, { int error; struct nfs_fattr *fattr = info->fattr; - struct nfs4_label *label = NULL; + struct nfs4_label *label = fattr->label; error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -4010,23 +4013,17 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, return error; } - label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(label)) - return PTR_ERR(label); - error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); - goto err_free_label; + goto out; } if (fattr->valid & NFS_ATTR_FATTR_FSID && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); -err_free_label: - nfs4_label_free(label); - +out: return error; } @@ -5550,7 +5547,7 @@ unwind: struct nfs4_cached_acl { int cached; size_t len; - char data[0]; + char data[]; }; static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) @@ -6259,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) /* Fallthrough */ case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: + case -ETIMEDOUT: task->tk_status = 0; break; case -NFS4ERR_OLD_STATEID: @@ -6349,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = 
RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT, }; int status = 0; @@ -6932,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; @@ -9176,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; struct pnfs_layout_segment *lseg = NULL; struct nfs4_exception exception = { @@ -9293,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata) lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); + put_cred(lrp->cred); kfree(calldata); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f7723d221945..ac93715c05a4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) } return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ + int iomode = 0; + + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state)) + iomode += IOMODE_READ; + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state)) + iomode += IOMODE_RW; + /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */ + if (iomode) { + pnfs_layout_return_unused_byclid(clp, iomode); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } +} #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } @@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) { return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static void nfs4_state_manager(struct nfs_client *clp) @@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); - if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) { + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); } - clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state); + nfs4_layoutreturn_any_run(clp); + clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); } /* Did we race with an attempt to give us more work? 
*/ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 1e97e5e04cb4..543541173a3d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); -TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW); #define show_nfs4_clp_state(state) \ __print_flags(state, "|", \ @@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ - { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" }) + { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \ + { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \ + { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" }) TRACE_EVENT(nfs4_state_mgr, TP_PROTO( diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index effaa4247b91..8d3278805602 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -88,7 +88,7 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ -#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" +#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096" /* Parameters passed from the kernel command line */ static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a9588d19a5ae..7e7a97ae21ed 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, int error \ ), \ TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale); DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 20b3717cd7ca..f61f96603df7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops; struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { - return nfs_pgio_has_mirroring(desc) ? - &desc->pg_mirrors[desc->pg_mirror_idx] : - &desc->pg_mirrors[0]; + return &desc->pg_mirrors[desc->pg_mirror_idx]; } EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); @@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* - * nfs_page_group_lock - lock the head of the page group - * @req - request in group that is to be locked + * nfs_page_lock_head_request - page lock the head of the page group + * @req: any member of the page group + */ +struct nfs_page * +nfs_page_group_lock_head(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + while (!nfs_lock_request(head)) { + int ret = nfs_wait_on_request(head); + if (ret < 0) + return ERR_PTR(ret); + } + if (head != req) + kref_get(&head->wb_kref); + return head; +} + +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock * - * this lock must be held when traversing or modifying the page - * group list + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. 
+ */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } +} + +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock * - * return 0 on success, < 0 on error + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. */ -int -nfs_page_group_lock(struct nfs_page *req) +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) { - struct nfs_page *head = req->wb_head; + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + +/* + * nfs_page_group_lock_subrequests - try to lock the subrequests + * @head: head request of page group + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request locked. + */ +int nfs_page_group_lock_subrequests(struct nfs_page *head) +{ + struct nfs_page *subreq; + int ret; - WARN_ON_ONCE(head != head->wb_head); + ret = nfs_page_group_lock(head); + if (ret < 0) + return ret; + /* lock each request in the page group */ + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + return ret; + } + nfs_page_group_unlock(head); + return 0; +} - if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) +/* + * nfs_page_set_headlock - set the request PG_HEADLOCK + * @req: request that is to be locked + * + * this lock must be held when modifying req->wb_head + * + * return 0 on success, < 0 on error + */ +int +nfs_page_set_headlock(struct nfs_page *req) +{ + if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) return 0; - set_bit(PG_CONTENDED1, &head->wb_flags); + set_bit(PG_CONTENDED1, &req->wb_flags); smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); } /* - * nfs_page_group_unlock - unlock the head of the page group - * @req - request in group that is to be unlocked + * nfs_page_clear_headlock - clear the request PG_HEADLOCK + * @req: request that is to be locked */ void -nfs_page_group_unlock(struct nfs_page *req) +nfs_page_clear_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - smp_mb__before_atomic(); - clear_bit(PG_HEADLOCK, &head->wb_flags); + clear_bit(PG_HEADLOCK, &req->wb_flags); smp_mb__after_atomic(); - if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + if (!test_bit(PG_CONTENDED1, &req->wb_flags)) return; - wake_up_bit(&head->wb_flags, PG_HEADLOCK); + wake_up_bit(&req->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req: request in group that is to be locked + * + * this lock must be held when traversing or modifying the page + * group list + * + * return 0 on success, < 0 on error + */ +int +nfs_page_group_lock(struct 
nfs_page *req) +{ + int ret; + + ret = nfs_page_set_headlock(req); + if (ret || req->wb_head == req) + return ret; + return nfs_page_set_headlock(req->wb_head); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req: request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + if (req != req->wb_head) + nfs_page_clear_headlock(req->wb_head); + nfs_page_clear_headlock(req); } /* @@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, } static struct nfs_page * -nfs_create_subreq(struct nfs_page *req, struct nfs_page *last, - unsigned int pgbase, unsigned int offset, +nfs_create_subreq(struct nfs_page *req, + unsigned int pgbase, + unsigned int offset, unsigned int count) { + struct nfs_page *last; struct nfs_page *ret; ret = __nfs_create_request(req->wb_lock_context, req->wb_page, pgbase, offset, count); if (!IS_ERR(ret)) { + /* find the last request */ + for (last = req->wb_head; + last->wb_this_page != req->wb_head; + last = last->wb_this_page) + ; + nfs_lock_request(ret); ret->wb_index = req->wb_index; nfs_page_group_init(ret, last); @@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .callback_ops = call_ops, .callback_data = hdr, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, }; - int ret = 0; hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); @@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, (unsigned long long)hdr->args.offset); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } + if (IS_ERR(task)) + return PTR_ERR(task); rpc_put_task(task); -out: - return ret; + return 0; } EXPORT_SYMBOL_GPL(nfs_initiate_pgio); @@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, pgio->pg_mirror_count = mirror_count; } -/* - * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) - */ -void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) -{ - pgio->pg_mirror_count = 1; - pgio->pg_mirror_idx = 0; -} - static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) { pgio->pg_mirror_count = 1; @@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, } /** - * nfs_can_coalesce_requests - test two requests for compatibility + * nfs_coalesce_size - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page * @pgio: pointer to nfs_pagio_descriptor @@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, * page data area they describe is contiguous, and that their RPC * credentials, NFSv4 open state, and lockowners are the same. * - * Return 'true' if this is the case, else return 'false'. 
+ * Returns size of the request that can be coalesced */ -static bool nfs_can_coalesce_requests(struct nfs_page *prev, +static unsigned int nfs_coalesce_size(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - size_t size; struct file_lock_context *flctx; if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) - return false; + return 0; flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; + return 0; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; + return 0; if (req->wb_page == prev->wb_page) { if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) - return false; + return 0; } else { if (req->wb_pgbase != 0 || prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) - return false; + return 0; } } - size = pgio->pg_ops->pg_test(pgio, prev, req); - WARN_ON_ONCE(size > req->wb_bytes); - if (size && size < req->wb_bytes) - req->wb_bytes = size; - return size > 0; + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, * @desc: destination io descriptor * @req: request * - * Returns true if the request 'req' was successfully coalesced into the - * existing list of pages 'desc'. + * If the request 'req' was successfully coalesced into the existing list + * of pages 'desc', it returns the size of req. */ -static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, - struct nfs_page *req) +static unsigned int +nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *prev = NULL; + unsigned int size; if (mirror->pg_count != 0) { prev = nfs_list_entry(mirror->pg_list.prev); @@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, return 0; } - if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; + size = nfs_coalesce_size(prev, req, desc); + if (size < req->wb_bytes) + return size; nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; - return 1; + return req->wb_bytes; } /* @@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, * @req: request * * This may split a request into subrequests which are all part of the - * same page group. + * same page group. If so, it will submit @req as the last one, to ensure + * the pointer to @req is still valid in case of failure. * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. 
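The hunks around this point rework coalescing in fs/nfs/pagelist.c: nfs_can_coalesce_requests() becomes nfs_coalesce_size(), which reports how many bytes of @req can be merged rather than a plain true/false, and the __nfs_pageio_add_request() loop in the next hunk uses that byte count to peel subrequests off the front of @req. What follows is a minimal, self-contained sketch of that size-driven splitting pattern only; it is not kernel code, and every name in it (toy_req, toy_coalesce_size, the 4096-byte capacity) is a hypothetical stand-in for the real nfs_page / nfs_pageio_descriptor machinery.

/*
 * Toy illustration only -- not kernel code.  Shows how a size-returning
 * coalesce test lets the caller split a request into subrequests.
 */
#include <stdio.h>

struct toy_req {
	unsigned int offset;	/* byte offset of the (sub)request */
	unsigned int bytes;	/* bytes still to be submitted */
};

/* Hypothetical stand-in for a pg_test-style check: how many bytes fit now? */
static unsigned int toy_coalesce_size(const struct toy_req *req,
				      unsigned int space_left)
{
	return req->bytes < space_left ? req->bytes : space_left;
}

int main(void)
{
	struct toy_req req = { .offset = 0, .bytes = 10000 };
	unsigned int space_left = 4096;		/* pretend descriptor capacity */

	while (req.bytes != 0) {
		unsigned int size = toy_coalesce_size(&req, space_left);

		if (size == req.bytes) {
			/* the whole remaining request fits: submit it and stop */
			printf("submit %u bytes at offset %u\n", size, req.offset);
			break;
		}
		if (size == 0) {
			/* nothing fits: flush, then retry the same request */
			printf("flush descriptor\n");
			space_left = 4096;
			continue;
		}
		/* partial fit: peel off a subrequest and continue with the rest */
		printf("submit subrequest of %u bytes at offset %u\n",
		       size, req.offset);
		req.offset += size;
		req.bytes -= size;
		space_left -= size;
	}
	return 0;
}

The real loop in the following hunk additionally holds the page-group lock, creates genuine subrequests with nfs_create_subreq(), and flushes with nfs_pageio_doio() when nothing can be coalesced; the sketch keeps only the control flow driven by the returned size.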
@@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *subreq; - unsigned int bytes_left = 0; - unsigned int offset, pgbase; + unsigned int size, subreq_size; nfs_page_group_lock(req); subreq = req; - bytes_left = subreq->wb_bytes; - offset = subreq->wb_offset; - pgbase = subreq->wb_pgbase; - - do { - if (!nfs_pageio_do_add_request(desc, subreq)) { - /* make sure pg_test call(s) did nothing */ - WARN_ON_ONCE(subreq->wb_bytes != bytes_left); - WARN_ON_ONCE(subreq->wb_offset != offset); - WARN_ON_ONCE(subreq->wb_pgbase != pgbase); - + subreq_size = subreq->wb_bytes; + for(;;) { + size = nfs_pageio_do_add_request(desc, subreq); + if (size == subreq_size) { + /* We successfully submitted a request */ + if (subreq == req) + break; + req->wb_pgbase += size; + req->wb_bytes -= size; + req->wb_offset += size; + subreq_size = req->wb_bytes; + subreq = req; + continue; + } + if (WARN_ON_ONCE(subreq != req)) { + nfs_page_group_unlock(req); + nfs_pageio_cleanup_request(desc, subreq); + subreq = req; + subreq_size = req->wb_bytes; + nfs_page_group_lock(req); + } + if (!size) { + /* Can't coalesce any more, so do I/O */ nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0 || mirror->pg_recoalesce) - goto out_cleanup_subreq; + return 0; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; } - - /* check for buggy pg_test call(s) */ - WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); - WARN_ON_ONCE(subreq->wb_bytes > bytes_left); - WARN_ON_ONCE(subreq->wb_bytes == 0); - - bytes_left -= subreq->wb_bytes; - offset += subreq->wb_bytes; - pgbase += subreq->wb_bytes; - - if (bytes_left) { - subreq = nfs_create_subreq(req, subreq, pgbase, - offset, bytes_left); - if (IS_ERR(subreq)) - goto err_ptr; - } - } while (bytes_left > 0); + subreq = nfs_create_subreq(req, req->wb_pgbase, + req->wb_offset, size); + if (IS_ERR(subreq)) + goto err_ptr; + subreq_size = size; + } nfs_page_group_unlock(req); return 1; @@ -1095,10 +1199,6 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; -out_cleanup_subreq: - if (req != subreq) - nfs_pageio_cleanup_request(desc, subreq); - return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, { u32 midx; unsigned int pgbase, offset, bytes; - struct nfs_page *dupreq, *lastreq; + struct nfs_page *dupreq; pgbase = req->wb_pgbase; offset = req->wb_offset; @@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) goto out_failed; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - if (midx) { - nfs_page_group_lock(req); + /* Create the mirror instances first, and fire them off */ + for (midx = 1; midx < desc->pg_mirror_count; midx++) { + nfs_page_group_lock(req); - /* find the last request */ - for (lastreq = req->wb_head; - lastreq->wb_this_page != req->wb_head; - lastreq = lastreq->wb_this_page) - ; + dupreq = nfs_create_subreq(req, + pgbase, offset, bytes); - dupreq = nfs_create_subreq(req, lastreq, - pgbase, offset, bytes); - - nfs_page_group_unlock(req); - if (IS_ERR(dupreq)) { - desc->pg_error = PTR_ERR(dupreq); - goto out_failed; - } - } else - dupreq = req; + nfs_page_group_unlock(req); + if (IS_ERR(dupreq)) { + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; + } 
- if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = midx; + desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) goto out_cleanup_subreq; } + desc->pg_mirror_idx = 0; + if (!nfs_pageio_add_request_mirror(desc, req)) + goto out_failed; + return 1; out_cleanup_subreq: - if (req != dupreq) - nfs_pageio_cleanup_request(desc, dupreq); + nfs_pageio_cleanup_request(desc, dupreq); out_failed: nfs_pageio_error_cleanup(desc); return 0; @@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; u32 restore_idx = desc->pg_mirror_idx; - if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = mirror_idx; + desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); if (desc->pg_error < 0 || !mirror->pg_recoalesce) @@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) } } +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + nfs_pageio_complete(pgio); +} + int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 542ea8dfd1bc..f2dc35c22964 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) struct nfs_server *server = NFS_SERVER(lo->plh_inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - if (!list_empty(&lo->plh_layouts)) { + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); + list_del_rcu(&lo->plh_layouts); spin_unlock(&clp->cl_lock); } put_cred(lo->plh_lc_cred); @@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static struct inode * +pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = igrab(lo->plh_inode); + if (inode) + return inode; + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + return NULL; +} + static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) @@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); + INIT_LIST_HEAD(&lseg->pls_commits); refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; @@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, /* If the sb is being destroyed, just bail */ if (!nfs_sb_active(server->super)) break; - inode = igrab(lo->plh_inode); + inode = pnfs_grab_inode_layout_hdr(lo); if (inode != NULL) { - list_del_init(&lo->plh_layouts); + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) + list_del_rcu(&lo->plh_layouts); if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) continue; @@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, } else { rcu_read_unlock(); spin_unlock(&clp->cl_lock); - set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); } nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); @@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred) +{ + const struct cred *old; + + if (cred && 
cred_fscmp(lo->plh_lc_cred, cred) != 0) { + old = xchg(&lo->plh_lc_cred, get_cred(cred)); + put_cred(old); + } +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, - bool update_barrier) + const struct cred *cred, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; @@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { + pnfs_set_layout_cred(lo, cred); nfs4_stateid_copy(&lo->plh_stateid, new); lo->plh_barrier = newseq; pnfs_clear_layoutreturn_info(lo); @@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, lgp->args.ctx = get_nfs_open_context(ctx); nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; - lgp->cred = get_cred(ctx->cred); + lgp->cred = ctx->cred; return lgp; } @@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) nfs4_free_pages(lgp->args.layout.pages, max_pages); if (lgp->args.inode) pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); - put_cred(lgp->cred); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); - pnfs_set_layout_stateid(lo, stateid, true); + pnfs_set_layout_stateid(lo, stateid, NULL, true); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1122,6 +1144,7 @@ out_unlock: static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + const struct cred **cred, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ @@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { - if (stateid != NULL) { - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (lo->plh_return_seq != 0) - stateid->seqid = cpu_to_be32(lo->plh_return_seq); - } + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); if (iomode != NULL) *iomode = lo->plh_return_iomode; pnfs_clear_layoutreturn_info(lo); return true; } - if (stateid != NULL) - nfs4_stateid_copy(stateid, &lo->plh_stateid); + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); if (iomode != NULL) *iomode = IOMODE_ANY; return true; @@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, - enum pnfs_iomode iomode, bool sync) +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + const struct cred **pcred, + enum pnfs_iomode iomode, + bool sync) { struct inode *ino = lo->plh_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; + const struct cred *cred = *pcred; int status = 0; + *pcred = NULL; lrp = kzalloc(sizeof(*lrp), GFP_NOFS); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&ino->i_lock); + put_cred(cred); pnfs_put_layout_hdr(lo); goto out; } @@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, 
pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; - lrp->cred = lo->plh_lc_cred; + lrp->cred = cred; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(&lrp->args); @@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) return; spin_lock(&inode->i_lock); if (pnfs_layout_need_return(lo)) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; bool send; - send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino) struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); + const struct cred *cred; nfs4_stateid stateid; int status = 0; bool send, valid_layout; @@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) - status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino, struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *next; + const struct cred *lc_cred; nfs4_stateid stateid; enum pnfs_iomode iomode = 0; bool layoutreturn = false, roc = false; @@ -1423,16 +1454,20 @@ retry: * 2. we don't send layoutreturn */ /* lo ref dropped in pnfs_roc_release() */ - layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0) + if (!layoutreturn) goto out_noroc; + if (cred_fscmp(cred, lc_cred) != 0) + goto out_noroc_put_cred; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; +out_noroc_put_cred: + put_cred(lc_cred); out_noroc: spin_unlock(&ino->i_lock); rcu_read_unlock(); @@ -1445,7 +1480,7 @@ out_noroc: return true; } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true); pnfs_put_layout_hdr(lo); return false; } @@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) static void _add_to_server_list(struct pnfs_layout_hdr *lo, struct nfs_server *server) { - if (list_empty(&lo->plh_layouts)) { + if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. 
*/ spin_lock(&clp->cl_lock); - if (list_empty(&lo->plh_layouts)) - list_add_tail(&lo->plh_layouts, &server->layouts); + list_add_tail_rcu(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } } @@ -2323,14 +2357,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) if (!pnfs_layout_is_valid(lo)) { /* We have a completely new layout */ - pnfs_set_layout_stateid(lo, &res->stateid, true); + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; } - pnfs_set_layout_stateid(lo, &res->stateid, false); + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); } else { /* * We got an entirely new state ID. Mark all segments for the @@ -2423,43 +2457,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, return -ENOENT; } -void pnfs_error_mark_layout_for_return(struct inode *inode, - struct pnfs_layout_segment *lseg) +static void +pnfs_mark_layout_for_return(struct inode *inode, + const struct pnfs_layout_range *range) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - struct pnfs_layout_range range = { - .iomode = lseg->pls_range.iomode, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + struct pnfs_layout_hdr *lo; bool return_now = false; spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; if (!pnfs_layout_is_valid(lo)) { spin_unlock(&inode->i_lock); return; } - pnfs_set_plh_return_info(lo, range.iomode, 0); + pnfs_set_plh_return_info(lo, range->iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. 
*/ - if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) { + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; - return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (return_now) - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); } else { spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } } + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + pnfs_mark_layout_for_return(inode, &range); +} EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +static bool +pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo) +{ + return pnfs_layout_is_valid(lo) && + !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); +} + +static struct pnfs_layout_segment * +pnfs_find_first_lseg(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY) + continue; + if (pnfs_lseg_range_intersecting(&lseg->pls_range, range)) + return lseg; + } + return NULL; +} + +/* Find open file states whose mode matches that of the range */ +static bool +pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range) +{ + struct list_head *head; + struct nfs_open_context *ctx; + fmode_t mode = 0; + + if (!pnfs_layout_can_be_returned(lo) || + !pnfs_find_first_lseg(lo, range, range->iomode)) + return false; + + head = &NFS_I(lo->plh_inode)->open_files; + list_for_each_entry_rcu(ctx, head, list) { + if (ctx->state) + mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE); + } + + switch (range->iomode) { + default: + break; + case IOMODE_READ: + mode &= ~FMODE_WRITE; + break; + case IOMODE_RW: + if (pnfs_find_first_lseg(lo, range, IOMODE_READ)) + mode &= ~FMODE_READ; + } + return mode == 0; +} + +static int +pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +{ + const struct pnfs_layout_range *range = data; + struct pnfs_layout_hdr *lo; + struct inode *inode; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + if (!pnfs_layout_can_be_returned(lo) || + test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + continue; + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + if (!pnfs_should_return_unused_layout(lo, range)) { + spin_unlock(&inode->i_lock); + continue; + } + spin_unlock(&inode->i_lock); + inode = pnfs_grab_inode_layout_hdr(lo); + if (!inode) + continue; + rcu_read_unlock(); + pnfs_mark_layout_for_return(inode, range); + iput(inode); + cond_resched(); + goto restart; + } + rcu_read_unlock(); + return 0; +} + +void +pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + nfs_client_for_each_server(clp, 
pnfs_layout_return_unused_byserver, + &range); +} + void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) { @@ -2475,7 +2625,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); * Check for any intersection between the request and the pgio->pg_lseg, * and if none, put this pgio->pg_lseg away. */ -static void +void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { @@ -2483,6 +2633,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page pgio->pg_lseg = NULL; } } +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -3000,10 +3151,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) end_pos = nfsi->layout->plh_lwb; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + data->cred = get_cred(nfsi->layout->plh_lc_cred); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->cred = get_cred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 0fafdadc9c8d..8e0ada581b92 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -66,6 +66,7 @@ struct nfs4_pnfs_ds { struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; + struct list_head pls_commits; struct pnfs_layout_range pls_range; refcount_t pls_refcount; u32 pls_seq; @@ -105,6 +106,7 @@ enum { NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ + NFS_LAYOUT_HASHED, /* The layout visible */ }; enum layoutdriver_policy_flags { @@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type { const struct nfs_pageio_ops *pg_write_ops; struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); - void (*mark_request_commit) (struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - u32 ds_commit_idx); - void (*clear_request_commit) (struct nfs_page *req, - struct nfs_commit_info *cinfo); - int (*scan_commit_lists) (struct nfs_commit_info *cinfo, - int max); - void (*recover_commit_reqs) (struct list_head *list, - struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); - int (*commit_pagelist)(struct inode *inode, - struct list_head *mds_pages, - int how, - struct nfs_commit_info *cinfo); int (*sync)(struct inode *inode, bool datasync); @@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type { int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; +struct pnfs_commit_ops { + void (*setup_ds_info)(struct pnfs_ds_commit_info *, + struct pnfs_layout_segment *); + void (*release_ds_info)(struct pnfs_ds_commit_info *, + struct inode *inode); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + struct nfs_page * 
(*search_commit_reqs)(struct nfs_commit_info *cinfo, + struct page *page); +}; + struct pnfs_layout_hdr { refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ @@ -203,6 +212,7 @@ struct pnfs_layout_hdr { loff_t plh_lwb; /* last write byte for layoutcommit */ const struct cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; + struct rcu_head plh_rcu; }; struct pnfs_device { @@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); +void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + const struct cred *cred, bool update_barrier); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); +void pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode); + /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); /* pnfs_nfs.c */ +struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags); +void pnfs_free_commit_array(struct pnfs_commit_array *p); +struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *, + struct pnfs_commit_array *, + struct pnfs_layout_segment *); + +void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg); +void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo); + void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo); void pnfs_generic_commit_release(void *calldata); @@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); +struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, + struct page *page); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -438,9 +465,11 @@ static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) { - if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0) return PNFS_NOT_ATTEMPTED; - return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); + return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo); } static inline struct pnfs_ds_commit_info * @@ -454,6 
+483,28 @@ pnfs_get_ds_info(struct inode *inode) } static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode); + if (inode_cinfo != NULL) + fl_cinfo->ops = inode_cinfo->ops; +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ + INIT_LIST_HEAD(&fl_cinfo->commits); + fl_cinfo->ops = NULL; +} + +static inline void +pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL) + fl_cinfo->ops->release_ds_info(fl_cinfo, inode); +} + +static inline void pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) { set_bit(NFS_DEVICEID_INVALID, &node->flags); @@ -463,24 +514,22 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (lseg == NULL || ld->mark_request_commit == NULL) + if (!lseg || !fl_cinfo->ops->mark_request_commit) return false; - ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); + fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx); return true; } static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->clear_request_commit == NULL) + if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit) return false; - ld->clear_request_commit(req, cinfo); + fl_cinfo->ops->clear_request_commit(req, cinfo); return true; } @@ -488,21 +537,31 @@ static inline int pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, int max) { - if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (!fl_cinfo || fl_cinfo->nwritten == 0) return 0; - else - return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); + return fl_cinfo->ops->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo && fl_cinfo->nwritten != 0) + fl_cinfo->ops->recover_commit_reqs(head, cinfo); } static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->search_commit_reqs == NULL) + if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) return NULL; - return ld->search_commit_reqs(cinfo, page); + return fl_cinfo->ops->search_commit_reqs(cinfo, page); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode) return NULL; } +static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ +} + +static inline void +pnfs_release_ds_info(struct 
pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) @@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return 0; } +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ +} + static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 8b37e7f8e789..25f135572fc8 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata) } EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); +static struct pnfs_layout_segment * +pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket) +{ + if (list_empty(&bucket->committing) && list_empty(&bucket->written)) { + struct pnfs_layout_segment *freeme = bucket->lseg; + bucket->lseg = NULL; + return freeme; + } + return NULL; +} + /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. * Note this must be called holding nfsi->commit_mutex @@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req, bucket = list_first_entry(&req->wb_list, struct pnfs_commit_bucket, written); - freeme = bucket->wlseg; - bucket->wlseg = NULL; + freeme = pnfs_free_bucket_lseg(bucket); } out: nfs_request_remove_commit_list(req, cinfo); @@ -87,10 +97,154 @@ out: } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); +struct pnfs_commit_array * +pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags) +{ + struct pnfs_commit_array *p; + struct pnfs_commit_bucket *b; + + p = kmalloc(struct_size(p, buckets, n), gfp_flags); + if (!p) + return NULL; + p->nbuckets = n; + INIT_LIST_HEAD(&p->cinfo_list); + INIT_LIST_HEAD(&p->lseg_list); + p->lseg = NULL; + for (b = &p->buckets[0]; n != 0; b++, n--) { + INIT_LIST_HEAD(&b->written); + INIT_LIST_HEAD(&b->committing); + b->lseg = NULL; + b->direct_verf.committed = NFS_INVALID_STABLE_HOW; + } + return p; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array); + +void +pnfs_free_commit_array(struct pnfs_commit_array *p) +{ + kfree_rcu(p, rcu); +} +EXPORT_SYMBOL_GPL(pnfs_free_commit_array); + +static struct pnfs_commit_array * +pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (array->lseg == lseg) + return array; + } + return NULL; +} + +struct pnfs_commit_array * +pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_commit_array *new, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (array) + return array; + new->lseg = lseg; + refcount_set(&new->refcount, 1); + list_add_rcu(&new->cinfo_list, &fl_cinfo->commits); + list_add(&new->lseg_list, &lseg->pls_commits); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_commit_array); + +static struct pnfs_commit_array * +pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (!array) { + rcu_read_unlock(); + fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg); + 
rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + } + rcu_read_unlock(); + return array; +} + +static void +pnfs_release_commit_array_locked(struct pnfs_commit_array *array) +{ + list_del_rcu(&array->cinfo_list); + list_del(&array->lseg_list); + pnfs_free_commit_array(array); +} + +static void +pnfs_put_commit_array_locked(struct pnfs_commit_array *array) +{ + if (refcount_dec_and_test(&array->refcount)) + pnfs_release_commit_array_locked(array); +} + +static void +pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode) +{ + if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) { + pnfs_release_commit_array_locked(array); + spin_unlock(&inode->i_lock); + } +} + +static struct pnfs_commit_array * +pnfs_get_commit_array(struct pnfs_commit_array *array) +{ + if (refcount_inc_not_zero(&array->refcount)) + return array; + return NULL; +} + +static void +pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array) +{ + array->lseg = NULL; + list_del_init(&array->lseg_list); + pnfs_put_commit_array_locked(array); +} + +void +pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg); + +void +pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy); + +/* + * Locks the nfs_page requests for commit and moves them to + * @bucket->committing. + */ static int -pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, - struct nfs_commit_info *cinfo, - int max) +pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) { struct list_head *src = &bucket->written; struct list_head *dst = &bucket->committing; @@ -101,158 +255,253 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - if (bucket->clseg == NULL) - bucket->clseg = pnfs_get_lseg(bucket->wlseg); - if (list_empty(src)) { - pnfs_put_lseg(bucket->wlseg); - bucket->wlseg = NULL; - } } return ret; } +static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + int max) +{ + unsigned int i; + int rv = 0, cnt; + + for (i = 0; i < nbuckets && max != 0; i++) { + cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max); + rv += cnt; + max -= cnt; + } + return rv; +} + /* Move reqs from written to committing lists, returning count * of number moved. 
*/ -int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, - int max) +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max) { - int i, rv = 0, cnt; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + int rv = 0, cnt; - lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); - for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { - cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], - cinfo, max); - max -= cnt; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + cnt = pnfs_bucket_scan_array(cinfo, array->buckets, + array->nbuckets, max); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); rv += cnt; + max -= cnt; + if (!max) + break; } + rcu_read_unlock(); return rv; } EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); -/* Pull everything off the committing lists and dump into @dst. */ -void pnfs_generic_recover_commit_reqs(struct list_head *dst, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_bucket_recover_commit_reqs(struct list_head *dst, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; struct pnfs_layout_segment *freeme; - int nwritten; - int i; + unsigned int nwritten, ret = 0; + unsigned int i; - lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + for (i = 0, b = buckets; i < nbuckets; i++, b++) { nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; - cinfo->ds->nwritten -= nwritten; - if (list_empty(&b->written)) { - freeme = b->wlseg; - b->wlseg = NULL; + ret += nwritten; + freeme = pnfs_free_bucket_lseg(b); + if (freeme) { pnfs_put_lseg(freeme); goto restart; } } + return ret; +} + +/* Pull everything off the committing lists and dump into @dst. 
*/ +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + unsigned int nwritten; + + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + nwritten = pnfs_bucket_recover_commit_reqs(dst, + array->buckets, + array->nbuckets, + cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); + fl_cinfo->nwritten -= nwritten; + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); -static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) +static struct nfs_page * +pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, struct page *page) +{ + struct nfs_page *req; + struct pnfs_commit_bucket *b; + unsigned int i; + + /* Linearly search the commit lists for each bucket until a matching + * request is found */ + for (i = 0, b = buckets; i < nbuckets; i++, b++) { + list_for_each_entry(req, &b->written, wb_list) { + if (req->wb_page == page) + return req->wb_head; + } + list_for_each_entry(req, &b->committing, wb_list) { + if (req->wb_page == page) + return req->wb_head; + } + } + return NULL; +} + +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request + * for @page + * @cinfo - commit info for current inode + * @page - page to search for matching head request + * + * Returns the head request if one is found, otherwise returns NULL. + */ +struct nfs_page * +pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + struct nfs_page *req; + + list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { + req = pnfs_bucket_search_commit_reqs(array->buckets, + array->nbuckets, page); + if (req) + return req; + } + return NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs); + +static struct pnfs_layout_segment * +pnfs_bucket_get_committing(struct list_head *head, + struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct list_head *pos; + + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; + list_splice_init(&bucket->committing, head); + return pnfs_free_bucket_lseg(bucket); +} + +static struct nfs_commit_data * +pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct nfs_commit_data *data = nfs_commitdata_alloc(false); + + if (!data) + return NULL; + data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo); + if (!data->lseg) + data->lseg = pnfs_get_lseg(bucket->lseg); + return data; +} + +static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo, + unsigned int idx) +{ struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; - struct list_head *pos; LIST_HEAD(pages); - int i; - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - for (i = idx; i < fl_cinfo->nbuckets; i++) { - bucket = &fl_cinfo->buckets[i]; + for (bucket = buckets; idx < nbuckets; bucket++, idx++) { if (list_empty(&bucket->committing)) continue; - freeme = bucket->clseg; - bucket->clseg = NULL; - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, 
&pages); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - nfs_retry_commit(&pages, freeme, cinfo, i); + nfs_retry_commit(&pages, freeme, cinfo, idx); pnfs_put_lseg(freeme); - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); } - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static unsigned int -pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, - struct list_head *list) +pnfs_bucket_alloc_ds_commits(struct list_head *list, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { - struct pnfs_ds_commit_info *fl_cinfo; struct pnfs_commit_bucket *bucket; struct nfs_commit_data *data; - int i; + unsigned int i; unsigned int nreq = 0; - fl_cinfo = cinfo->ds; - bucket = fl_cinfo->buckets; - for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(false); - if (!data) - break; - data->ds_commit_index = i; - list_add(&data->pages, list); - nreq++; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (!list_empty(&bucket->committing)) { + data = pnfs_bucket_fetch_commitdata(bucket, cinfo); + if (!data) + goto out_error; + data->ds_commit_index = i; + list_add_tail(&data->list, list); + atomic_inc(&cinfo->mds->rpcs_out); + nreq++; + } + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } - + return nreq; +out_error: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); /* Clean up on error */ - pnfs_generic_retry_commit(cinfo, i); + pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i); return nreq; } -static inline -void pnfs_fetch_commit_bucket_list(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_alloc_ds_commits_list(struct list_head *list, + struct pnfs_ds_commit_info *fl_cinfo, + struct nfs_commit_info *cinfo) { - struct pnfs_commit_bucket *bucket; - struct list_head *pos; - - bucket = &cinfo->ds->buckets[data->ds_commit_index]; - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, pages); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - -} + struct pnfs_commit_array *array; + unsigned int ret = 0; -/* Helper function for pnfs_generic_commit_pagelist to catch an empty - * page list. This can happen when two commits race. - * - * This must be called instead of nfs_init_commit - call one or the other, but - * not both! 
- */ -static bool -pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) -{ - if (list_empty(pages)) { - if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) - wake_up_var(&cinfo->mds->rpcs_out); - /* don't call nfs_commitdata_release - it tries to put - * the open_context which is not acquired until nfs_init_commit - * which has not been called on @data */ - WARN_ON_ONCE(data->context); - nfs_commit_free(data); - return true; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + ret += pnfs_bucket_alloc_ds_commits(list, array->buckets, + array->nbuckets, cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); } - - return false; + return ret; } /* This follows nfs_commit_list pretty closely */ @@ -262,6 +511,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int (*initiate_commit)(struct nfs_commit_data *data, int how)) { + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct nfs_commit_data *data, *tmp; LIST_HEAD(list); unsigned int nreq = 0; @@ -269,40 +519,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(true); data->ds_commit_index = -1; - list_add(&data->pages, &list); + list_splice_init(mds_pages, &data->pages); + list_add_tail(&data->list, &list); + atomic_inc(&cinfo->mds->rpcs_out); nreq++; } - nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - + nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo); if (nreq == 0) goto out; - atomic_add(nreq, &cinfo->mds->rpcs_out); - - list_for_each_entry_safe(data, tmp, &list, pages) { - list_del_init(&data->pages); + list_for_each_entry_safe(data, tmp, &list, list) { + list_del(&data->list); if (data->ds_commit_index < 0) { - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, - data, cinfo)) - continue; - - nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_init_commit(data, NULL, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, 0); } else { - LIST_HEAD(pages); - - pnfs_fetch_commit_bucket_list(&pages, data, cinfo); - - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(&pages, - data, cinfo)) - continue; - - nfs_init_commit(data, &pages, data->lseg, cinfo); + nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); } } @@ -930,32 +1165,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx) { struct list_head *list; - struct pnfs_commit_bucket *buckets; + struct pnfs_commit_array *array; + struct pnfs_commit_bucket *bucket; mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - buckets = cinfo->ds->buckets; - list = &buckets[ds_commit_idx].written; - if (list_empty(list)) { - if (!pnfs_is_valid_lseg(lseg)) { - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - cinfo->completion_ops->resched_write(cinfo, req); - return; - } - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. 
It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * pnfs_common_clear_request_commit - */ - WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); - buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); - } + array = pnfs_lookup_commit_array(cinfo->ds, lseg); + if (!array || !pnfs_is_valid_lseg(lseg)) + goto out_resched; + bucket = &array->buckets[ds_commit_idx]; + list = &bucket->written; + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + if (!bucket->lseg) + bucket->lseg = pnfs_get_lseg(lseg); set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); + return; +out_resched: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + cinfo->completion_ops->resched_write(cinfo, req); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 34bb9add2302..13b22e898116 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task, trace_nfs_readpage_done(task, hdr); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); nfs_mark_for_revalidate(inode); } return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dada09b391c6..59ef3b13ccca 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); +static int __nfs_list_for_each_server(struct list_head *head, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + struct nfs_server *server, *last = NULL; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, head, client_link) { + if (!nfs_sb_active(server->super)) + continue; + rcu_read_unlock(); + if (last) + nfs_sb_deactive(last->super); + last = server; + ret = fn(server, data); + if (ret) + goto out; + rcu_read_lock(); + } + rcu_read_unlock(); +out: + if (last) + nfs_sb_deactive(last->super); + return ret; +} + +int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data); +} +EXPORT_SYMBOL_GPL(nfs_client_for_each_server); + /* * Deliver file system statistics to userspace */ @@ -1179,7 +1214,6 @@ int nfs_get_tree_common(struct fs_context *fc) struct super_block *s; int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super; struct nfs_server *server = ctx->server; - unsigned long kflags = 0, kflags_out = 0; int error; ctx->server = NULL; @@ -1239,26 +1273,6 @@ int nfs_get_tree_common(struct fs_context *fc) goto error_splat_super; } - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) - kflags |= SECURITY_LSM_NATIVE_LABELS; - if (ctx->clone_data.sb) { - if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { - error = -ESTALE; - goto error_splat_root; - } - /* clone any lsm security options from the parent to the new sb */ - error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags, - &kflags_out); - } else { - error = security_sb_set_mnt_opts(s, fc->security, - kflags, &kflags_out); - } - if (error) - goto error_splat_root; - if 
(NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && - !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; - s->s_flags |= SB_ACTIVE; error = 0; @@ -1268,10 +1282,6 @@ out: out_err_nosb: nfs_free_server(server); goto out; - -error_splat_root: - dput(fc->root); - fc->root = NULL; error_splat_super: deactivate_locked_super(s); goto out; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 0effeee28352..b27ebdccef70 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) .callback_ops = &nfs_unlink_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); @@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .callback_ops = &nfs_rename_ops, .workqueue = nfsiod_workqueue, .rpc_client = NFS_CLIENT(old_dir), - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; data = kzalloc(sizeof(*data), GFP_KERNEL); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c478b772cc49..df4b87c30ac9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + static struct nfs_page * nfs_page_private_request(struct page *page) { @@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page) return req; } +static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req, *head; + int ret; + + for (;;) { + req = nfs_page_find_head_request(page); + if (!req) + return req; + head = nfs_page_group_lock_head(req); + if (head != req) + nfs_release_request(req); + if (IS_ERR(head)) + return head; + ret = nfs_cancel_remove_inode(head, inode); + if (ret < 0) { + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } + /* Ensure that nobody removed the request before we locked it */ + if (head == nfs_page_private_request(page)) + break; + if (PageSwapCache(page)) + break; + nfs_unlock_and_release_request(head); + } + return head; +} + /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { @@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) } /* - * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req - * - * this is a helper function for nfs_lock_and_join_requests - * - * @inode - inode associated with request page group, must be holding inode lock - * @head - head request of page group, must be holding head lock - * @req - request that couldn't lock and needs to wait on the req bit lock - * - * NOTE: this must be called holding page_group bit lock 
- * which will be released before returning. - * - * returns 0 on success, < 0 on error. - */ -static void -nfs_unroll_locks(struct inode *inode, struct nfs_page *head, - struct nfs_page *req) -{ - struct nfs_page *tmp; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { - if (!kref_read(&tmp->wb_kref)) - continue; - nfs_unlock_and_release_request(tmp); - } -} - -/* * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests * * @destroy_list - request list (using wb_this_page) terminated by @old_head @@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, destroy_list = (subreq->wb_this_page == old_head) ? NULL : subreq->wb_this_page; + /* Note: lock subreq in order to change subreq->wb_head */ + nfs_page_set_headlock(subreq); WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ subreq->wb_this_page = subreq; + subreq->wb_head = subreq; clear_bit(PG_REMOVE, &subreq->wb_flags); /* Note: races with nfs_page_group_destroy() */ if (!kref_read(&subreq->wb_kref)) { /* Check if we raced with nfs_page_group_destroy() */ - if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) { + nfs_page_clear_headlock(subreq); nfs_free_request(subreq); + } else + nfs_page_clear_headlock(subreq); continue; } + nfs_page_clear_headlock(subreq); - subreq->wb_head = subreq; + nfs_release_request(old_head); if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { nfs_release_request(subreq); @@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, } /* - * nfs_lock_and_join_requests - join all subreqs to the head req and return - * a locked reference, cancelling any pending - * operations for this page. - * - * @page - the page used to lookup the "page group" of nfs_page structures + * nfs_join_page_group - destroy subrequests of the head req + * @head: the page used to lookup the "page group" of nfs_page structures + * @inode: Inode to which the request belongs. * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations * and finally updating the head request to cover the whole range covered by * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. - * - * Returns a locked, referenced pointer to the head request - which after - * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an - * error was encountered. */ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page) +void +nfs_join_page_group(struct nfs_page *head, struct inode *inode) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *head, *subreq; + struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; - unsigned int total_bytes; - int ret; + unsigned int pgbase, off, bytes; -try_again: - /* - * A reference is taken only on the head request which acts as a - * reference to the whole page group - the group will not be destroyed - * until the head reference is released. 
- */ - head = nfs_page_find_head_request(page); - if (!head) - return NULL; - - /* lock the page head first in order to avoid an ABBA inefficiency */ - if (!nfs_lock_request(head)) { - ret = nfs_wait_on_request(head); - nfs_release_request(head); - if (ret < 0) - return ERR_PTR(ret); - goto try_again; - } - - /* Ensure that nobody removed the request before we locked it */ - if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { - nfs_unlock_and_release_request(head); - goto try_again; - } - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto release_request; - - /* lock each request in the page group */ - total_bytes = head->wb_bytes; + pgbase = head->wb_pgbase; + bytes = head->wb_bytes; + off = head->wb_offset; for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - - if (!kref_get_unless_zero(&subreq->wb_kref)) { - if (subreq->wb_offset == head->wb_offset + total_bytes) - total_bytes += subreq->wb_bytes; - continue; - } - - while (!nfs_lock_request(subreq)) { - /* - * Unlock page to allow nfs_page_group_sync_on_bit() - * to succeed - */ - nfs_page_group_unlock(head); - ret = nfs_wait_on_request(subreq); - if (!ret) - ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unroll_locks(inode, head, subreq); - nfs_release_request(subreq); - goto release_request; - } - } - /* - * Subrequests are always contiguous, non overlapping - * and in order - but may be repeated (mirrored writes). - */ - if (subreq->wb_offset == (head->wb_offset + total_bytes)) { - /* keep track of how many bytes this group covers */ - total_bytes += subreq->wb_bytes; - } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || - ((subreq->wb_offset + subreq->wb_bytes) > - (head->wb_offset + total_bytes)))) { - nfs_page_group_unlock(head); - nfs_unroll_locks(inode, head, subreq); - nfs_unlock_and_release_request(subreq); - ret = -EIO; - goto release_request; + /* Subrequests should always form a contiguous range */ + if (pgbase > subreq->wb_pgbase) { + off -= pgbase - subreq->wb_pgbase; + bytes += pgbase - subreq->wb_pgbase; + pgbase = subreq->wb_pgbase; } + bytes = max(subreq->wb_pgbase + subreq->wb_bytes + - pgbase, bytes); } + /* Set the head request's range to cover the former page group */ + head->wb_pgbase = pgbase; + head->wb_bytes = bytes; + head->wb_offset = off; + /* Now that all requests are locked, make sure they aren't on any list. * Commit list removal accounting is done after locks are dropped */ subreq = head; @@ -569,36 +541,52 @@ try_again: /* destroy list will be terminated by head */ destroy_list = head->wb_this_page; head->wb_this_page = head; - - /* change head request to cover whole range that - * the former page group covered */ - head->wb_bytes = total_bytes; } - /* Postpone destruction of this request */ - if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { - set_bit(PG_INODE_REF, &head->wb_flags); - kref_get(&head->wb_kref); - atomic_long_inc(&NFS_I(inode)->nrequests); - } + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); +} - nfs_page_group_unlock(head); +/* + * nfs_lock_and_join_requests - join all subreqs to the head req + * @page: the page used to lookup the "page group" of nfs_page structures + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. 
All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *head; + int ret; - nfs_destroy_unlinked_subrequests(destroy_list, head, inode); + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_find_and_lock_page_request(page); + if (IS_ERR_OR_NULL(head)) + return head; - /* Did we lose a race with nfs_inode_remove_request()? */ - if (!(PagePrivate(page) || PageSwapCache(page))) { + /* lock each request in the page group */ + ret = nfs_page_group_lock_subrequests(head); + if (ret < 0) { nfs_unlock_and_release_request(head); - return NULL; + return ERR_PTR(ret); } - /* still holds ref on head from nfs_page_find_head_request - * and still has lock on head from lock loop */ - return head; + nfs_join_page_group(head, inode); -release_request: - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + return head; } static void nfs_write_error(struct nfs_page *req, int error) @@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, .priority = priority, }; /* Set up the initial task struct. */ @@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo) { - struct nfs_page *first = nfs_list_entry(head->next); - struct nfs_open_context *ctx = nfs_req_openctx(first); - struct inode *inode = d_inode(ctx->dentry); + struct nfs_page *first; + struct nfs_open_context *ctx; + struct inode *inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ - list_splice_init(head, &data->pages); + if (head) + list_splice_init(head, &data->pages); + + first = nfs_list_entry(data->pages.next); + ctx = nfs_req_openctx(first); + inode = d_inode(ctx->dentry); data->inode = inode; data->cred = ctx->cred; @@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. 
*/ - if (verf->committed > NFS_UNSTABLE && - !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) { + if (nfs_write_match_verf(verf, req)) { /* We have a match */ if (req->wb_page) nfs_inode_remove_request(req); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f368f3215f88..99d2cae91bd6 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -136,7 +136,7 @@ config NFSD_FLEXFILELAYOUT config NFSD_V4_2_INTER_SSC bool "NFSv4.2 inter server to server COPY" - depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 + depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 && NFS_FS=y help This option enables support for NFSv4.2 inter server to server copy where the destination server calls the NFSv4.2 diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 15422c951fd1..cb777fe82988 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -23,6 +23,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -50,6 +51,11 @@ static void expkey_put(struct kref *ref) kfree_rcu(key, ek_rcu); } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -140,7 +146,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, NULL); + else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); @@ -150,7 +158,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, buf); + else err = -ENOMEM; path_put(&key.ek_path); } @@ -249,6 +259,7 @@ static const struct cache_detail svc_expkey_cache_template = { .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, + .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, @@ -330,6 +341,11 @@ static void svc_export_put(struct kref *ref) kfree_rcu(exp, ex_rcu); } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -643,15 +659,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) } expp = svc_export_lookup(&exp); - if (expp) - expp = svc_export_update(&exp, expp); - else - err = -ENOMEM; - cache_flush(); - if (expp == NULL) + if (!expp) { err = -ENOMEM; - else + goto out4; + } + expp = svc_export_update(&exp, expp); + if (expp) { + trace_nfsd_export_update(expp); + cache_flush(); exp_put(expp); + } else + err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); @@ -767,6 +785,7 @@ static const struct cache_detail svc_export_cache_template = { .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, @@ -832,8 +851,10 @@ exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); + } return ek; } @@ -855,8 +876,10 @@ 
exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); + } return exp; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 22e77ede9f14..82198d747c4c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -890,7 +890,7 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, unsigned char need = may_flags & NFSD_FILE_MAY_MASK; hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, - nf_node) { + nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) { if ((need & nf->nf_may) != need) continue; if (nf->nf_inode != inode) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 2baf32311e00..09aa545825bd 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -172,6 +172,8 @@ struct nfsd_net { unsigned int longest_chain_cachesize; struct shrinker nfsd_reply_cache_shrinker; + /* utsname taken from the the process that starts the server */ + char nfsd_name[UNX_MAXNODENAME+1]; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index d1f285245af8..9460be8a8321 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -122,6 +122,12 @@ idtoname_hash(struct ent *ent) return hash; } +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -184,6 +190,7 @@ static const struct cache_detail idtoname_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, + .cache_upcall = idtoname_upcall, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, @@ -295,6 +302,12 @@ nametoid_hash(struct ent *ent) return hash_str(ent->name, ENT_HASHBITS); } +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -347,6 +360,7 @@ static const struct cache_detail nametoid_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, + .cache_upcall = nametoid_upcall, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65cfe9ab47be..e32ecedece0f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -494,6 +494,8 @@ find_any_file(struct nfs4_file *f) { struct nfsd_file *ret; + if (!f) + return NULL; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); if (!ret) { @@ -1309,6 +1311,12 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop) nfs4_free_stateowner(sop); } +static bool +nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) +{ + return list_empty(&stp->st_perfile); +} + static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; @@ -1379,9 +1387,11 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + if (!unhash_ol_stateid(stp)) + return false; list_del_init(&stp->st_locks); nfs4_unhash_stid(&stp->st_stid); - return unhash_ol_stateid(stp); + return true; } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ 
-1446,13 +1456,12 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { - bool unhashed; - lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); - unhashed = unhash_ol_stateid(stp); + if (!unhash_ol_stateid(stp)) + return false; release_open_stateid_locks(stp, reaplist); - return unhashed; + return true; } static void release_open_stateid(struct nfs4_ol_stateid *stp) @@ -2636,7 +2645,7 @@ static const struct file_operations client_ctl_fops = { static const struct tree_descr client_files[] = { [0] = {"info", &client_info_fops, S_IRUSR}, [1] = {"states", &client_states_fops, S_IRUSR}, - [2] = {"ctl", &client_ctl_fops, S_IRUSR|S_IWUSR}, + [2] = {"ctl", &client_ctl_fops, S_IWUSR}, [3] = {""}, }; @@ -4343,7 +4352,8 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) { struct nfs4_file *fp; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { if (fh_match(&fp->fi_fhandle, fh)) { if (refcount_inc_not_zero(&fp->fi_ref)) return fp; @@ -5521,15 +5531,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; - /* Client debugging aid. */ - if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { - char addr_str[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, - sizeof(addr_str)); - pr_warn_ratelimited("NFSD: client %s testing state ID " - "with incorrect client ID\n", addr_str); + if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) return status; - } spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) @@ -6393,21 +6396,21 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, } static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +find_lock_stateid(const struct nfs4_lockowner *lo, + const struct nfs4_ol_stateid *ost) { struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - lockdep_assert_held(&clp->cl_lock); + lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_type != NFS4_LOCK_STID) - continue; - if (lst->st_stid.sc_file == fp) { - refcount_inc(&lst->st_stid.sc_count); - return lst; + /* If ost is not hashed, ost->st_locks will not be valid */ + if (!nfs4_ol_stateid_unhashed(ost)) + list_for_each_entry(lst, &ost->st_locks, st_locks) { + if (lst->st_stateowner == &lo->lo_owner) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } } - } return NULL; } @@ -6423,11 +6426,11 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&clp->cl_lock); - spin_lock(&fp->fi_lock); - retstp = find_lock_stateid(lo, fp); + if (nfs4_ol_stateid_unhashed(open_stp)) + goto out_close; + retstp = find_lock_stateid(lo, open_stp); if (retstp) - goto out_unlock; - + goto out_found; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); @@ -6436,22 +6439,26 @@ retry: stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; + spin_lock(&fp->fi_lock); list_add(&stp->st_locks, &open_stp->st_locks); 
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); -out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - if (retstp) { - if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { - nfs4_put_stid(&retstp->st_stid); - goto retry; - } - /* To keep mutex tracking happy */ - mutex_unlock(&stp->st_mutex); - stp = retstp; - } return stp; +out_found: + spin_unlock(&clp->cl_lock); + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + return retstp; +out_close: + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + return NULL; } static struct nfs4_ol_stateid * @@ -6466,7 +6473,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, *new = false; spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); + lst = find_lock_stateid(lo, ost); spin_unlock(&clp->cl_lock); if (lst != NULL) { if (nfsd4_lock_ol_stateid(lst) == nfs_ok) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9761512674a0..996ac01ee977 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3591,23 +3591,22 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 nfserr; __be32 tmp; __be32 *p; - u32 zzz = 0; int pad; + /* + * svcrdma requires every READ payload to start somewhere + * in xdr->pages. + */ + if (xdr->iov == xdr->buf->head) { + xdr->iov = NULL; + xdr->end = xdr->p; + } + len = maxcount; v = 0; - - thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); - p = xdr_reserve_space(xdr, (thislen+3)&~3); - WARN_ON_ONCE(!p); - resp->rqstp->rq_vec[v].iov_base = p; - resp->rqstp->rq_vec[v].iov_len = thislen; - v++; - len -= thislen; - while (len) { thislen = min_t(long, len, PAGE_SIZE); - p = xdr_reserve_space(xdr, (thislen+3)&~3); + p = xdr_reserve_space(xdr, thislen); WARN_ON_ONCE(!p); resp->rqstp->rq_vec[v].iov_base = p; resp->rqstp->rq_vec[v].iov_len = thislen; @@ -3616,23 +3615,25 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, } read->rd_vlen = v; - len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount)) + return nfserr_io; + xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount)); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + tmp = xdr_zero; pad = (maxcount&3) ? 
4 - (maxcount&3) : 0; write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, - &zzz, pad); + &tmp, pad); return 0; } @@ -4005,11 +4006,12 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, int major_id_sz; int server_scope_sz; uint64_t minor_id = 0; + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); - major_id = utsname()->nodename; - major_id_sz = strlen(major_id); - server_scope = utsname()->nodename; - server_scope_sz = strlen(server_scope); + major_id = nn->nfsd_name; + major_id_sz = strlen(nn->nfsd_name); + server_scope = nn->nfsd_name; + server_scope_sz = strlen(nn->nfsd_name); p = xdr_reserve_space(xdr, 8 /* eir_clientid */ + diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e109a1007704..3bb2db947d29 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1333,6 +1333,7 @@ void nfsd_client_rmdir(struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); WARN_ON_ONCE(ret); + fsnotify_rmdir(dir, dentry); d_delete(dentry); inode_unlock(dir); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index b319080288c3..37bc8f5f4514 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -14,6 +14,7 @@ #include "nfsd.h" #include "vfs.h" #include "auth.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FH @@ -209,11 +210,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) } error = nfserr_stale; - if (PTR_ERR(exp) == -ENOENT) - return error; + if (IS_ERR(exp)) { + trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp)); + + if (PTR_ERR(exp) == -ENOENT) + return error; - if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); + } if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { /* Elevate privileges so that the lack of 'r' or 'x' @@ -267,6 +271,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, data_left, fileid_type, nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) + trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, + dentry ? 
PTR_ERR(dentry) : -ESTALE); } if (dentry == NULL) goto out; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 3b77b904212d..ca9fd348548b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -749,6 +749,9 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; + strlcpy(nn->nfsd_name, utsname()->nodename, + sizeof(nn->nfsd_name)); + error = nfsd_create_serv(net); if (error) goto out; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 06dd0d337049..78c574251c60 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,6 +9,7 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include "export.h" #include "nfsfh.h" TRACE_EVENT(nfsd_compound, @@ -50,6 +51,127 @@ TRACE_EVENT(nfsd_compound_status, __get_str(name), __entry->status) ) +DECLARE_EVENT_CLASS(nfsd_fh_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + int status), + TP_ARGS(rqstp, fhp, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x status=%d", + __entry->xid, __entry->fh_hash, + __entry->status) +) + +#define DEFINE_NFSD_FH_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + int status), \ + TP_ARGS(rqstp, fhp, status)) + +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); + +TRACE_EVENT(nfsd_exp_find_key, + TP_PROTO(const struct svc_expkey *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __field(int, status) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __entry->status = status; + ), + TP_printk("fsid=%x::%s domain=%s status=%d", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_expkey_update, + TP_PROTO(const struct svc_expkey *key, const char *exp_path), + TP_ARGS(key, exp_path), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __string(path, exp_path) + __field(bool, cache) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __assign_str(path, exp_path); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __get_str(path), + __entry->cache ? 
"pos" : "neg" + ) +); + +TRACE_EVENT(nfsd_exp_get_by_name, + TP_PROTO(const struct svc_export *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(int, status) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->status = status; + ), + TP_printk("path=%s domain=%s status=%d", + __get_str(path), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_export_update, + TP_PROTO(const struct svc_export *key), + TP_ARGS(key), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(bool, cache) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("path=%s domain=%s cache=%s", + __get_str(path), + __get_str(auth_domain), + __entry->cache ? "pos" : "neg" + ) +); + DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5778d1347b35..5435a40f82be 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -17,6 +17,59 @@ #include "fanotify.h" +static bool fanotify_path_equal(struct path *p1, struct path *p2) +{ + return p1->mnt == p2->mnt && p1->dentry == p2->dentry; +} + +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + +static bool fanotify_fh_equal(struct fanotify_fh *fh1, + struct fanotify_fh *fh2) +{ + if (fh1->type != fh2->type || fh1->len != fh2->len) + return false; + + /* Do not merge events if we failed to encode fh */ + if (fh1->type == FILEID_INVALID) + return false; + + return !fh1->len || + !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len); +} + +static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1, + struct fanotify_fid_event *ffe2) +{ + /* Do not merge fid events without object fh */ + if (!ffe1->object_fh.len) + return false; + + return fanotify_fsid_equal(&ffe1->fsid, &ffe2->fsid) && + fanotify_fh_equal(&ffe1->object_fh, &ffe2->object_fh); +} + +static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, + struct fanotify_name_event *fne2) +{ + /* + * Do not merge name events without dir fh. + * FAN_DIR_MODIFY does not encode object fh, so it may be empty. 
+ */ + if (!fne1->dir_fh.len) + return false; + + if (fne1->name_len != fne2->name_len || + !fanotify_fh_equal(&fne1->dir_fh, &fne2->dir_fh)) + return false; + + return !memcmp(fne1->name, fne2->name, fne1->name_len); +} + static bool should_merge(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { @@ -26,14 +79,15 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || - old->fh_type != new->fh_type || old->fh_len != new->fh_len) + if (old_fsn->objectid != new_fsn->objectid || + old->type != new->type || old->pid != new->pid) return false; - if (fanotify_event_has_path(old)) { - return old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry; - } else if (fanotify_event_has_fid(old)) { + switch (old->type) { + case FANOTIFY_EVENT_TYPE_PATH: + return fanotify_path_equal(fanotify_event_path(old), + fanotify_event_path(new)); + case FANOTIFY_EVENT_TYPE_FID: /* * We want to merge many dirent events in the same dir (i.e. * creates/unlinks/renames), but we do not want to merge dirent @@ -42,11 +96,18 @@ static bool should_merge(struct fsnotify_event *old_fsn, * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+ * unlink pair or rmdir+create pair of events. */ - return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) && - fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR)) + return false; + + return fanotify_fid_event_equal(FANOTIFY_FE(old), + FANOTIFY_FE(new)); + case FANOTIFY_EVENT_TYPE_FID_NAME: + return fanotify_name_event_equal(FANOTIFY_NE(old), + FANOTIFY_NE(new)); + default: + WARN_ON_ONCE(1); } - /* Do not merge events if we failed to encode fid */ return false; } @@ -151,7 +212,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS; - const struct path *path = data; + const struct path *path = fsnotify_data_path(data, data_type); struct fsnotify_mark *mark; int type; @@ -160,7 +221,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { /* Do we have path to open a file descriptor? */ - if (data_type != FSNOTIFY_EVENT_PATH) + if (!path) return 0; /* Path type events are only relevant for files and dirs */ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) @@ -172,6 +233,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, continue; mark = iter_info->marks[type]; /* + * If the event is on dir and this mark doesn't care about + * events on dir, don't send it! + */ + if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + continue; + + /* * If the event is for a child and this mark doesn't care about * events on a child, don't send it! */ @@ -187,9 +255,9 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, test_mask = event_mask & marks_mask & ~marks_ignored_mask; /* - * dirent modification events (create/delete/move) do not carry the - * child entry name/inode information. Instead, we report FAN_ONDIR - * for mkdir/rmdir so user can differentiate them from creat/unlink. + * For dirent modification events (create/delete/move) that do not carry + * the child entry name information, we report FAN_ONDIR for mkdir/rmdir + * so user can differentiate them from creat/unlink. 
* * For backward compatibility and consistency, do not report FAN_ONDIR * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR @@ -203,22 +271,20 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, user_mask &= ~FAN_ONDIR; } - if (event_mask & FS_ISDIR && - !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) - return 0; - return test_mask & user_mask; } -static int fanotify_encode_fid(struct fanotify_event *event, - struct inode *inode, gfp_t gfp, - __kernel_fsid_t *fsid) +static void fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, + gfp_t gfp) { - struct fanotify_fid *fid = &event->fid; - int dwords, bytes = 0; - int err, type; + int dwords, type, bytes = 0; + char *ext_buf = NULL; + void *buf = fh->buf; + int err; + + if (!inode) + goto out; - fid->ext_fh = NULL; dwords = 0; err = -ENOENT; type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); @@ -229,31 +295,33 @@ static int fanotify_encode_fid(struct fanotify_event *event, if (bytes > FANOTIFY_INLINE_FH_LEN) { /* Treat failure to allocate fh as failure to allocate event */ err = -ENOMEM; - fid->ext_fh = kmalloc(bytes, gfp); - if (!fid->ext_fh) + ext_buf = kmalloc(bytes, gfp); + if (!ext_buf) goto out_err; + + *fanotify_fh_ext_buf_ptr(fh) = ext_buf; + buf = ext_buf; } - type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes), - &dwords, NULL); + type = exportfs_encode_inode_fh(inode, buf, &dwords, NULL); err = -EINVAL; if (!type || type == FILEID_INVALID || bytes != dwords << 2) goto out_err; - fid->fsid = *fsid; - event->fh_len = bytes; + fh->type = type; + fh->len = bytes; - return type; + return; out_err: - pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " - "type=%d, bytes=%d, err=%i)\n", - fsid->val[0], fsid->val[1], type, bytes, err); - kfree(fid->ext_fh); - fid->ext_fh = NULL; - event->fh_len = 0; - - return FILEID_INVALID; + pr_warn_ratelimited("fanotify: failed to encode fid (type=%d, len=%d, err=%i)\n", + type, bytes, err); + kfree(ext_buf); + *fanotify_fh_ext_buf_ptr(fh) = NULL; +out: + /* Report the event without a file identifier on encode error */ + fh->type = FILEID_INVALID; + fh->len = 0; } /* @@ -269,21 +337,22 @@ static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask, { if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) return to_tell; - else if (data_type == FSNOTIFY_EVENT_INODE) - return (struct inode *)data; - else if (data_type == FSNOTIFY_EVENT_PATH) - return d_inode(((struct path *)data)->dentry); - return NULL; + + return (struct inode *)fsnotify_data_inode(data, data_type); } struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, + const struct qstr *file_name, __kernel_fsid_t *fsid) { struct fanotify_event *event = NULL; + struct fanotify_fid_event *ffe = NULL; + struct fanotify_name_event *fne = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); + const struct path *path = fsnotify_data_path(data, data_type); /* * For queues with unlimited length lost events are not expected and @@ -305,33 +374,81 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) goto out; + event = &pevent->fae; + event->type = FANOTIFY_EVENT_TYPE_PATH_PERM; pevent->response = 0; pevent->state = FAN_EVENT_INIT; goto init; } - event = kmem_cache_alloc(fanotify_event_cachep, gfp); - if (!event) - goto out; -init: 
__maybe_unused - fsnotify_init_event(&event->fse, inode); + + /* + * For FAN_DIR_MODIFY event, we report the fid of the directory and + * the name of the modified entry. + * Allocate an fanotify_name_event struct and copy the name. + */ + if (mask & FAN_DIR_MODIFY && !(WARN_ON_ONCE(!file_name))) { + fne = kmalloc(sizeof(*fne) + file_name->len + 1, gfp); + if (!fne) + goto out; + + event = &fne->fae; + event->type = FANOTIFY_EVENT_TYPE_FID_NAME; + fne->name_len = file_name->len; + strcpy(fne->name, file_name->name); + goto init; + } + + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + ffe = kmem_cache_alloc(fanotify_fid_event_cachep, gfp); + if (!ffe) + goto out; + + event = &ffe->fae; + event->type = FANOTIFY_EVENT_TYPE_FID; + } else { + struct fanotify_path_event *pevent; + + pevent = kmem_cache_alloc(fanotify_path_event_cachep, gfp); + if (!pevent) + goto out; + + event = &pevent->fae; + event->type = FANOTIFY_EVENT_TYPE_PATH; + } + +init: + /* + * Use the victim inode instead of the watching inode as the id for + * event queue, so event reported on parent is merged with event + * reported on child when both directory and child watches exist. + */ + fsnotify_init_event(&event->fse, (unsigned long)id); event->mask = mask; if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - event->fh_len = 0; - if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { - /* Report the event without a file identifier on encode error */ - event->fh_type = fanotify_encode_fid(event, id, gfp, fsid); - } else if (data_type == FSNOTIFY_EVENT_PATH) { - event->fh_type = FILEID_ROOT; - event->path = *((struct path *)data); - path_get(&event->path); - } else { - event->fh_type = FILEID_INVALID; - event->path.mnt = NULL; - event->path.dentry = NULL; + + if (fsid && fanotify_event_fsid(event)) + *fanotify_event_fsid(event) = *fsid; + + if (fanotify_event_object_fh(event)) + fanotify_encode_fh(fanotify_event_object_fh(event), id, gfp); + + if (fanotify_event_dir_fh(event)) + fanotify_encode_fh(fanotify_event_dir_fh(event), id, gfp); + + if (fanotify_event_has_path(event)) { + struct path *p = fanotify_event_path(event); + + if (path) { + *p = *path; + path_get(path); + } else { + p->mnt = NULL; + p->dentry = NULL; + } } out: memalloc_unuse_memcg(); @@ -392,6 +509,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(FAN_CREATE != FS_CREATE); BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DIR_MODIFY != FS_DIR_MODIFY); BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); @@ -402,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); @@ -429,7 +547,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, } event = fanotify_alloc_event(group, inode, mask, data, data_type, - &fsid); + file_name, &fsid); ret = -ENOMEM; if (unlikely(!event)) { /* @@ -451,7 +569,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = 0; } else if (fanotify_is_perm_event(mask)) { - ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), + ret = 
fanotify_get_response(group, FANOTIFY_PERM(event), iter_info); } finish: @@ -470,22 +588,58 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_path_event(struct fanotify_event *event) +{ + path_put(fanotify_event_path(event)); + kmem_cache_free(fanotify_path_event_cachep, FANOTIFY_PE(event)); +} + +static void fanotify_free_perm_event(struct fanotify_event *event) +{ + path_put(fanotify_event_path(event)); + kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PERM(event)); +} + +static void fanotify_free_fid_event(struct fanotify_event *event) +{ + struct fanotify_fid_event *ffe = FANOTIFY_FE(event); + + if (fanotify_fh_has_ext_buf(&ffe->object_fh)) + kfree(fanotify_fh_ext_buf(&ffe->object_fh)); + kmem_cache_free(fanotify_fid_event_cachep, ffe); +} + +static void fanotify_free_name_event(struct fanotify_event *event) +{ + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + if (fanotify_fh_has_ext_buf(&fne->dir_fh)) + kfree(fanotify_fh_ext_buf(&fne->dir_fh)); + kfree(fne); +} + static void fanotify_free_event(struct fsnotify_event *fsn_event) { struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - if (fanotify_event_has_path(event)) - path_put(&event->path); - else if (fanotify_event_has_ext_fh(event)) - kfree(event->fid.ext_fh); put_pid(event->pid); - if (fanotify_is_perm_event(event->mask)) { - kmem_cache_free(fanotify_perm_event_cachep, - FANOTIFY_PE(fsn_event)); - return; + switch (event->type) { + case FANOTIFY_EVENT_TYPE_PATH: + fanotify_free_path_event(event); + break; + case FANOTIFY_EVENT_TYPE_PATH_PERM: + fanotify_free_perm_event(event); + break; + case FANOTIFY_EVENT_TYPE_FID: + fanotify_free_fid_event(event); + break; + case FANOTIFY_EVENT_TYPE_FID_NAME: + fanotify_free_name_event(event); + break; + default: + WARN_ON_ONCE(1); } - kmem_cache_free(fanotify_event_cachep, event); } static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 68b30504284c..35bfbf4a7aac 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -5,7 +5,8 @@ #include <linux/exportfs.h> extern struct kmem_cache *fanotify_mark_cache; -extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_fid_event_cachep; +extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; /* Possible states of the permission event */ @@ -18,94 +19,140 @@ enum { /* * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). - * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and - * fh_* fields increase the size of fanotify_event by another 4 bytes. - * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and - * fh_* fields are packed in a hole after mask. + * fh buf should be dword aligned. On 64bit arch, the ext_buf pointer is + * stored in either the first or last 2 dwords. 
*/ -#if BITS_PER_LONG == 32 #define FANOTIFY_INLINE_FH_LEN (3 << 2) -#else -#define FANOTIFY_INLINE_FH_LEN (4 << 2) -#endif -struct fanotify_fid { - __kernel_fsid_t fsid; - union { - unsigned char fh[FANOTIFY_INLINE_FH_LEN]; - unsigned char *ext_fh; - }; -}; +struct fanotify_fh { + unsigned char buf[FANOTIFY_INLINE_FH_LEN]; + u8 type; + u8 len; +} __aligned(4); + +static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh) +{ + return fh->len > FANOTIFY_INLINE_FH_LEN; +} + +static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh) +{ + BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) > + FANOTIFY_INLINE_FH_LEN); + return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *)); +} -static inline void *fanotify_fid_fh(struct fanotify_fid *fid, - unsigned int fh_len) +static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh) { - return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; + return *fanotify_fh_ext_buf_ptr(fh); } -static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, - struct fanotify_fid *fid2, - unsigned int fh_len) +static inline void *fanotify_fh_buf(struct fanotify_fh *fh) { - return fid1->fsid.val[0] == fid2->fsid.val[0] && - fid1->fsid.val[1] == fid2->fsid.val[1] && - !memcmp(fanotify_fid_fh(fid1, fh_len), - fanotify_fid_fh(fid2, fh_len), fh_len); + return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf; } /* - * Structure for normal fanotify events. It gets allocated in + * Common structure for fanotify events. Concrete structs are allocated in * fanotify_handle_event() and freed when the information is retrieved by - * userspace + * userspace. The type of event determines how it was allocated, how it will + * be freed and which concrete struct it may be cast to. */ +enum fanotify_event_type { + FANOTIFY_EVENT_TYPE_FID, /* fixed length */ + FANOTIFY_EVENT_TYPE_FID_NAME, /* variable length */ + FANOTIFY_EVENT_TYPE_PATH, + FANOTIFY_EVENT_TYPE_PATH_PERM, +}; + struct fanotify_event { struct fsnotify_event fse; u32 mask; - /* - * Those fields are outside fanotify_fid to pack fanotify_event nicely - * on 64bit arch and to use fh_type as an indication of whether path - * or fid are used in the union: - * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. - */ - u8 fh_type; - u8 fh_len; - u16 pad; - union { - /* - * We hold ref to this path so it may be dereferenced at any - * point during this object's lifetime - */ - struct path path; - /* - * With FAN_REPORT_FID, we do not hold any reference on the - * victim object. Instead we store its NFS file handle and its - * filesystem's fsid as a unique identifier. 
- */ - struct fanotify_fid fid; - }; + enum fanotify_event_type type; struct pid *pid; }; -static inline bool fanotify_event_has_path(struct fanotify_event *event) +struct fanotify_fid_event { + struct fanotify_event fae; + __kernel_fsid_t fsid; + struct fanotify_fh object_fh; +}; + +static inline struct fanotify_fid_event * +FANOTIFY_FE(struct fanotify_event *event) { - return event->fh_type == FILEID_ROOT; + return container_of(event, struct fanotify_fid_event, fae); } -static inline bool fanotify_event_has_fid(struct fanotify_event *event) +struct fanotify_name_event { + struct fanotify_event fae; + __kernel_fsid_t fsid; + struct fanotify_fh dir_fh; + u8 name_len; + char name[0]; +}; + +static inline struct fanotify_name_event * +FANOTIFY_NE(struct fanotify_event *event) { - return event->fh_type != FILEID_ROOT && - event->fh_type != FILEID_INVALID; + return container_of(event, struct fanotify_name_event, fae); } -static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { - return fanotify_event_has_fid(event) && - event->fh_len > FANOTIFY_INLINE_FH_LEN; + if (event->type == FANOTIFY_EVENT_TYPE_FID) + return &FANOTIFY_FE(event)->fsid; + else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) + return &FANOTIFY_NE(event)->fsid; + else + return NULL; } -static inline void *fanotify_event_fh(struct fanotify_event *event) +static inline struct fanotify_fh *fanotify_event_object_fh( + struct fanotify_event *event) { - return fanotify_fid_fh(&event->fid, event->fh_len); + if (event->type == FANOTIFY_EVENT_TYPE_FID) + return &FANOTIFY_FE(event)->object_fh; + else + return NULL; +} + +static inline struct fanotify_fh *fanotify_event_dir_fh( + struct fanotify_event *event) +{ + if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) + return &FANOTIFY_NE(event)->dir_fh; + else + return NULL; +} + +static inline int fanotify_event_object_fh_len(struct fanotify_event *event) +{ + struct fanotify_fh *fh = fanotify_event_object_fh(event); + + return fh ? fh->len : 0; +} + +static inline bool fanotify_event_has_name(struct fanotify_event *event) +{ + return event->type == FANOTIFY_EVENT_TYPE_FID_NAME; +} + +static inline int fanotify_event_name_len(struct fanotify_event *event) +{ + return fanotify_event_has_name(event) ? 
+ FANOTIFY_NE(event)->name_len : 0; +} + +struct fanotify_path_event { + struct fanotify_event fae; + struct path path; +}; + +static inline struct fanotify_path_event * +FANOTIFY_PE(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_path_event, fae); } /* @@ -117,15 +164,16 @@ static inline void *fanotify_event_fh(struct fanotify_event *event) */ struct fanotify_perm_event { struct fanotify_event fae; + struct path path; unsigned short response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ }; static inline struct fanotify_perm_event * -FANOTIFY_PE(struct fsnotify_event *fse) +FANOTIFY_PERM(struct fanotify_event *event) { - return container_of(fse, struct fanotify_perm_event, fae.fse); + return container_of(event, struct fanotify_perm_event, fae); } static inline bool fanotify_is_perm_event(u32 mask) @@ -139,7 +187,24 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event, fse); } +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->type == FANOTIFY_EVENT_TYPE_PATH || + event->type == FANOTIFY_EVENT_TYPE_PATH_PERM; +} + +static inline struct path *fanotify_event_path(struct fanotify_event *event) +{ + if (event->type == FANOTIFY_EVENT_TYPE_PATH) + return &FANOTIFY_PE(event)->path; + else if (event->type == FANOTIFY_EVENT_TYPE_PATH_PERM) + return &FANOTIFY_PERM(event)->path; + else + return NULL; +} + struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, + const struct qstr *file_name, __kernel_fsid_t *fsid); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 0aa362b88550..42cb794c62ac 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -46,32 +46,53 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_event_cachep __read_mostly; +struct kmem_cache *fanotify_fid_event_cachep __read_mostly; +struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 +#define FANOTIFY_INFO_HDR_LEN \ + (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) + +static int fanotify_fid_info_len(int fh_len, int name_len) +{ + int info_len = fh_len; + + if (name_len) + info_len += name_len + 1; + + return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); +} static int fanotify_event_info_len(struct fanotify_event *event) { - if (!fanotify_event_has_fid(event)) - return 0; + int info_len = 0; + int fh_len = fanotify_event_object_fh_len(event); + + if (fh_len) + info_len += fanotify_fid_info_len(fh_len, 0); - return roundup(sizeof(struct fanotify_event_info_fid) + - sizeof(struct file_handle) + event->fh_len, - FANOTIFY_EVENT_ALIGN); + if (fanotify_event_name_len(event)) { + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + info_len += fanotify_fid_info_len(fne->dir_fh.len, + fne->name_len); + } + + return info_len; } /* - * Get an fsnotify notification event if one exists and is small + * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When permission event is dequeued, its state is * updated accordingly. 
*/ -static struct fsnotify_event *get_one_event(struct fsnotify_group *group, +static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size = FAN_EVENT_METADATA_LEN; - struct fsnotify_event *fsn_event = NULL; + struct fanotify_event *event = NULL; pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -85,26 +106,23 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } if (event_size > count) { - fsn_event = ERR_PTR(-EINVAL); + event = ERR_PTR(-EINVAL); goto out; } - fsn_event = fsnotify_remove_first_event(group); - if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask)) - FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED; + event = FANOTIFY_E(fsnotify_remove_first_event(group)); + if (fanotify_is_perm_event(event->mask)) + FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; out: spin_unlock(&group->notification_lock); - return fsn_event; + return event; } -static int create_fd(struct fsnotify_group *group, - struct fanotify_event *event, +static int create_fd(struct fsnotify_group *group, struct path *path, struct file **file) { int client_fd; struct file *new_file; - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; @@ -113,14 +131,9 @@ static int create_fd(struct fsnotify_group *group, * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. */ - /* it's possible this event was an overflow event. in that case dentry and mnt - * are NULL; That's fine, just don't call dentry open */ - if (event->path.dentry && event->path.mnt) - new_file = dentry_open(&event->path, - group->fanotify_data.f_flags | FMODE_NONOTIFY, - current_cred()); - else - new_file = ERR_PTR(-EOVERFLOW); + new_file = dentry_open(path, + group->fanotify_data.f_flags | FMODE_NONOTIFY, + current_cred()); if (IS_ERR(new_file)) { /* * we still send an event even if we can't open the file. this @@ -204,83 +217,111 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + const char *name, size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; - unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; - size_t fh_len = event->fh_len; - size_t len = fanotify_event_info_len(event); + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; + size_t fh_len = fh ? fh->len : 0; + size_t info_len = fanotify_fid_info_len(fh_len, name_len); + size_t len = info_len; + + pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", + __func__, fh_len, name_len, info_len, count); - if (!len) + if (!fh_len || (name && !name_len)) return 0; - if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; - /* Copy event info fid header followed by vaiable sized file handle */ - info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + /* + * Copy event info fid header followed by variable sized file handle + * and optionally followed by variable sized filename. + */ + info.hdr.info_type = name_len ? 
FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_FID; info.hdr.len = len; - info.fsid = event->fid.fsid; + info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; buf += sizeof(info); len -= sizeof(info); - handle.handle_type = event->fh_type; + if (WARN_ON_ONCE(len < sizeof(handle))) + return -EFAULT; + + handle.handle_type = fh->type; handle.handle_bytes = fh_len; if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; buf += sizeof(handle); len -= sizeof(handle); + if (WARN_ON_ONCE(len < fh_len)) + return -EFAULT; + /* - * For an inline fh, copy through stack to exclude the copy from - * usercopy hardening protections. + * For an inline fh and inline file name, copy through stack to exclude + * the copy from usercopy hardening protections. */ - fh = fanotify_event_fh(event); + fh_buf = fanotify_fh_buf(fh); if (fh_len <= FANOTIFY_INLINE_FH_LEN) { - memcpy(bounce, fh, fh_len); - fh = bounce; + memcpy(bounce, fh_buf, fh_len); + fh_buf = bounce; } - if (copy_to_user(buf, fh, fh_len)) + if (copy_to_user(buf, fh_buf, fh_len)) return -EFAULT; - /* Pad with 0's */ buf += fh_len; len -= fh_len; + + if (name_len) { + /* Copy the filename with terminating null */ + name_len++; + if (WARN_ON_ONCE(len < name_len)) + return -EFAULT; + + if (copy_to_user(buf, name, name_len)) + return -EFAULT; + + buf += name_len; + len -= name_len; + } + + /* Pad with 0's */ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); if (len > 0 && clear_user(buf, len)) return -EFAULT; - return 0; + return info_len; } static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *fsn_event, + struct fanotify_event *event, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; - struct fanotify_event *event; + struct path *path = fanotify_event_path(event); struct file *f = NULL; int ret, fd = FAN_NOFD; - pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + pr_debug("%s: group=%p event=%p\n", __func__, group, event); - event = container_of(fsn_event, struct fanotify_event, fse); - metadata.event_len = FAN_EVENT_METADATA_LEN; + metadata.event_len = FAN_EVENT_METADATA_LEN + + fanotify_event_info_len(event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); - if (fanotify_event_has_path(event)) { - fd = create_fd(group, event, &f); + if (path && path->mnt && path->dentry) { + fd = create_fd(group, path, &f); if (fd < 0) return fd; - } else if (fanotify_event_has_fid(event)) { - metadata.event_len += fanotify_event_info_len(event); } metadata.fd = fd; @@ -295,15 +336,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; + buf += FAN_EVENT_METADATA_LEN; + count -= FAN_EVENT_METADATA_LEN; + if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PE(fsn_event)->fd = fd; + FANOTIFY_PERM(event)->fd = fd; - if (fanotify_event_has_path(event)) { + if (f) fd_install(fd, f); - } else if (fanotify_event_has_fid(event)) { - ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + + /* Event info records order is: dir fid + name, child fid */ + if (fanotify_event_name_len(event)) { + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + ret = copy_info_to_user(fanotify_event_fsid(event), + fanotify_event_dir_fh(event), + fne->name, fne->name_len, + buf, count); + if (ret < 0) + return ret; + + 
buf += ret; + count -= ret; + } + + if (fanotify_event_object_fh_len(event)) { + ret = copy_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + NULL, 0, buf, count); if (ret < 0) return ret; + + buf += ret; + count -= ret; } return metadata.event_len; @@ -335,7 +400,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; - struct fsnotify_event *kevent; + struct fanotify_event *event; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -347,13 +412,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - kevent = get_one_event(group, count); - if (IS_ERR(kevent)) { - ret = PTR_ERR(kevent); + event = get_one_event(group, count); + if (IS_ERR(event)) { + ret = PTR_ERR(event); break; } - if (!kevent) { + if (!event) { ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; @@ -369,7 +434,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, continue; } - ret = copy_event_to_user(group, kevent, buf, count); + ret = copy_event_to_user(group, event, buf, count); if (unlikely(ret == -EOPENSTALE)) { /* * We cannot report events with stale fd so drop it. @@ -384,17 +449,17 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) { - fsnotify_destroy_event(group, kevent); + if (!fanotify_is_perm_event(event->mask)) { + fsnotify_destroy_event(group, &event->fse); } else { if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PE(kevent), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, + list_add_tail(&event->fse.list, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } @@ -440,8 +505,6 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_perm_event *event; - struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since @@ -456,6 +519,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { + struct fanotify_perm_event *event; + event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); @@ -469,12 +534,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) * response is consumed and fanotify_get_response() returns. 
*/ while (!fsnotify_notify_queue_is_empty(group)) { - fsn_event = fsnotify_remove_first_event(group); - if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) { + struct fanotify_event *event; + + event = FANOTIFY_E(fsnotify_remove_first_event(group)); + if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); - fsnotify_destroy_event(group, fsn_event); + fsnotify_destroy_event(group, &event->fse); } else { - finish_permission_event(group, FANOTIFY_PE(fsn_event), + finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW); } spin_lock(&group->notification_lock); @@ -824,7 +891,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL); + FSNOTIFY_EVENT_NONE, NULL, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; @@ -1139,7 +1206,10 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC); + fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, + SLAB_PANIC); + fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, + SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 46f225580009..72d332ce8e12 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -143,15 +143,13 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) +int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, + int data_type) { struct dentry *parent; struct inode *p_inode; int ret = 0; - if (!dentry) - dentry = path->dentry; - if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) return 0; @@ -168,12 +166,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask mask |= FS_EVENT_ON_CHILD; take_dentry_name_snapshot(&name, dentry); - if (path) - ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - &name.name, 0); - else - ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - &name.name, 0); + ret = fsnotify(p_inode, mask, data, data_type, &name.name, 0); release_dentry_name_snapshot(&name); } @@ -181,7 +174,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask return ret; } -EXPORT_SYMBOL_GPL(__fsnotify_parent); +EXPORT_SYMBOL_GPL(fsnotify_parent); static int send_to_group(struct inode *to_tell, __u32 mask, const void *data, @@ -318,6 +311,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const struct qstr *file_name, u32 cookie) { + const struct path *path = fsnotify_data_path(data, data_is); struct fsnotify_iter_info iter_info = {}; struct super_block *sb = to_tell->i_sb; struct mount *mnt = NULL; @@ -325,8 +319,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, int ret = 0; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (data_is == FSNOTIFY_EVENT_PATH) { - mnt = real_mount(((const struct path *)data)->mnt); + if (path) { + mnt = real_mount(path->mnt); mnt_or_sb_mask |= mnt->mnt_fsnotify_mask; } /* An event "on 
child" is not intended for a mount/sb mark */ @@ -389,7 +383,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d510223d302c..2ebc89047153 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -39,7 +39,7 @@ static bool event_compare(struct fsnotify_event *old_fsn, if (old->mask & FS_IN_IGNORED) return false; if ((old->mask == new->mask) && - (old_fsn->inode == new_fsn->inode) && + (old_fsn->objectid == new_fsn->objectid) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) return true; @@ -61,6 +61,7 @@ int inotify_handle_event(struct fsnotify_group *group, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { + const struct path *path = fsnotify_data_path(data, data_type); struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -73,12 +74,9 @@ int inotify_handle_event(struct fsnotify_group *group, return 0; if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - const struct path *path = data; + path && d_unlinked(path->dentry)) + return 0; - if (d_unlinked(path->dentry)) - return 0; - } if (file_name) { len = file_name->len; alloc_len += len + 1; @@ -118,7 +116,7 @@ int inotify_handle_event(struct fsnotify_group *group, mask &= ~IN_ISDIR; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode); + fsnotify_init_event(fsn_event, (unsigned long)inode); event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 107537a543fd..81ffc8629fc4 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -635,7 +635,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL); + fsnotify_init_event(group->overflow_event, 0); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; diff --git a/fs/nsfs.c b/fs/nsfs.c index b13bfd406820..4f1205725cfe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -247,6 +247,20 @@ out_invalid: return ERR_PTR(-EINVAL); } +/** + * ns_match() - Returns true if current namespace matches dev/ino provided. + * @ns_common: current ns + * @dev: dev_t from nsfs that will be matched against current nsfs + * @ino: ino_t from nsfs that will be matched against current nsfs + * + * Return: true if dev and ino matches the current nsfs. 
+ */ +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) +{ + return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev); +} + + static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 7202a1e39d70..554b744f41bf 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 88534eb0e7c2..65b3abbcce4e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1060,7 +1060,6 @@ bail: brelse(bhs[i]); bhs[i] = NULL; } - mlog_errno(status); } return status; } @@ -3942,7 +3941,7 @@ rotate: * above. * * This leaf needs to have space, either by the empty 1st - * extent record, or by virtue of an l_next_rec < l_count. + * extent record, or by virtue of an l_next_free_rec < l_count. 
*/ ocfs2_rotate_leaf(el, insert_rec); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a368350d4c27..89d13e0705fe 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -101,8 +101,6 @@ static struct o2hb_callback { static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); -#define O2HB_DEFAULT_BLOCK_BITS 9 - enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_LOCAL = 0, O2HB_HEARTBEAT_GLOBAL, @@ -1309,7 +1307,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_NUMBER: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n", reg->hr_region_num); goto done; @@ -1319,12 +1317,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) /* If 0, it has never been set before */ if (lts) lts = jiffies_to_msecs(jiffies - lts); - out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); + out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n", !!reg->hr_item_pinned); goto done; @@ -1333,8 +1331,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) } while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); done: i_size_write(inode, out); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 02bf4a1774cc..667a5c5e1f66 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -443,8 +443,8 @@ static int o2net_fill_bitmap(char *buf, int len) o2net_fill_node_map(map, sizeof(map)); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); return out; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 48a3398f0bf5..2c512b40a940 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1570,15 +1570,13 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; - unsigned int noio_flag; + unsigned int nofs_flag; /* - * sock_create allocates the sock with GFP_KERNEL. We must set - * per-process flag PF_MEMALLOC_NOIO so that all allocations done - * by this process are done as if GFP_NOIO was specified. So we - * are not reentering filesystem while doing memory reclaim. + * sock_create allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. 
*/ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; @@ -1683,7 +1681,7 @@ out: if (mynode) o2nm_node_put(mynode); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); return; } @@ -1810,15 +1808,13 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; struct o2net_node *nn; - unsigned int noio_flag; + unsigned int nofs_flag; /* - * sock_create_lite allocates the sock with GFP_KERNEL. We must set - * per-process flag PF_MEMALLOC_NOIO so that all allocations done - * by this process are done as if GFP_NOIO was specified. So we - * are not reentering filesystem while doing memory reclaim. + * sock_create_lite allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); BUG_ON(sock == NULL); *more = 0; @@ -1934,7 +1930,7 @@ out: if (sc) sc_put(sc); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); return ret; } @@ -1948,7 +1944,6 @@ static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; int more; - int err; /* * It is critical to note that due to interrupt moderation @@ -1963,7 +1958,7 @@ static void o2net_accept_many(struct work_struct *work) */ for (;;) { - err = o2net_accept_one(sock, &more); + o2net_accept_one(sock, &more); if (!more) break; cond_resched(); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index de87cbffd175..736338f45c59 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -32,7 +32,7 @@ struct o2net_msg __be32 status; __be32 key; __be32 msg_num; - __u8 buf[0]; + __u8 buf[]; }; typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bdef72c0f099..5761060d2ba8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -676,7 +676,7 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, int ra_ptr = 0; /* Current index into readahead buffer */ int num = 0; - int nblocks, i, err; + int nblocks, i; sb = dir->i_sb; @@ -708,7 +708,7 @@ restart: num++; bh = NULL; - err = ocfs2_read_dir_block(dir, b++, &bh, + ocfs2_read_dir_block(dir, b++, &bh, OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 0463dce65bb2..c8a444622faa 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -564,7 +564,7 @@ struct dlm_migratable_lockres // 48 bytes u8 lvb[DLM_LVB_LEN]; // 112 bytes - struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 + struct dlm_migratable_lock ml[]; // 16 bytes each, begins at byte 112 }; #define DLM_MIG_LOCKRES_MAX_LEN \ (sizeof(struct dlm_migratable_lockres) + \ @@ -601,7 +601,7 @@ struct dlm_convert_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) @@ -616,7 +616,7 @@ struct dlm_unlock_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) @@ -632,7 +632,7 @@ struct dlm_proxy_ast u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) diff --git a/fs/ocfs2/dlm/dlmdebug.c 
b/fs/ocfs2/dlm/dlmdebug.c index c5c6efba7b5e..4b8b41d23e91 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -244,11 +244,11 @@ static int stringify_lockname(const char *lockname, int locklen, char *buf, memcpy((__be64 *)&inode_blkno_be, (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], sizeof(__be64)); - out += snprintf(buf + out, len - out, "%.*s%08x", + out += scnprintf(buf + out, len - out, "%.*s%08x", OCFS2_DENTRY_LOCK_INO_START - 1, lockname, (unsigned int)be64_to_cpu(inode_blkno_be)); } else - out += snprintf(buf + out, len - out, "%.*s", + out += scnprintf(buf + out, len - out, "%.*s", locklen, lockname); return out; } @@ -260,7 +260,7 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes, int i = -1; while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) - out += snprintf(buf + out, len - out, "%d ", i); + out += scnprintf(buf + out, len - out, "%d ", i); return out; } @@ -278,34 +278,34 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) mle_type = "MIG"; out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", mle_type, mle->master, mle->new_master, !list_empty(&mle->hb_events), !!mle->inuse, kref_read(&mle->mle_refs)); - out += snprintf(buf + out, len - out, "Maybe="); + out += scnprintf(buf + out, len - out, "Maybe="); out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Vote="); + out += scnprintf(buf + out, len - out, "Vote="); out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Response="); + out += scnprintf(buf + out, len - out, "Response="); out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Node="); + out += scnprintf(buf + out, len - out, "Node="); out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -353,7 +353,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) int out = 0; unsigned long total = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); @@ -365,13 +365,13 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\t%ld\n", + out += scnprintf(buf + out, len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %lu\n", total); + out += scnprintf(buf + out, len - out, "Total on list: %lu\n", total); return out; } @@ -410,7 +410,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) int i, out = 0; unsigned long total = 
0, longest = 0, bucket_count = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); @@ -428,7 +428,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) } spin_unlock(&dlm->master_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Total: %lu, Longest: %lu\n", total, longest); return out; } @@ -467,7 +467,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) #define DEBUG_LOCK_VERSION 1 spin_lock(&lock->spinlock); - out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," "%d,%d,%d,%d\n", DEBUG_LOCK_VERSION, list_type, lock->ml.type, lock->ml.convert_type, @@ -491,13 +491,13 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) int i; int out = 0; - out += snprintf(buf + out, len - out, "NAME:"); + out += scnprintf(buf + out, len - out, "NAME:"); out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); #define DEBUG_LRES_VERSION 1 - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", DEBUG_LRES_VERSION, res->owner, res->state, res->last_used, @@ -509,17 +509,17 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) kref_read(&res->refs)); /* refmap */ - out += snprintf(buf + out, len - out, "RMAP:"); + out += scnprintf(buf + out, len - out, "RMAP:"); out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* lvb */ - out += snprintf(buf + out, len - out, "LVBX:"); + out += scnprintf(buf + out, len - out, "LVBX:"); for (i = 0; i < DLM_LVB_LEN; i++) - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%02x", (unsigned char)res->lvb[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* granted */ list_for_each_entry(lock, &res->granted, list) @@ -533,7 +533,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) list_for_each_entry(lock, &res->blocked, list) out += dump_lock(lock, 2, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -683,41 +683,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Thread Pid: %d Node: %d State: %s\n", task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Number of Joins: %d Joining Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Domain Map: "); + out += scnprintf(buf + out, len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, 
O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Exit Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += scnprintf(buf + out, len - out, "Exit Domain Map: "); out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Live Map: "); + out += scnprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -729,29 +729,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -760,12 +760,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, kref_read(&dlm->dlm_refs)); /* Dead Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? 
*/ @@ -775,19 +775,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Recovery Pid: %d Master: %d State: %s\n", task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(buf + out, len - out, "Recovery Map: "); + out += scnprintf(buf + out, len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Recovery Node State: */ - out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + out += scnprintf(buf + out, len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -815,7 +815,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "BAD"; break; } - out += snprintf(buf + out, len - out, "\t%u - %s\n", + out += scnprintf(buf + out, len - out, "\t%u - %s\n", node->node_num, state); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 900f7e466d11..55a6512e9fde 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2749,8 +2749,6 @@ leave: return ret; } -#define DLM_MIGRATION_RETRY_MS 100 - /* * Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index fd40c17cd022..5ccc4ff0b82a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,8 +39,6 @@ static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); -#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) - /* will exit holding res->spinlock, but may drop in function */ /* waits until flags are cleared on res->state */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) @@ -680,7 +678,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) #define DLM_THREAD_TIMEOUT_MS (4 * 1000) #define DLM_THREAD_MAX_DIRTY 100 -#define DLM_THREAD_MAX_ASTS 10 static int dlm_thread(void *data) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cb9e6a73bea9..152a0fc4e905 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2133,7 +2133,7 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, } #define OCFS2_SEC_BITS 34 -#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS) #define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) /* LVB only has room for 64 bits of time here so we pack it for diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 68ba354cf361..b425f0b01dce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -91,7 +91,7 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[0]; + unsigned char rm_replay_slots[]; }; static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index da65251ef815..5381020aaa9a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -406,7 +406,7 @@ static int ocfs2_mknod(struct inode *dir, if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } if (si.enable) { @@ -414,7 +414,7 @@ static int ocfs2_mknod(struct 
inode *dir, meta_ac, data_ac); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } } @@ -427,7 +427,7 @@ static int ocfs2_mknod(struct inode *dir, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); - goto leave; + goto roll_back; } dl = dentry->d_fsdata; @@ -437,12 +437,19 @@ static int ocfs2_mknod(struct inode *dir, &lookup); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } insert_inode_hash(inode); d_instantiate(dentry, inode); status = 0; + +roll_back: + if (status < 0 && S_ISDIR(mode)) { + ocfs2_add_links_count(dirfe, -1); + drop_nlink(dir); + } + leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 0db4a7ec58a2..0dd8c41bafd4 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -470,7 +470,7 @@ struct ocfs2_extent_list { __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ -/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +/*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */ }; /* @@ -484,7 +484,7 @@ struct ocfs2_chain_list { __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; -/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +/*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */ }; /* @@ -496,7 +496,7 @@ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; -/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +/*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */ }; /* @@ -640,7 +640,7 @@ struct ocfs2_local_alloc __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; -/*10*/ __u8 la_bitmap[0]; +/*10*/ __u8 la_bitmap[]; }; /* @@ -653,7 +653,7 @@ struct ocfs2_inline_data * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; - __u8 id_data[0]; /* Start of user data */ + __u8 id_data[]; /* Start of user data */ }; /* @@ -798,7 +798,7 @@ struct ocfs2_dx_entry_list { * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ - struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries * in a packed array of * length de_num_used */ }; @@ -935,7 +935,7 @@ struct ocfs2_refcount_list { __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ -/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +/*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */ }; @@ -1021,7 +1021,7 @@ struct ocfs2_xattr_header { buckets. A block uses xb_check and sets this field to zero.) */ - struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ + struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. 
*/ }; /* @@ -1207,7 +1207,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee43e51188be..cfb77f70c888 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -154,6 +154,7 @@ ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); @@ -161,6 +162,7 @@ static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 0249e8ca1028..bf3842e34fb9 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -33,9 +33,6 @@ static DEFINE_SPINLOCK(resv_lock); -#define OCFS2_MIN_RESV_WINDOW_BITS 8 -#define OCFS2_MAX_RESV_WINDOW_BITS 1024 - int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) { return (osb->osb_resv_level && osb->osb_dir_resv_level); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 8aa6a667860c..a191094694c6 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -656,8 +656,6 @@ error: * and easier to preserve the name. */ -#define FS_OCFS2_NM 1 - static struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 939df99d2dec..4836becb7578 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2509,9 +2509,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, bail: brelse(group_bh); - - if (status) - mlog_errno(status); return status; } @@ -2582,8 +2579,6 @@ static int _ocfs2_free_clusters(handle_t *handle, num_clusters); out: - if (status) - mlog_errno(status); return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 05dd68ade293..ac61eeaf3837 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -220,31 +220,31 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) int i, out = 0; unsigned long flags; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", "Device", osb->dev_str, osb->uuid_str, osb->fs_generation, osb->vol_label); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d Flags: 0x%lX\n", "Volume", atomic_read(&osb->vol_state), osb->osb_flags); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Block: %lu Cluster: %d\n", "Sizes", osb->sb->s_blocksize, osb->s_clustersize); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Compat: 0x%X Incompat: 0x%X " "ROcompat: 0x%X\n", "Features", osb->s_feature_compat, osb->s_feature_incompat, osb->s_feature_ro_compat); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", osb->s_mount_opt, osb->s_atime_quantum); if (cconn) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Stack: %s Name: %*s " "Version: %d.%d\n", "Cluster", 
(*osb->osb_cluster_stack == '\0' ? @@ -255,7 +255,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) } spin_lock_irqsave(&osb->dc_task_lock, flags); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), @@ -264,32 +264,32 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", "Recovery", (osb->recovery_thread_task ? task_pid_nr(osb->recovery_thread_task) : -1)); if (rm->rm_used == 0) - out += snprintf(buf + out, len - out, " None\n"); + out += scnprintf(buf + out, len - out, " None\n"); else { for (i = 0; i < rm->rm_used; i++) - out += snprintf(buf + out, len - out, " %d", + out += scnprintf(buf + out, len - out, " %d", rm->rm_entries[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); } spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), osb->osb_commit_interval); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", "Journal", osb->journal->j_state, osb->journal->j_trans_id, atomic_read(&osb->journal->j_num_trans)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => GlobalAllocs: %d LocalAllocs: %d " "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", "Stats", @@ -299,7 +299,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->alloc_stats.moves), atomic_read(&osb->alloc_stats.bg_extends)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %u Descriptor: %llu Size: %u bits " "Default: %u bits\n", "LocalAlloc", osb->local_alloc_state, @@ -307,7 +307,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) osb->local_alloc_bits, osb->local_alloc_default_bits); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => InodeSlot: %d StolenInodes: %d, " "MetaSlot: %d StolenMeta: %d\n", "Steal", osb->s_inode_steal_slot, @@ -316,20 +316,20 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->s_num_meta_stolen)); spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "OrphanScan => "); - out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + out += scnprintf(buf + out, len - out, "OrphanScan => "); + out += scnprintf(buf + out, len - out, "Local: %u Global: %u ", os->os_count, os->os_seqno); - out += snprintf(buf + out, len - out, " Last Scan: "); + out += scnprintf(buf + out, len - out, " Last Scan: "); if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) - out += snprintf(buf + out, len - out, "Disabled\n"); + out += scnprintf(buf + out, len - out, "Disabled\n"); else - out += snprintf(buf + out, len - out, "%lu seconds ago\n", + out += scnprintf(buf + out, len - out, "%lu seconds ago\n", (unsigned long)(ktime_get_seconds() - os->os_scantime)); - out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + out += scnprintf(buf + out, len - out, "%10s => %3s 
%10s\n", "Slots", "Num", "RecoGen"); for (i = 0; i < osb->max_slots; ++i) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s %c %3d %10d\n", " ", (i == osb->slot_num ? '*' : ' '), diff --git a/fs/open.c b/fs/open.c index b69d6eed67e6..719b320ede52 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1046,8 +1046,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; - if (flags & O_EXCL) + if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; + flags |= O_NOFOLLOW; + } } if (flags & O_DIRECTORY) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9fc47c2e078d..9709cf22cab3 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param) module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644); MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing"); +static bool ovl_must_copy_xattr(const char *name) +{ + return !strcmp(name, XATTR_POSIX_ACL_ACCESS) || + !strcmp(name, XATTR_POSIX_ACL_DEFAULT) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); +} + int ovl_copy_xattr(struct dentry *old, struct dentry *new) { ssize_t list_size, size, value_size = 0; @@ -107,8 +114,13 @@ retry: continue; /* Discard */ } error = vfs_setxattr(new, name, value, size, 0); - if (error) - break; + if (error) { + if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) + break; + + /* Ignore failure to copy unknown xattrs */ + error = 0; + } } kfree(value); out: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 8e57d5372b8f..279009dee366 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -static struct dentry *ovl_lookup_temp(struct dentry *workdir) +struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, ovl_dir_modified(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); + ovl_dentry_update_reval(dentry, newdentry, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + if (!hardlink) { /* * ovl_obtain_alias() can be called after ovl_create_real() @@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry) !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); } +static void ovl_drop_nlink(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct dentry *alias; + + /* Try to find another, hashed alias */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + if (alias != dentry && !d_unhashed(alias)) + break; + } + spin_unlock(&inode->i_lock); + + /* + * Changes to underlying layers may cause i_nlink to lose sync with + * reality. In this case prevent the link count from going to zero + * prematurely. 
+ */ + if (inode->i_nlink > !!alias) + drop_nlink(inode); +} + static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; @@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (is_dir) clear_nlink(dentry->d_inode); else - drop_nlink(dentry->d_inode); + ovl_drop_nlink(dentry); } ovl_nlink_end(dentry); @@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (new_is_dir) clear_nlink(d_inode(new)); else - drop_nlink(d_inode(new)); + ovl_drop_nlink(new); } ovl_dir_modified(old->d_parent, ovl_type_origin(old) || diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 6f54d70cef27..475c61f53f0f 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, ovl_set_flag(OVL_UPPERDATA, inode); dentry = d_find_any_alias(inode); - if (!dentry) { - dentry = d_alloc_anon(inode->i_sb); - if (!dentry) - goto nomem; - oe = ovl_alloc_entry(lower ? 1 : 0); - if (!oe) - goto nomem; - - if (lower) { - oe->lowerstack->dentry = dget(lower); - oe->lowerstack->layer = lowerpath->layer; - } - dentry->d_fsdata = oe; - if (upper_alias) - ovl_dentry_set_upper_alias(dentry); + if (dentry) + goto out_iput; + + dentry = d_alloc_anon(inode->i_sb); + if (unlikely(!dentry)) + goto nomem; + oe = ovl_alloc_entry(lower ? 1 : 0); + if (!oe) + goto nomem; + + if (lower) { + oe->lowerstack->dentry = dget(lower); + oe->lowerstack->layer = lowerpath->layer; } + dentry->d_fsdata = oe; + if (upper_alias) + ovl_dentry_set_upper_alias(dentry); + + ovl_dentry_update_reval(dentry, upper, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); return d_instantiate_anon(dentry, inode); nomem: - iput(inode); dput(dentry); - return ERR_PTR(-ENOMEM); + dentry = ERR_PTR(-ENOMEM); +out_iput: + iput(inode); + return dentry; } /* Get the upper or lower dentry in stach whose on layer @idx */ diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 79e8994e3bc1..b0d42ece4d7c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + unsigned int xinoshift = 64 - xinobits; if (samefs) { /* @@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) stat->dev = dentry->d_sb->s_dev; return 0; } else if (xinobits) { - unsigned int shift = 64 - xinobits; /* * All inode numbers of underlying fs should not be using the * high xinobits, so we use high xinobits to partition the * overlay st_ino address space. The high bits holds the fsid - * (upper fsid is 0). This way overlay inode numbers are unique - * and all inodes use overlay st_dev. Inode numbers are also - * persistent for a given layer configuration. + * (upper fsid is 0). The lowest xinobit is reserved for mapping + * the non-peresistent inode numbers range in case of overflow. + * This way all overlay inode numbers are unique and use the + * overlay st_dev. 
*/ - if (stat->ino >> shift) { - pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", - dentry, stat->ino, xinobits); - } else { - stat->ino |= ((u64)fsid) << shift; + if (likely(!(stat->ino >> xinoshift))) { + stat->ino |= ((u64)fsid) << (xinoshift + 1); stat->dev = dentry->d_sb->s_dev; return 0; + } else if (ovl_xino_warn(dentry->d_sb)) { + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); } } @@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = { /* * It is possible to stack overlayfs instance on top of another - * overlayfs instance as lower layer. We need to annonate the + * overlayfs instance as lower layer. We need to annotate the * stackable i_mutex locks according to stack level of the super * block instance. An overlayfs instance can never be in stack * depth 0 (there is always a real fs below it). An overlayfs @@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #endif } -static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, - unsigned long ino, int fsid) +static void ovl_next_ino(struct inode *inode) +{ + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); + if (unlikely(!inode->i_ino)) + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); +} + +static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) { int xinobits = ovl_xino_bits(inode->i_sb); + unsigned int xinoshift = 64 - xinobits; /* * When d_ino is consistent with st_ino (samefs or i_ino has enough * bits to encode layer), set the same value used for st_ino to i_ino, * so inode number exposed via /proc/locks and a like will be * consistent with d_ino and st_ino values. An i_ino value inconsistent - * with d_ino also causes nfsd readdirplus to fail. When called from - * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real - * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). + * with d_ino also causes nfsd readdirplus to fail. */ - if (ovl_same_dev(inode->i_sb)) { - inode->i_ino = ino; - if (xinobits && fsid && !(ino >> (64 - xinobits))) - inode->i_ino |= (unsigned long)fsid << (64 - xinobits); - } else { - inode->i_ino = get_next_ino(); + inode->i_ino = ino; + if (ovl_same_fs(inode->i_sb)) { + return; + } else if (xinobits && likely(!(ino >> xinoshift))) { + inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); + return; + } + + /* + * For directory inodes on non-samefs with xino disabled or xino + * overflow, we allocate a non-persistent inode number, to be used for + * resolving st_ino collisions in ovl_map_dev_ino(). + * + * To avoid ino collision with legitimate xino values from upper + * layer (fsid 0), use the lowest xinobit to map the non + * persistent inode numbers to the unified st_ino address space. 
+ */ + if (S_ISDIR(inode->i_mode)) { + ovl_next_ino(inode); + if (xinobits) { + inode->i_ino &= ~0UL >> xinobits; + inode->i_ino |= 1UL << xinoshift; + } } +} + +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid) +{ + struct inode *realinode; + + if (oip->upperdentry) + OVL_I(inode)->__upperdentry = oip->upperdentry; + if (oip->lowerpath && oip->lowerpath->dentry) + OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry)); + if (oip->lowerdata) + OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata)); + + realinode = ovl_inode_real(inode); + ovl_copyattr(realinode, inode); + ovl_copyflags(realinode, inode); + ovl_map_ino(inode, ino, fsid); +} + +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev, 0, 0); + ovl_fill_inode(inode, mode, rdev); return inode; } @@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); - int fsid = bylower ? oip->lowerpath->layer->fsid : 0; + int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir, metacopy = false; unsigned long ino = 0; int err = oip->newinode ? -EEXIST : -ENOMEM; @@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb, err = -ENOMEM; goto out_err; } + ino = realinode->i_ino; + fsid = lowerpath->layer->fsid; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); - ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_inode_init(inode, oip, ino, fsid); if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index ed9e129fae04..0db23baf98e7 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (err) goto out; - if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) { + if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { dput(upperdentry); err = -EREMOTE; goto out; @@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_free_oe; } + ovl_dentry_update_reval(dentry, upperdentry, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + revert_creds(old_cred); if (origin_path) { dput(origin_path->dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 3d3f2b8bdae5..e6f3670146ed 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -48,6 +48,12 @@ enum ovl_entry_flag { OVL_E_CONNECTED, }; +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -87,7 +93,7 @@ struct ovl_fb { u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ + u32 fid[]; /* file identifier should be 32bit aligned in-memory */ } __packed; /* In-memory and on-wire format for overlay file handle */ @@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); 
struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); @@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode); bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); -void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *lowerdata); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); @@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } +/* + * With xino=auto, we do best effort to keep all inodes on same st_dev and + * d_ino consistent with st_ino. + * With xino=on, we do the same effort but we warn if we failed. + */ +static inline bool ovl_xino_warn(struct super_block *sb) +{ + return OVL_FS(sb)->config.xino == OVL_XINO_ON; +} + /* All layers on same fs? */ static inline bool ovl_same_fs(struct super_block *sb) { @@ -410,6 +426,8 @@ struct ovl_inode_params { char *redirect; struct dentry *lowerdata; }; +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); @@ -451,6 +469,7 @@ struct ovl_cattr { struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct inode *dir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct dentry *workdir); struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 89015ea822e7..5762d802fe01 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -75,6 +75,8 @@ struct ovl_fs { struct inode *indexdir_trap; /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ int xino_mode; + /* For allocation of non-persistent inode numbers */ + atomic_long_t last_ino; }; static inline struct ovl_fs *OVL_FS(struct super_block *sb) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 40ac9ce2465a..e452ff7d583d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, - const char *name, int namelen) + const char *name, int namelen, bool warn) { - if (ino >> (64 - xinobits)) { - pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", - namelen, name, ino, xinobits); + unsigned int xinoshift = 64 - xinobits; + + if (unlikely(ino >> xinoshift)) { + if (warn) { + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + } return ino; } - return ino | ((u64)fsid) << (64 - xinobits); + /* + * The lowest xinobit is reserved for mapping the non-peresistent inode + * numbers 
range, but this range is only exposed via st_ino, not here. + */ + return ino | ((u64)fsid) << (xinoshift + 1); } /* @@ -515,7 +523,8 @@ get: } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, - p->name, p->len); + p->name, p->len, + ovl_xino_warn(dir->d_sb)); } out: @@ -645,6 +654,7 @@ struct ovl_readdir_translate { u64 parent_ino; int fsid; int xinobits; + bool xinowarn; }; static int ovl_fill_real(struct dir_context *ctx, const char *name, @@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, - name, namelen); + name, namelen, rdt->xinowarn); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); @@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) .ctx.actor = ovl_fill_real, .orig_ctx = ctx, .xinobits = ovl_xino_bits(dir->d_sb), + .xinowarn = ovl_xino_warn(dir->d_sb), }; if (rdt.xinobits && lower_layer) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ac967f1cb6e5..732ad5495c92 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -113,53 +113,54 @@ bug: return dentry; } -static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) { - struct ovl_entry *oe = dentry->d_fsdata; - unsigned int i; int ret = 1; - for (i = 0; i < oe->numlower; i++) { - struct dentry *d = oe->lowerstack[i].dentry; - - if (d->d_flags & DCACHE_OP_REVALIDATE) { - ret = d->d_op->d_revalidate(d, flags); - if (ret < 0) - return ret; - if (!ret) { - if (!(flags & LOOKUP_RCU)) - d_invalidate(d); - return -ESTALE; - } + if (weak) { + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) + ret = d->d_op->d_weak_revalidate(d, flags); + } else if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + ret = -ESTALE; } } - return 1; + return ret; } -static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_dentry_revalidate_common(struct dentry *dentry, + unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *upper; unsigned int i; int ret = 1; - for (i = 0; i < oe->numlower; i++) { - struct dentry *d = oe->lowerstack[i].dentry; + upper = ovl_dentry_upper(dentry); + if (upper) + ret = ovl_revalidate_real(upper, flags, weak); - if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { - ret = d->d_op->d_weak_revalidate(d, flags); - if (ret <= 0) - break; - } + for (i = 0; ret > 0 && i < oe->numlower; i++) { + ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags, + weak); } return ret; } -static const struct dentry_operations ovl_dentry_operations = { - .d_release = ovl_dentry_release, - .d_real = ovl_d_real, -}; +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, false); +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, true); +} -static const struct dentry_operations ovl_reval_dentry_operations = { +static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, @@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void) return 
ovl_redirect_dir_def ? "on" : "off"; } -enum { - OVL_XINO_OFF, - OVL_XINO_AUTO, - OVL_XINO_ON, -}; - static const char * const ovl_xino_str[] = { "off", "auto", @@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path) ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - if (!err) - if (ovl_dentry_remote(path->dentry)) { - pr_err("filesystem on '%s' not supported as upperdir\n", - tmp); - path_put_init(path); - err = -EINVAL; - } + if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + pr_err("filesystem on '%s' not supported as upperdir\n", + tmp); + path_put_init(path); + err = -EINVAL; + } kfree(tmp); } return err; @@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, } static int ovl_lower_dir(const char *name, struct path *path, - struct ovl_fs *ofs, int *stack_depth, bool *remote) + struct ovl_fs *ofs, int *stack_depth) { int fh_type; int err; @@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path, *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); - if (ovl_dentry_remote(path->dentry)) - *remote = true; - /* * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. @@ -1074,11 +1065,73 @@ out: return err; } +/* + * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and + * negative values if error is encountered. + */ +static int ovl_check_rename_whiteout(struct dentry *workdir) +{ + struct inode *dir = d_inode(workdir); + struct dentry *temp; + struct dentry *dest; + struct dentry *whiteout; + struct name_snapshot name; + int err; + + inode_lock_nested(dir, I_MUTEX_PARENT); + + temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0)); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto out_unlock; + + dest = ovl_lookup_temp(workdir); + err = PTR_ERR(dest); + if (IS_ERR(dest)) { + dput(temp); + goto out_unlock; + } + + /* Name is inline and stable - using snapshot as a copy helper */ + take_dentry_name_snapshot(&name, temp); + err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT); + if (err) { + if (err == -EINVAL) + err = 0; + goto cleanup_temp; + } + + whiteout = lookup_one_len(name.name.name, workdir, name.name.len); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto cleanup_temp; + + err = ovl_is_whiteout(whiteout); + + /* Best effort cleanup of whiteout and temp file */ + if (err) + ovl_cleanup(dir, whiteout); + dput(whiteout); + +cleanup_temp: + ovl_cleanup(dir, temp); + release_dentry_name_snapshot(&name); + dput(temp); + dput(dest); + +out_unlock: + inode_unlock(dir); + + return err; +} + static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; + bool rename_whiteout; + bool d_type; int fh_type; int err; @@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (err < 0) goto out; - /* - * We allowed this configuration and don't want to break users over - * kernel upgrade. So warn instead of erroring out. 
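The new ovl_check_rename_whiteout() probe above decides support by actually performing a RENAME_WHITEOUT rename inside the workdir. The same capability can be probed from userspace; a hedged example, assuming a glibc recent enough to expose renameat2() and the RENAME_* flags (file names are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("wh-probe", O_CREAT | O_WRONLY, 0600);

        if (fd < 0)
                return 1;
        close(fd);

        /* EINVAL: filesystem lacks RENAME_WHITEOUT; EPERM: no CAP_MKNOD */
        if (renameat2(AT_FDCWD, "wh-probe", AT_FDCWD, "wh-probe.dst",
                      RENAME_WHITEOUT) < 0)
                perror("renameat2(RENAME_WHITEOUT)");
        else
                puts("RENAME_WHITEOUT supported");

        unlink("wh-probe");
        unlink("wh-probe.dst");
        return 0;
}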
- */ - if (!err) + d_type = err; + if (!d_type) pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ @@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, else pr_warn("upper fs does not support tmpfile.\n"); + + /* Check if upper/work fs supports RENAME_WHITEOUT */ + err = ovl_check_rename_whiteout(ofs->workdir); + if (err < 0) + goto out; + + rename_whiteout = err; + if (!rename_whiteout) + pr_warn("upper fs does not support RENAME_WHITEOUT.\n"); + /* * Check if upper/work fs supports trusted.overlay.* xattr */ @@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); } + /* + * We allowed sub-optimal upper fs configuration and don't want to break + * users over kernel upgrade, but we never allowed remote upper fs, so + * we can enforce strict requirements for remote upper fs. + */ + if (ovl_dentry_remote(ofs->workdir) && + (!d_type || !rename_whiteout || ofs->noxattr)) { + pr_err("upper fs missing required features.\n"); + err = -EINVAL; + goto out; + } + /* Check if upper/work fs supports file handles */ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { @@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, /* * When all layers on same fs, overlay can use real inode numbers. - * With mount option "xino=on", mounter declares that there are enough - * free high bits in underlying fs to hold the unique fsid. + * With mount option "xino=<on|auto>", mounter declares that there are + * enough free high bits in underlying fs to hold the unique fsid. * If overlayfs does encounter underlying inodes using the high xino * bits reserved for fsid, it emits a warning and uses the original - * inode number. + * inode number or a non persistent inode number allocated from a + * dedicated range. */ if (ofs->numfs - !ofs->upper_mnt == 1) { if (ofs->config.xino == OVL_XINO_ON) @@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, ofs->xino_mode = 0; } else if (ofs->config.xino == OVL_XINO_OFF) { ofs->xino_mode = -1; - } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { + } else if (ofs->xino_mode < 0) { /* * This is a roundup of number of bits needed for encoding - * fsid, where fsid 0 is reserved for upper fs even with - * lower only overlay. + * fsid, where fsid 0 is reserved for upper fs (even with + * lower only overlay) +1 extra bit is reserved for the non + * persistent inode number range that is used for resolving + * xino lower bits overflow. 
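With the extra reserved bit, the number of high inode bits claimed per distinct layer filesystem count becomes ilog2(numfs - 1) + 2. A small worked sketch (the helper name is illustrative, not from the patch):

/* Reserved high bits for a given number of distinct layer filesystems. */
static int example_xino_reserved_bits(int numfs)
{
        /* enough bits to encode fsid values up to numfs - 1, plus one
         * bit for the non-persistent inode number range */
        return ilog2(numfs - 1) + 2;
}

/* numfs == 2 -> 2 bits, numfs == 3 -> 3 bits, numfs == 5 -> 4 bits */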
*/ - BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); - ofs->xino_mode = ilog2(ofs->numfs - 1) + 1; + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30); + ofs->xino_mode = ilog2(ofs->numfs - 1) + 2; } if (ofs->xino_mode > 0) { @@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, char *lowertmp, *lower; struct path *stack = NULL; unsigned int stacklen, numlower = 0, i; - bool remote = false; struct ovl_entry *oe; err = -ENOMEM; @@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], ofs, - &sb->s_stack_depth, &remote); + &sb->s_stack_depth); if (err) goto out_err; @@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, oe->lowerstack[i].layer = &ofs->layers[i+1]; } - if (remote) - sb->s_d_op = &ovl_reval_dentry_operations; - else - sb->s_d_op = &ovl_dentry_operations; - out: for (i = 0; i < numlower; i++) path_put(&stack[i]); @@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb, return 0; } +static struct dentry *ovl_get_root(struct super_block *sb, + struct dentry *upperdentry, + struct ovl_entry *oe) +{ + struct dentry *root; + struct ovl_path *lowerpath = &oe->lowerstack[0]; + unsigned long ino = d_inode(lowerpath->dentry)->i_ino; + int fsid = lowerpath->layer->fsid; + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = lowerpath, + }; + + root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + if (!root) + return NULL; + + root->d_fsdata = oe; + + if (upperdentry) { + /* Root inode uses upper st_ino/i_ino */ + ino = d_inode(upperdentry)->i_ino; + fsid = 0; + ovl_dentry_set_upper_alias(root); + if (ovl_is_impuredir(upperdentry)) + ovl_set_flag(OVL_IMPURE, d_inode(root)); + } + + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root); + ovl_set_upperdata(d_inode(root)); + ovl_inode_init(d_inode(root), &oip, ino, fsid); + ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE); + + return root; +} + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct cred *cred; int err; + sb->s_d_op = &ovl_dentry_operations; + err = -ENOMEM; ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); if (!ofs) @@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_long_set(&ofs->last_ino, 1); /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; @@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= SB_POSIXACL; err = -ENOMEM; - root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + root_dentry = ovl_get_root(sb, upperpath.dentry, oe); if (!root_dentry) goto out_free_oe; - root_dentry->d_fsdata = oe; - mntput(upperpath.mnt); - if (upperpath.dentry) { - ovl_dentry_set_upper_alias(root_dentry); - if (ovl_is_impuredir(upperpath.dentry)) - ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); - } - - /* Root is always merge -> can have whiteouts */ - ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); - ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); - 
ovl_set_upperdata(d_inode(root_dentry)); - ovl_inode_init(d_inode(root_dentry), upperpath.dentry, - ovl_dentry_lower(root_dentry), NULL); sb->s_root = root_dentry; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 042f7eb4f7f4..36b60788ee47 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower) bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_REAL); + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask) +{ + struct ovl_entry *oe = OVL_E(dentry); + unsigned int i, flags = 0; + + if (upperdentry) + flags |= upperdentry->d_flags; + for (i = 0; i < oe->numlower; i++) + flags |= oe->lowerstack[i].dentry->d_flags; + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~mask; + dentry->d_flags |= flags & mask; + spin_unlock(&dentry->d_lock); } bool ovl_dentry_weird(struct dentry *dentry) @@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) oi->redirect = redirect; } -void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *lowerdata) -{ - struct inode *realinode = d_inode(upperdentry ?: lowerdentry); - - if (upperdentry) - OVL_I(inode)->__upperdentry = upperdentry; - if (lowerdentry) - OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); - if (lowerdata) - OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata)); - - ovl_copyattr(realinode, inode); - ovl_copyflags(realinode, inode); - if (!inode->i_ino) - inode->i_ino = realinode->i_ino; -} - void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) { struct inode *upperinode = d_inode(upperdentry); @@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { - if (!inode->i_ino) - inode->i_ino = upperinode->i_ino; inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } diff --git a/fs/pipe.c b/fs/pipe.c index 2144507447c5..16fb72e9abf7 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -146,7 +146,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge_page(page, 0); __SetPageLocked(page); return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 5efaf3708ec6..8e16f14bb05a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { + unsigned long size; + unsigned long resident = 0; + unsigned long shared = 0; + unsigned long text = 0; + unsigned long data = 0; + size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); - } - /* - * For quick read, open code by putting numbers directly - * expected format is - * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - * size, resident, shared, text, data); - */ - seq_put_decimal_ull(m, "", size); - seq_put_decimal_ull(m, " ", resident); - seq_put_decimal_ull(m, " ", shared); - seq_put_decimal_ull(m, " ", text); - seq_put_decimal_ull(m, " ", 0); - 
seq_put_decimal_ull(m, " ", data); - seq_put_decimal_ull(m, " ", 0); - seq_putc(m, '\n'); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); + seq_putc(m, '\n'); + } else { + seq_write(m, "0 0 0 0 0 0 0\n", 14); + } return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index c7c64272b0fa..6042b646ab27 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + int err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); } #ifdef CONFIG_STACKTRACE @@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode, *rgid = gid; } +void proc_pid_evict_inode(struct proc_inode *ei) +{ + struct pid *pid = ei->pid; + + if (S_ISDIR(ei->vfs_inode.i_mode)) { + spin_lock(&pid->lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(&pid->lock); + } + + put_pid(pid); +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; + struct pid *pid; /* We need a new inode */ @@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* * grab the reference to task. 
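The proc_pid_statm() hunk above keeps the historical seven-field format and now writes the all-zero line directly via seq_write() when the task has no mm. A minimal userspace reader of that format, for illustration only:

#include <stdio.h>

int main(void)
{
        unsigned long size, resident, shared, text, lib, data, dt;
        FILE *f = fopen("/proc/self/statm", "r");

        if (!f)
                return 1;
        if (fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
                   &size, &resident, &shared, &text, &lib, &data, &dt) == 7)
                printf("size=%lu pages, resident=%lu pages, data=%lu pages\n",
                       size, resident, data);
        fclose(f);
        return 0;
}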
*/ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) goto out_unlock; + /* Let the pid remember us for quick removal */ + ei->pid = pid; + if (S_ISDIR(mode)) { + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -2861,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = mutex_lock_killable(&task->signal->exec_update_mutex); if (result) return result; @@ -2897,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return result; } @@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .permission = proc_pid_permission, }; -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) -{ - struct dentry *dentry, *leader, *dir; - char buf[10 + 1]; - struct qstr name; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - /* no ->d_hash() rejects on procfs */ - dentry = d_hash_and_lookup(mnt->mnt_root, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - if (pid == tgid) - return; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", tgid); - leader = d_hash_and_lookup(mnt->mnt_root, &name); - if (!leader) - goto out; - - name.name = "task"; - name.len = strlen(name.name); - dir = d_hash_and_lookup(leader, &name); - if (!dir) - goto out_put_leader; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - dentry = d_hash_and_lookup(dir, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - dput(dir); -out_put_leader: - dput(leader); -out: - return; -} - /** - * proc_flush_task - Remove dcache entries for @task from the /proc dcache. - * @task: task that should be flushed. + * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. * - * When flushing dentries from proc, one needs to flush them from global - * proc (proc_mnt) and from all the namespaces' procs this task was seen - * in. This call is supposed to do all of this job. - * - * Looks in the dcache for - * /proc/@pid - * /proc/@tgid/task/@pid - * if either directory is present flushes it and all of it'ts children - * from the dcache. + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be - * flushed instead. This routine is proved to flush those useless - * dcache entries at process exit time. + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee - * that no dcache entries will exist at process exit time it - * just makes it very unlikely that any will persist. + * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. 
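proc_pid_make_inode() above now threads each /proc/<pid> directory inode onto pid->inodes, which is what lets proc_flush_pid() below invalidate them without reconstructing dentry names. An illustrative caller sketch, not taken from this diff; it only assumes the exit path holds a reference to the thread pid:

        struct pid *thread_pid = get_pid(task->thread_pid);

        /* ... detach the task from the pid ... */

        proc_flush_pid(thread_pid);     /* drops the reference via put_pid() */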
*/ -void proc_flush_task(struct task_struct *task) +void proc_flush_pid(struct pid *pid) { - int i; - struct pid *pid, *tgid; - struct upid *upid; - - pid = task_pid(task); - tgid = task_tgid(task); - - for (i = 0; i <= pid->level; i++) { - upid = &pid->numbers[i]; - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid->numbers[i].nr); - } + proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); + put_pid(pid); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index c1dea9b8222e..d0989a443c77 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3faed94e4b65..4ed6dabdf6ff 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file) } static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) { - rb_erase(&de->subdir_node, &parent->subdir); - if (S_ISDIR(de->mode)) { - parent->nlink--; + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } rb_erase(&next->subdir_node, &de->subdir); de = next; continue; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6da18316d209..fb4cace9ea41 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,21 +33,27 @@ static 
void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ - put_pid(PROC_I(inode)->pid); + if (ei->pid) { + proc_pid_evict_inode(ei); + ei->pid = NULL; + } /* Let go of any associated proc directory entry */ - de = PDE(inode); - if (de) + de = ei->pde; + if (de) { pde_put(de); + ei->pde = NULL; + } - head = PROC_I(inode)->sysctl; + head = ei->sysctl; if (head) { - RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } @@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } @@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void) BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct inode *inode; + struct proc_inode *ei; + struct hlist_node *node; + struct super_block *old_sb = NULL; + + rcu_read_lock(); + for (;;) { + struct super_block *sb; + node = hlist_first_rcu(inodes); + if (!node) + break; + ei = hlist_entry(node, struct proc_inode, sibling_inodes); + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } + if (unlikely(!inode)) { + rcu_read_lock(); + continue; + } + + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } + iput(inode); + + rcu_read_lock(); + } + rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); +} + static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; @@ -139,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: @@ -195,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } +static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) +{ + typeof_member(struct proc_ops, proc_lseek) lseek; + + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + return lseek(file, offset, whence); +} + static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_lseek) lseek; - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - rv = lseek(file, offset, whence); + if (pde_is_permanent(pde)) { + return pde_lseek(pde, file, offset, whence); + } else if (use_pde(pde)) { + rv = pde_lseek(pde, file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, 
char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_ops->proc_read; - if (read) - rv = read(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_ops->proc_write; - if (write) - rv = write(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_ops->proc_poll; - if (poll) - rv = poll(file, pts); + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_ops->proc_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - 
typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_ops->proc_mmap; - if (mmap) - rv = mmap(file, vma); + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = -EIO; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - - get_area = pde->proc_ops->proc_get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; @@ -337,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -386,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; + } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 41587276798e..917cc85e3466 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,7 @@ struct proc_dir_entry { struct rb_node subdir_node; char *name; umode_t mode; + u8 flags; u8 
namelen; char inline_name[]; } __randomize_layout; @@ -73,6 +74,11 @@ struct proc_dir_entry { 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -91,7 +97,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct hlist_node sysctl_inodes; + struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; @@ -158,6 +164,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); @@ -210,6 +217,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ec1b7d2fb773..b38ad552887f 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read = kmsg_read, .proc_poll = kmsg_poll, .proc_open = kmsg_open, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c75bb4632ed1..b6f5d459b087 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } -static void proc_sys_prune_dcache(struct ctl_table_header *head) +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { - struct inode *inode; - struct proc_inode *ei; - struct hlist_node *node; - struct super_block *sb; - - rcu_read_lock(); - for (;;) { - node = hlist_first_rcu(&head->inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); - spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sysctl_inodes); - spin_unlock(&sysctl_lock); - - inode = &ei->vfs_inode; - sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) - continue; - inode = igrab(inode); - rcu_read_unlock(); - if (unlikely(!inode)) { - deactivate_super(sb); - rcu_read_lock(); - continue; - } - - d_prune_aliases(inode); - iput(inode); - deactivate_super(sb); - - rcu_read_lock(); - } - rcu_read_unlock(); + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ @@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); } /* - * Prune dentries for unregistered sysctls: namespaced sysctls + * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very 
badly. */ - proc_sys_prune_dcache(p); + proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. @@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, } ei->sysctl = head; ei->sysctl_entry = table; - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); @@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); diff --git a/fs/proc/root.c b/fs/proc/root.c index 608233dfd29c..2633f10446c3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = { .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct proc_fs_context *ctx; - struct fs_context *fc; - struct vfsmount *mnt; - - fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - if (fc->user_ns != ns->user_ns) { - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ns->user_ns); - } - - ctx = fc->fs_private; - if (ctx->pid_ns != ns) { - put_pid_ns(ctx->pid_ns); - get_pid_ns(ns); - ctx->pid_ns = ns; - } - - mnt = fc_mount(fc); - put_fs_context(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0449edf460f5..46b3293015fe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3ba9ae83bff5..8d382d4ec067 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv) -{ - struct mm_struct *mm = priv->mm; - - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); -} - -static struct vm_area_struct * -m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma == priv->tail_vma) - return NULL; - return vma->vm_next ?: priv->tail_vma; -} - -static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) -{ - if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; -} - static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + unsigned long last_addr = *ppos; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned int pos = *ppos; - /* See m_cache_vma(). Zero at the start or after lseek. */ + /* See m_next(). Zero at the start or after lseek. 
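The PROC_ENTRY_PERMANENT additions above (cpuinfo, kmsg, stat) let the proc_reg_* wrappers skip the use_pde()/unuse_pde() refcounting for entries that are registered once at boot and never removed; remove_proc_entry() now WARNs if such an entry is torn down. A hedged sketch of how another always-present entry could opt in (all example_* names are hypothetical):

static int example_show(struct seq_file *m, void *v)
{
        seq_puts(m, "example\n");
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

/* Only safe for an entry that is created once and never removed. */
static const struct proc_ops example_proc_ops = {
        .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = example_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = single_release,
};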
*/ if (last_addr == -1UL) return NULL; @@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (down_read_killable(&mm->mmap_sem)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); - if (last_addr) { - vma = find_vma(mm, last_addr - 1); - if (vma && vma->vm_start <= last_addr) - vma = m_next_vma(priv, vma); - if (vma) - return vma; - } - - m->version = 0; - if (pos < mm->map_count) { - for (vma = mm->mmap; pos; pos--) { - m->version = vma->vm_start; - vma = vma->vm_next; - } + vma = find_vma(mm, last_addr); + if (vma) return vma; - } - - /* we do not bother to update m->version in this case */ - if (pos == mm->map_count && priv->tail_vma) - return priv->tail_vma; - vma_stop(priv); - return NULL; + return priv->tail_vma; } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *next; + struct vm_area_struct *next, *vma = v; + + if (vma == priv->tail_vma) + next = NULL; + else if (vma->vm_next) + next = vma->vm_next; + else + next = priv->tail_vma; + + *ppos = next ? next->vm_start : -1UL; - (*pos)++; - next = m_next_vma(priv, v); - if (!next) - vma_stop(priv); return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(v)) - vma_stop(priv); - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, @@ -363,7 +334,6 @@ done: static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); - m_cache_vma(m, v); return 0; } @@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); - return 0; } @@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); - m_cache_vma(m, vma); return 0; } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 7fbe8f058220..d99b5d39aa90 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -87,11 +87,11 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; + (*pos)++; data->off += REC_SIZE; if (data->off + REC_SIZE > ps->total_size) return NULL; - (*pos)++; return data; } @@ -101,6 +101,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) struct pstore_ftrace_seq_data *data = v; struct pstore_ftrace_record *rec; + if (!data) + return 0; + rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n", diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d896457e7c11..408277ee3cdb 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -823,9 +823,9 @@ static int __init pstore_init(void) ret = pstore_init_fs(); if (ret) - return ret; + 
free_buf_for_compression(); - return 0; + return ret; } late_initcall(pstore_init); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 013486b5125e..795622190c01 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -963,7 +963,6 @@ static void __init ramoops_register_dummy(void) pr_info("could not create platform device: %ld\n", PTR_ERR(dummy)); dummy = NULL; - ramoops_unregister_dummy(); } } diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 1f4d8c06f9be..c917c191e78c 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -34,7 +34,7 @@ struct persistent_ram_buffer { uint32_t sig; atomic_t start; atomic_t size; - uint8_t data[0]; + uint8_t data[]; }; #define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */ diff --git a/fs/read_write.c b/fs/read_write.c index 59d819c5b92e..bbfa9b12b15e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i } #endif -#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ + defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4075e41408b4..5129efc6f2e6 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb, struct item_head *pasted; struct buffer_info bi; - buffer_info_init_right(tb, &bi); + buffer_info_init_right(tb, &bi); leaf_shift_right(tb, tb->rnum[0], tb->rbytes); /* append item in R[0] */ diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 45e1a5d11af3..adb21bea3d60 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we need to make sure nobody is changing the file size beneath us */ -{ - int depth = reiserfs_write_unlock_nested(inode->i_sb); - inode_lock(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); -} + { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + + inode_lock(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } reiserfs_write_lock(inode->i_sb); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 072156c4f895..5c766330e493 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super, int result; dev_t jdev; fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; - char b[BDEVNAME_SIZE]; result = 0; @@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super, result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; reiserfs_warning(super, "sh-458", - "cannot init journal device '%s': %i", - __bdevname(jdev, b), result); + "cannot init journal device unknown-block(%u,%u): %i", + MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 959a066b7bb0..1594687582f0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ INC_DIR_INODE_NLINK(dir) - retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , - old_format_only(dir->i_sb) ? 
- EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode, &security); + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */, + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); if (retval) { DEC_DIR_INODE_NLINK(dir) goto out_failed; @@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) - dir->i_size -= (DEH_SIZE + de.de_entrylen); + dir->i_size -= (DEH_SIZE + de.de_entrylen); reiserfs_update_sd(&th, dir); /* prevent empty directory from getting lost */ diff --git a/fs/seq_file.c b/fs/seq_file.c index 1600034a929b..79781ebd2145 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -68,13 +68,6 @@ int seq_open(struct file *file, const struct seq_operations *op) p->file = file; /* - * Wrappers around seq_open(e.g. swaps_open) need to be - * aware of this. If they set f_version themselves, they - * should call seq_open first and then set f_version. - */ - file->f_version = 0; - - /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. @@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset) int error = 0; void *p; - m->version = 0; m->index = 0; m->count = m->from = 0; if (!offset) @@ -161,25 +153,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) mutex_lock(&m->lock); /* - * seq_file->op->..m_start/m_stop/m_next may do special actions - * or optimisations based on the file->f_version, so we want to - * pass the file->f_version to those methods. - * - * seq_file->version is just copy of f_version, and seq_file - * methods can treat it simply as file version. - * It is copied in first and copied out after all operations. - * It is convenient to have it as part of structure to avoid the - * need of passing another argument to all the seq_file methods. - */ - m->version = file->f_version; - - /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (*ppos == 0) { m->index = 0; - m->version = 0; m->count = 0; } @@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (err) { /* With prejudice... */ m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; goto Done; @@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; - m->version = 0; p = m->op->start(m, &m->index); } m->op->stop(m, p); @@ -287,7 +263,6 @@ Done: *ppos += copied; m->read_pos += copied; } - file->f_version = m->version; mutex_unlock(&m->lock); return copied; Enomem: @@ -313,7 +288,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) loff_t retval = -EINVAL; mutex_lock(&m->lock); - m->version = file->f_version; switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -329,7 +303,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) /* with extreme prejudice... 
*/ file->f_pos = 0; m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; } else { @@ -340,7 +313,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) file->f_pos = offset; } } - file->f_version = m->version; mutex_unlock(&m->lock); return retval; } diff --git a/fs/splice.c b/fs/splice.c index d671936d0aad..4735defc46ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 130fc6fbcc03..26bbf960e2a2 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -558,3 +558,151 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); + +static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, + kgid_t kgid) +{ + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + return kernfs_setattr(kn, &newattrs); +} + +/** + * sysfs_link_change_owner - change owner of a sysfs file. + * @kobj: object of the kernfs_node the symlink is located in. + * @targ: object of the kernfs_node the symlink points to. + * @name: name of the link. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs symlink entry @name under @kobj and changes + * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of + * @targ. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, + const char *name, kuid_t kuid, kgid_t kgid) +{ + struct kernfs_node *kn = NULL; + int error; + + if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) + return -EINVAL; + + error = -ENOENT; + kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); + if (!kn) + goto out; + + error = -EINVAL; + if (kernfs_type(kn) != KERNFS_LINK) + goto out; + if (kn->symlink.target_kn->priv != targ) + goto out; + + error = internal_change_owner(kn, kuid, kgid); + +out: + kernfs_put(kn); + return error; +} + +/** + * sysfs_file_change_owner - change owner of a sysfs file. + * @kobj: object. + * @name: name of the file to change. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs entry @name under @kobj and changes the + * ownership to @kuid/@kgid. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *kn; + int error; + + if (!name) + return -EINVAL; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + kn = kernfs_find_and_get(kobj->sd, name); + if (!kn) + return -ENOENT; + + error = internal_change_owner(kn, kuid, kgid); + + kernfs_put(kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_file_change_owner); + +/** + * sysfs_change_owner - change owner of the given object. + * @kobj: object. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Change the owner of the default directory, files, groups, and attributes of + * @kobj to @kuid/@kgid. 
Note that sysfs_change_owner mirrors how the sysfs + * entries for a kobject are added by driver core. In summary, + * sysfs_change_owner() takes care of the default directory entry for @kobj, + * the default attributes associated with the ktype of @kobj and the default + * attributes associated with the ktype of @kobj. + * Additional properties not added by driver core have to be changed by the + * driver or subsystem which created them. This is similar to how + * driver/subsystem specific entries are removed. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) +{ + int error; + const struct kobj_type *ktype; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + /* Change the owner of the kobject itself. */ + error = internal_change_owner(kobj->sd, kuid, kgid); + if (error) + return error; + + ktype = get_ktype(kobj); + if (ktype) { + struct attribute **kattr; + + /* + * Change owner of the default attributes associated with the + * ktype of @kobj. + */ + for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) { + error = sysfs_file_change_owner(kobj, (*kattr)->name, + kuid, kgid); + if (error) + return error; + } + + /* + * Change owner of the default groups associated with the + * ktype of @kobj. + */ + error = sysfs_groups_change_owner(kobj, ktype->default_groups, + kuid, kgid); + if (error) + return error; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_change_owner); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index c4ab045926b7..64e6a6698935 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -13,6 +13,7 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> +#include <linux/fs.h> #include "sysfs.h" @@ -415,15 +416,18 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** - * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing + * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. + * @symlink_name: The name of the symlink file (target_name will be + * considered if symlink_name is NULL). 
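The kernel-doc above describes the renamed compat_only_sysfs_link_entry_to_kobj(), which gains a fourth argument so callers can name the symlink independently of the target entry. A hypothetical call (the attribute and link names are made up):

        /* Link target_kobj's "state" attribute under kobj as "backing_state";
         * passing NULL for the last argument keeps the old behaviour of
         * reusing the target name. */
        err = compat_only_sysfs_link_entry_to_kobj(kobj, target_kobj,
                                                   "state", "backing_state");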
*/ -int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, - struct kobject *target_kobj, - const char *target_name) +int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, + struct kobject *target_kobj, + const char *target_name, + const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; @@ -448,12 +452,129 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, return -ENOENT; } - link = kernfs_create_link(kobj->sd, target_name, entry); + if (!symlink_name) + symlink_name = target_name; + + link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) - sysfs_warn_dup(kobj->sd, target_name); + sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } -EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); +EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); + +static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, + const struct attribute_group *grp, + struct iattr *newattrs) +{ + struct kernfs_node *kn; + int error; + + if (grp->attrs) { + struct attribute *const *attr; + + for (attr = grp->attrs; *attr; attr++) { + kn = kernfs_find_and_get(grp_kn, (*attr)->name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + if (grp->bin_attrs) { + struct bin_attribute *const *bin_attr; + + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + return 0; +} + +/** + * sysfs_group_change_owner - change owner of an attribute group. + * @kobj: The kobject containing the group. + * @grp: The attribute group. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. + */ +int sysfs_group_change_owner(struct kobject *kobj, + const struct attribute_group *grp, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *grp_kn; + int error; + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (grp->name) { + grp_kn = kernfs_find_and_get(kobj->sd, grp->name); + } else { + kernfs_get(kobj->sd); + grp_kn = kobj->sd; + } + if (!grp_kn) + return -ENOENT; + + error = kernfs_setattr(grp_kn, &newattrs); + if (!error) + error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); + + kernfs_put(grp_kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_group_change_owner); + +/** + * sysfs_groups_change_owner - change owner of a set of attribute groups. + * @kobj: The kobject containing the groups. + * @groups: The attribute groups. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. 
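The fs/sysfs hunks above and below add a family of ownership-change helpers: sysfs_file_change_owner(), sysfs_link_change_owner(), sysfs_group_change_owner(), sysfs_groups_change_owner() and sysfs_change_owner(). A minimal caller-side sketch, not taken from this patch, of how a subsystem whose owning user/group changes might combine them; the kobject, the "queue_len" attribute and the group used here are hypothetical:

/*
 * Illustrative only: sysfs_change_owner() covers what driver core created by
 * default; entries the subsystem added itself still need explicit calls, as
 * the kernel-doc above notes.
 */
static int example_change_owner(struct kobject *kobj,
				const struct attribute_group *grp,
				kuid_t kuid, kgid_t kgid)
{
	int error;

	/* Default directory plus ktype default attributes and groups. */
	error = sysfs_change_owner(kobj, kuid, kgid);
	if (error)
		return error;

	/* An attribute the subsystem created itself ("queue_len" is made up). */
	error = sysfs_file_change_owner(kobj, "queue_len", kuid, kgid);
	if (error)
		return error;

	/* A named group the subsystem created itself. */
	return sysfs_group_change_owner(kobj, grp, kuid, kgid);
}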
+ */ +int sysfs_groups_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid) +{ + int error = 0, i; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); + if (error) + break; + } + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_groups_change_owner); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 8ceb51478800..7e4bfaf2871f 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc) { - int err = -EINVAL, type, node_len; + int err = -EINVAL, type, node_len, dump_node = 1; uint32_t crc, node_crc, magic; const struct ubifs_ch *ch = buf; @@ -278,10 +278,22 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out_len: if (!quiet) ubifs_err(c, "bad node length %d", node_len); + if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ) + dump_node = 0; out: if (!quiet) { ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + if (dump_node) { + ubifs_dump_node(c, buf); + } else { + int safe_len = min3(node_len, c->leb_size - offs, + (int)UBIFS_MAX_DATA_NODE_SZ); + pr_err("\tprevent out-of-bounds memory access\n"); + pr_err("\ttruncated data node length %d\n", safe_len); + pr_err("\tcorrupted data node:\n"); + print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, + buf, safe_len, 0); + } dump_stack(); } return err; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index d49fc04f2d7d..3df9be2c684c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -208,6 +208,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GET_ENCRYPTION_KEY_STATUS: return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + return fscrypt_ioctl_get_nonce(file, (void __user *)arg); + default: return -ENOTTY; } @@ -230,6 +233,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: break; default: return -ENOIOCTLCMD; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 3bf8b1fda9d7..e5ec1afe1c66 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -905,6 +905,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); + kfree(xent); goto out_release; } ubifs_assert(c, ubifs_inode(xino)->xattr); diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index edf43ddd7dce..283f9eb48410 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -157,7 +157,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) int err = 0; ino_t xattr_inum; union ubifs_key key; - struct ubifs_dent_node *xent; + struct ubifs_dent_node *xent, *pxent = NULL; struct fscrypt_name nm = {0}; struct ubifs_orphan *xattr_orphan; struct ubifs_orphan *orphan; @@ -181,11 +181,16 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) xattr_inum = le64_to_cpu(xent->inum); xattr_orphan = orphan_add(c, xattr_inum, orphan); - if (IS_ERR(xattr_orphan)) + if (IS_ERR(xattr_orphan)) { + kfree(xent); return PTR_ERR(xattr_orphan); + } + kfree(pxent); + pxent = xent; key_read(c, 
&xent->key, &key); } + kfree(pxent); return 0; } @@ -688,14 +693,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_key_init(c, &key1, inum); err = ubifs_tnc_lookup(c, &key1, ino); - if (err) + if (err && err != -ENOENT) goto out_free; /* * Check whether an inode can really get deleted. * linkat() with O_TMPFILE allows rebirth of an inode. */ - if (ino->nlink == 0) { + if (err == 0 && ino->nlink == 0) { dbg_rcvry("deleting orphaned inode %lu", (unsigned long)inum); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 3d83be54c474..758efe557a19 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -83,7 +83,7 @@ struct udf_virtual_data { struct udf_bitmap { __u32 s_extPosition; int s_nr_groups; - struct buffer_head *s_block_bitmap[0]; + struct buffer_head *s_block_bitmap[]; }; struct udf_part_map { diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore index 0381e2221480..9b2467e77b2d 100644 --- a/fs/unicode/.gitignore +++ b/fs/unicode/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only mkutf8data utf8data.h diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 37df7c9eedb1..e39fdec8a0b0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pmd_present(_pmd)) goto out; - if (pmd_trans_huge(_pmd)) + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + ret = true; goto out; + } /* * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it @@ -328,12 +331,38 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, */ if (pte_none(*pte)) ret = true; + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; pte_unmap(pte); out: return ret; } +/* Should pair with userfaultfd_signal_pending() */ +static inline long userfaultfd_get_blocking_state(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return TASK_INTERRUPTIBLE; + + if (flags & FAULT_FLAG_KILLABLE) + return TASK_KILLABLE; + + return TASK_UNINTERRUPTIBLE; +} + +/* Should pair with userfaultfd_get_blocking_state() */ +static inline bool userfaultfd_signal_pending(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return signal_pending(current); + + if (flags & FAULT_FLAG_KILLABLE) + return fatal_signal_pending(current); + + return false; +} + /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -355,7 +384,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; - bool must_wait, return_to_userland; + bool must_wait; long blocking_state; /* @@ -462,11 +491,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.ctx = ctx; uwq.waken = false; - return_to_userland = - (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == - (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); - blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE; + blocking_state = userfaultfd_get_blocking_state(vmf->flags); spin_lock_irq(&ctx->fault_pending_wqh.lock); /* @@ -492,8 +517,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) up_read(&mm->mmap_sem); if (likely(must_wait && !READ_ONCE(ctx->released) && - (return_to_userland ? 
!signal_pending(current) : - !fatal_signal_pending(current)))) { + !userfaultfd_signal_pending(vmf->flags))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); ret |= VM_FAULT_MAJOR; @@ -515,8 +539,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); if (READ_ONCE(uwq.waken) || READ_ONCE(ctx->released) || - (return_to_userland ? signal_pending(current) : - fatal_signal_pending(current))) + userfaultfd_signal_pending(vmf->flags)) break; schedule(); } @@ -524,30 +547,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) __set_current_state(TASK_RUNNING); - if (return_to_userland) { - if (signal_pending(current) && - !fatal_signal_pending(current)) { - /* - * If we got a SIGSTOP or SIGCONT and this is - * a normal userland page fault, just let - * userland return so the signal will be - * handled and gdb debugging works. The page - * fault code immediately after we return from - * this function is going to release the - * mmap_sem and it's not depending on it - * (unlike gup would if we were not to return - * VM_FAULT_RETRY). - * - * If a fatal signal is pending we still take - * the streamlined VM_FAULT_RETRY failure path - * and there's no need to retake the mmap_sem - * in such case. - */ - down_read(&mm->mmap_sem); - ret = VM_FAULT_NOPAGE; - } - } - /* * Here we race with the list_del; list_add in * userfaultfd_ctx_read(), however because we don't ever run @@ -1293,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma) +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + /* FIXME: add WP support to hugetlbfs and shmem */ + return vma_is_anonymous(vma) || + ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && + !(vm_flags & VM_UFFD_WP)); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1328,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; - /* - * FIXME: remove the below error constraint by - * implementing the wprotect tracking mode. - */ - ret = -EINVAL; - goto out; - } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1386,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, vm_flags)) goto out_unlock; /* @@ -1414,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (end & (vma_hpagesize - 1)) goto out_unlock; } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; /* * Check that this vma isn't already owned by a @@ -1443,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1498,14 +1495,24 @@ out_unlock: up_write(&mm->mmap_sem); mmput(mm); if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? 
UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : - UFFD_API_RANGE_IOCTLS, - &user_uffdio_register->ioctls)) + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: @@ -1581,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, cur->vm_flags)) goto out_unlock; found = true; @@ -1595,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); /* * Nothing to do: this vma is already registered into this @@ -1730,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1807,6 +1815,53 @@ out: return ret; } +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (READ_ONCE(ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, &uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1888,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; } return ret; } diff --git a/fs/xattr.c b/fs/xattr.c index 90dd78f0eb27..e13265e65871 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, 
GFP_KERNEL); if (!new_xattr) return NULL; @@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: value of the xattr. If %NULL, will remove the attribute. * @size: size of the new xattr * @flags: %XATTR_{CREATE|REPLACE} + * @removed_size: returns size of the removed xattr, -1 if none removed * * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; @@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) + const void *value, size_t size, int flags, + ssize_t *removed_size) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } } @@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, err = -EEXIST; } else if (new_xattr) { list_replace(&xattr->list, &new_xattr->list); + if (removed_size) + *removed_size = xattr->size; } else { list_del(&xattr->list); + if (removed_size) + *removed_size = xattr->size; } goto out; } @@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } + + if (removed_size) + *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) { kfree(xattr->name); - kfree(xattr); + kvfree(xattr); } return err; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index aceca2f9a3db..4f95df476181 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -26,6 +26,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ + xfs_btree_staging.o \ xfs_da_btree.o \ xfs_defer.o \ xfs_dir2.o \ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 08d6beb54f8c..9d84007a5c65 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -231,7 +231,7 @@ xfs_sbblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; @@ -243,7 +243,7 @@ xfs_agfblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -301,7 +301,7 @@ xfs_agflblock_init( uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } @@ -312,7 +312,7 @@ xfs_agiblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -502,7 +502,7 @@ xfs_ag_extend_space( if (error) return error; - agi = XFS_BUF_TO_AGI(bp); + agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); @@ -515,7 +515,7 @@ xfs_ag_extend_space( if (error) return error; - agf = XFS_BUF_TO_AGF(bp); + agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); 
ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); @@ -569,11 +569,11 @@ xfs_ag_get_geometry( memset(ageo, 0, sizeof(*ageo)); ageo->ag_number = agno; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d8053bc96c4d..203e74fa64aa 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -151,7 +151,7 @@ xfs_alloc_lookup_eq( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -171,7 +171,7 @@ xfs_alloc_lookup_ge( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -190,7 +190,7 @@ xfs_alloc_lookup_le( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -198,7 +198,7 @@ static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_private.a.priv.abt.active; + return cur && cur->bc_ag.abt.active; } /* @@ -230,7 +230,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -589,6 +589,7 @@ xfs_agfl_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; /* @@ -614,8 +615,8 @@ xfs_agfl_verify( return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && - be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; } @@ -713,7 +714,7 @@ xfs_alloc_update_counters( struct xfs_buf *agbp, long len) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); @@ -721,7 +722,7 @@ xfs_alloc_update_counters( xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { - xfs_buf_corruption_error(agbp); + xfs_buf_mark_corrupt(agbp); return -EFSCORRUPTED; } @@ -907,7 +908,7 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, *new); return 0; @@ -922,13 +923,13 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + 
ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1026,6 +1027,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_agf *agf = args->agbp->b_addr; int error = 0; xfs_agblock_t fbno = NULLAGBLOCK; xfs_extlen_t flen = 0; @@ -1054,8 +1056,7 @@ xfs_alloc_ag_vextent_small( if (args->minlen != 1 || args->alignment != 1 || args->resv == XFS_AG_RESV_AGFL || - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= - args->minleft)) + be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1079,9 +1080,7 @@ xfs_alloc_ag_vextent_small( } *fbnop = args->agbno = fbno; *flenp = args->len = 1; - if (XFS_IS_CORRUPT(args->mp, - fbno >= be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error; } @@ -1203,6 +1202,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ int error; @@ -1281,8 +1281,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -1353,7 +1352,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; if (count > 0) count--; @@ -1468,7 +1467,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_private.a.priv.abt.active = true; + acur->cnt->bc_ag.abt.active = true; fbcur = acur->cnt; fbinc = false; } @@ -1515,7 +1514,7 @@ xfs_alloc_ag_vextent_lastblock( * maxlen, go to the start of this block, and skip all those smaller * than minlen. */ - if (len || args->alignment > 1) { + if (*len || args->alignment > 1) { acur->cnt->bc_ptrs[0] = 1; do { error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); @@ -1661,6 +1660,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf *agf = args->agbp->b_addr; xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ int error; /* error result */ @@ -1851,8 +1851,7 @@ restart: args->agbno = rbno; if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > - be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error0; } @@ -2424,7 +2423,7 @@ xfs_agfl_reset( struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; ASSERT(pag->pagf_agflreset); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); @@ -2655,7 +2654,7 @@ xfs_alloc_get_freelist( xfs_agblock_t *bnop, /* block address retrieved from freelist */ int btreeblk) /* destination is a AGF btree */ { - xfs_agf_t *agf; /* a.g. 
freespace structure */ + struct xfs_agf *agf = agbp->b_addr; xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ __be32 *agfl_bno; @@ -2667,7 +2666,6 @@ xfs_alloc_get_freelist( /* * Freelist is empty, give up. */ - agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2684,7 +2682,7 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. */ - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); @@ -2745,7 +2743,7 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; - trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); @@ -2783,18 +2781,15 @@ xfs_alloc_put_freelist( xfs_agblock_t bno, /* block being freed */ int btreeblk) /* block came from a AGF btree */ { - xfs_agf_t *agf; /* a.g. freespace structure */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = agbp->b_addr; __be32 *blockp;/* pointer to array entry */ int error; int logflags; - xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ __be32 *agfl_bno; int startoff; - agf = XFS_BUF_TO_AGF(agbp); - mp = tp->t_mountp; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; @@ -2820,7 +2815,7 @@ xfs_alloc_put_freelist( ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); startoff = (char *)blockp - (char *)agflbp->b_addr; @@ -2838,13 +2833,12 @@ xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) return __this_address; } @@ -2858,6 +2852,13 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; + if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + return __this_address; + + if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || + be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || @@ -2869,6 +2870,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return __this_address; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + return __this_address; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. 
growfs ensures we can't @@ -2883,6 +2888,11 @@ xfs_agf_verify( return __this_address; if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_blocks) > + be32_to_cpu(agf->agf_length)) + return __this_address; + + if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return __this_address; @@ -2914,6 +2924,7 @@ xfs_agf_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agf *agf = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agf_verify(bp); @@ -2926,7 +2937,7 @@ xfs_agf_write_verify( return; if (bip) - XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } @@ -2994,7 +3005,7 @@ xfs_alloc_read_agf( return error; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); + agf = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); @@ -3275,6 +3286,7 @@ __xfs_free_extent( struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + struct xfs_agf *agf; int error; unsigned int busy_flags = 0; @@ -3288,6 +3300,7 @@ __xfs_free_extent( error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { error = -EFSCORRUPTED; @@ -3295,9 +3308,7 @@ __xfs_free_extent( } /* validate the extent size is legal now we have the agf locked */ - if (XFS_IS_CORRUPT(mp, - agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) { + if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto err; } @@ -3408,7 +3419,7 @@ xfs_agfl_walk( unsigned int i; int error; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); i = be32_to_cpu(agf->agf_flfirst); /* Nothing to walk in an empty AGFL. 
*/ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7380fbe4a3ff..a851bf77f17b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -236,4 +236,13 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); +static inline __be32 * +xfs_buf_to_agfl_bno( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + return bp->b_addr + sizeof(struct xfs_agfl); + return bp->b_addr; +} + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 279694d73e4e..60c453cb3ee3 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -25,7 +26,7 @@ xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -35,8 +36,8 @@ xfs_allocbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -62,7 +63,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -72,7 +73,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -86,8 +87,8 @@ xfs_allocbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; @@ -113,7 +114,7 @@ xfs_allocbt_update_lastrec( int ptr, int reason) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag; __be32 len; @@ -162,7 +163,7 @@ xfs_allocbt_update_lastrec( pag = xfs_perag_get(cur->bc_mp, seqno); pag->pagf_longest = be32_to_cpu(len); xfs_perag_put(pag); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } STATIC int @@ -226,9 +227,9 @@ xfs_allocbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -471,18 +472,14 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .recs_inorder = xfs_cntbt_recs_inorder, 
}; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_btnum_t btnum) /* btree identifier */ +/* Allocate most of a new allocation btree cursor. */ +STATIC struct xfs_btree_cur * +xfs_allocbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); @@ -495,19 +492,16 @@ xfs_allocbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; if (btnum == XFS_BTNUM_CNT) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_ops = &xfs_cntbt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); cur->bc_ops = &xfs_bnobt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.agno = agno; + cur->bc_ag.abt.active = false; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -516,6 +510,73 @@ xfs_allocbt_init_cursor( } /* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_CNT) + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + else + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + + cur->bc_ag.agbp = agbp; + + return cur; +} + +/* Create a free space btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_allocbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new free space btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. 
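Taken together, xfs_allocbt_stage_cursor() above and the commit helper below let a caller build a replacement free space btree off to the side and only then wire it into the AGF. A rough sketch under that assumption; the record bulk-loading step is deliberately elided because that machinery is not part of these hunks, and the function name here is made up:

static void example_commit_staged_cntbt(struct xfs_mount *mp,
					struct xfs_trans *tp,
					struct xfs_buf *agbp,
					xfs_agnumber_t agno)
{
	struct xbtree_afakeroot afake = { };
	struct xfs_btree_cur *cur;

	/* The staging cursor works against the fake root, not the live AGF. */
	cur = xfs_allocbt_stage_cursor(mp, &afake, agno, XFS_BTNUM_CNT);

	/*
	 * Allocate fresh blocks and bulk-load the new records here; that step
	 * is outside the scope of this sketch.
	 */

	/* Publish the staged root and level count into the AGF. */
	xfs_allocbt_commit_staged_btree(cur, tp, agbp);
	xfs_btree_del_cursor(cur, 0);
}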
+ */ +void +xfs_allocbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + } else { + cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + } +} + +/* * Calculate number of records in an alloc btree block. */ int diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index c9305ebb69f6..047f09f0be3c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size depends on a superblock flag. @@ -48,8 +49,14 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); +void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e6149720ce02..e4fe3dca9883 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -56,33 +56,6 @@ STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); - -STATIC int -xfs_attr_args_init( - struct xfs_da_args *args, - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) -{ - - if (!name) - return -EINVAL; - - memset(args, 0, sizeof(*args)); - args->geo = dp->i_mount->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->dp = dp; - args->flags = flags; - args->name = name; - args->namelen = namelen; - if (args->namelen >= MAXNAMELEN) - return -EFAULT; /* match IRIX behaviour */ - - args->hashval = xfs_da_hashname(args->name, args->namelen); - return 0; -} - int xfs_inode_hasattr( struct xfs_inode *ip) @@ -104,85 +77,60 @@ xfs_inode_hasattr( */ int xfs_attr_get_ilocked( - struct xfs_inode *ip, struct xfs_da_args *args) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - if (!xfs_inode_hasattr(ip)) + if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + + if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) return xfs_attr_leaf_get(args); - else - return xfs_attr_node_get(args); + return xfs_attr_node_get(args); } /* * Retrieve an extended attribute by name, and its value if requested. 
* - * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, - * just an indication whether the attribute exists and the size of the value if - * it exists. The size is returned in @valuelenp, + * If args->valuelen is zero, then the caller does not want the value, just an + * indication whether the attribute exists and the size of the value if it + * exists. The size is returned in args.valuelen. * - * If the attribute is found, but exceeds the size limit set by the caller in - * @valuelenp, return -ERANGE with the size of the attribute that was found in - * @valuelenp. + * If args->value is NULL but args->valuelen is non-zero, allocate the buffer + * for the value after existence of the attribute has been determined. The + * caller always has to free args->value if it is set, no matter if this + * function was successful or not. * - * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after - * existence of the attribute has been determined. On success, return that - * buffer to the caller and leave them to free it. On failure, free any - * allocated buffer and ensure the buffer pointer returned to the caller is - * null. + * If the attribute is found, but exceeds the size limit set by the caller in + * args->valuelen, return -ERANGE with the size of the attribute that was found + * in args->valuelen. */ int xfs_attr_get( - struct xfs_inode *ip, - const unsigned char *name, - size_t namelen, - unsigned char **value, - int *valuelenp, - int flags) + struct xfs_da_args *args) { - struct xfs_da_args args; uint lock_mode; int error; - ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); - - XFS_STATS_INC(ip->i_mount, xs_attr_get); + XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(args->dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, namelen, flags); - if (error) - return error; + args->geo = args->dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* Entirely possible to look up a name which doesn't exist */ - args.op_flags = XFS_DA_OP_OKNOENT; - if (flags & ATTR_ALLOC) - args.op_flags |= XFS_DA_OP_ALLOCVAL; - else - args.value = *value; - args.valuelen = *valuelenp; + args->op_flags = XFS_DA_OP_OKNOENT; - lock_mode = xfs_ilock_attr_map_shared(ip); - error = xfs_attr_get_ilocked(ip, &args); - xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; + lock_mode = xfs_ilock_attr_map_shared(args->dp); + error = xfs_attr_get_ilocked(args); + xfs_iunlock(args->dp, lock_mode); - /* on error, we have to clean up allocated value buffers */ - if (error) { - if (flags & ATTR_ALLOC) { - kmem_free(args.value); - *value = NULL; - } - return error; - } - *value = args.value; - return 0; + return error; } /* @@ -238,7 +186,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). */ - if (!error && (args->flags & ATTR_KERNOTIME) == 0) + if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -336,188 +284,127 @@ xfs_attr_remove_args( return error; } +/* + * Note: If args->value is NULL the attribute will be removed, just like the + * Linux ->setattr API. 
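With this rework, xfs_attr_get() and xfs_attr_set() take a single struct xfs_da_args instead of a long parameter list; create/replace semantics move to args.attr_flags (XATTR_CREATE/XATTR_REPLACE) and the namespace selection to args.attr_filter. A hypothetical caller, using only the fields these hunks reference (the initializer and function name are illustrative, not lifted from the patch):

static int example_set_trusted_xattr(struct xfs_inode *ip, const char *name,
				     void *value, int size)
{
	struct xfs_da_args args = {
		.dp		= ip,
		.attr_filter	= XFS_ATTR_ROOT,	/* trusted namespace */
		.name		= (const unsigned char *)name,
		.namelen	= strlen(name),
		.value		= value,	/* NULL here would remove the attr */
		.valuelen	= size,
	};

	return xfs_attr_set(&args);
}

The remaining xfs_da_args fields (geo, whichfork, hashval, op_flags) are filled in by xfs_attr_set()/xfs_attr_get() themselves, as the hunks below show.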
+ */ int xfs_attr_set( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - unsigned char *value, - int valuelen, - int flags) + struct xfs_da_args *args) { + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; struct xfs_trans_res tres; - int rsvd = (flags & ATTR_ROOT) != 0; + bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; - - XFS_STATS_INC(mp, xs_attr_set); + unsigned int total; if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, namelen, flags); - if (error) - return error; - - args.value = value; - args.valuelen = valuelen; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp); if (error) return error; - /* - * If the inode doesn't have an attribute fork, add one. - * (inode must not be locked when we call this routine) - */ - if (XFS_IFORK_Q(dp) == 0) { - int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); - if (error) - return error; - } - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - - /* - * Root fork attributes can use reserved data blocks for this - * operation if necessary - */ - error = xfs_trans_alloc(mp, &tres, args.total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args); - if (error) - goto out_trans_cancel; - if (!args.trans) { - /* shortform attribute has already been committed */ - goto out_unlock; - } - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* - * Commit the last in the sequence of transactions. + * We have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail as well. */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); -out_unlock: - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + args->op_flags = XFS_DA_OP_OKNOENT; -out_trans_cancel: - if (args.trans) - xfs_trans_cancel(args.trans); - goto out_unlock; -} + if (args->value) { + XFS_STATS_INC(mp, xs_attr_set); -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. 
- */ -int -xfs_attr_remove( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) -{ - struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; - int error; - - XFS_STATS_INC(mp, xs_attr_remove); + args->op_flags |= XFS_DA_OP_ADDNAME; + args->total = xfs_attr_calc_size(args, &local); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return -EIO; + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(struct xfs_attr_sf_hdr) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + args->valuelen); - error = xfs_attr_args_init(&args, dp, name, namelen, flags); - if (error) - return error; + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } - /* - * we have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail. - */ - args.op_flags = XFS_DA_OP_OKNOENT; + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + total = args->total; + } else { + XFS_STATS_INC(mp, xs_attr_remove); - error = xfs_qm_dqattach(dp); - if (error) - return error; + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0, - (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, - &args.trans); + error = xfs_trans_alloc(mp, &tres, total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); if (error) return error; xfs_ilock(dp, XFS_ILOCK_EXCL); - /* - * No need to make quota reservations here. We expect to release some - * blocks not allocate in the common case. - */ - xfs_trans_ijoin(args.trans, dp, 0); - - error = xfs_attr_remove_args(&args); - if (error) - goto out; + xfs_trans_ijoin(args->trans, dp, 0); + if (args->value) { + unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; + + if (rsvd) + quota_flags |= XFS_QMOPT_FORCE_RES; + error = xfs_trans_reserve_quota_nblks(args->trans, dp, + args->total, 0, quota_flags); + if (error) + goto out_trans_cancel; + error = xfs_attr_set_args(args); + if (error) + goto out_trans_cancel; + /* shortform attribute has already been committed */ + if (!args->trans) + goto out_unlock; + } else { + error = xfs_attr_remove_args(args); + if (error) + goto out_trans_cancel; + } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); + xfs_trans_set_sync(args->trans); - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + if (!(args->op_flags & XFS_DA_OP_NOTIME)) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. 
*/ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args->trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: - if (args.trans) - xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; +out_trans_cancel: + if (args->trans) + xfs_trans_cancel(args->trans); + goto out_unlock; } /*======================================================================== @@ -536,10 +423,10 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) trace_xfs_attr_sf_addname(args); retval = xfs_attr_shortform_lookup(args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); if (retval) @@ -549,7 +436,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * that the leaf format add routine won't trip over the attr * not being around. */ - args->flags &= ~ATTR_REPLACE; + args->attr_flags &= ~XATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -602,14 +489,11 @@ xfs_attr_leaf_addname( * the given flags produce an error or call for an atomic rename. */ retval = xfs_attr3_leaf_lookup_int(bp, args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - xfs_trans_brelse(args->trans, bp); - return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_trans_brelse(args->trans, bp); - return retval; - } + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_brelse; + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) + goto out_brelse; trace_xfs_attr_leaf_replace(args); @@ -750,6 +634,9 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return retval; } /* @@ -876,10 +763,10 @@ restart: goto out; blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto out; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) goto out; trace_xfs_attr_node_replace(args); @@ -1011,7 +898,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. */ - args->op_flags |= XFS_DA_OP_INCOMPLETE; + args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 4243b2272642..0d2d05908537 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -21,39 +21,6 @@ struct xfs_attr_list_context; * as possible so as to fit into the literal area of the inode. 
*/ -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ - -#define ATTR_KERNEL_FLAGS \ - (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" }, \ - { ATTR_ALLOC, "ALLOC" } - /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -62,45 +29,16 @@ struct xfs_attr_list_context; #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ /* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. - */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* * Kernel-internal version of the attrlist cursor. 
*/ -typedef struct attrlist_cursor_kern { +struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; +}; /*======================================================================== @@ -112,27 +50,28 @@ typedef struct attrlist_cursor_kern { typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ +struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct xfs_attrlist_cursor_kern cursor; /* position in list */ + void *buffer; /* output buffer */ /* * Abort attribute list iteration if non-zero. Can be used to pass * error values to the xfs_attr_list caller. */ - int seen_enough; + int seen_enough; + bool allow_incomplete; - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +}; /*======================================================================== @@ -143,21 +82,14 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. 
*/ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_attr_list_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - size_t namelen, unsigned char **value, int *valuelenp, - int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, unsigned char *value, int valuelen, int flags); +int xfs_attr_get_ilocked(struct xfs_da_args *args); +int xfs_attr_get(struct xfs_da_args *args); +int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fed537a4353d..863444e2dda7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -445,14 +445,25 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ -/* - * If namespace bits don't match return 0. - * If all match then return 1. - */ -STATIC int -xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +static bool +xfs_attr_match( + struct xfs_da_args *args, + uint8_t namelen, + unsigned char *name, + int flags) { - return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); + if (args->namelen != namelen) + return false; + if (memcmp(args->name, name, namelen) != 0) + return false; + /* + * If we are looking for incomplete entries, show only those, else only + * show complete entries. 
+ */ + if (args->attr_filter != + (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) + return false; + return true; } static int @@ -464,7 +475,7 @@ xfs_attr_copy_value( /* * No copy if all we have to do is get the length */ - if (args->flags & ATTR_KERNOVAL) { + if (!args->valuelen) { args->valuelen = valuelen; return 0; } @@ -477,7 +488,7 @@ xfs_attr_copy_value( return -ERANGE; } - if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + if (!args->value) { args->value = kmem_alloc_large(valuelen, 0); if (!args->value) return -ENOMEM; @@ -526,7 +537,7 @@ xfs_attr_shortform_bytesfit( int offset; /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -593,8 +604,7 @@ xfs_attr_shortform_bytesfit( minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -678,15 +688,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { -#ifdef DEBUG - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - ASSERT(0); -#endif + ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)); } offset = (char *)sfe - (char *)sf; @@ -697,7 +700,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; @@ -749,13 +752,9 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), base += size, i++) { size = XFS_ATTR_SF_ENTSIZE(sfe); - if (sfe->namelen != args->namelen) - continue; - if (memcmp(sfe->nameval, args->name, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - break; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + break; } if (i == end) return -ENOATTR; @@ -816,13 +815,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return -EEXIST; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return -EEXIST; } return -ENOATTR; } @@ -830,9 +825,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) /* * Retrieve the attribute value and length. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. 
Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr_shortform_getvalue( @@ -847,14 +842,10 @@ xfs_attr_shortform_getvalue( sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], - sfe->valuelen); + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return xfs_attr_copy_value(args, + &sfe->nameval[args->namelen], sfe->valuelen); } return -ENOATTR; } @@ -918,7 +909,7 @@ xfs_attr_shortform_to_leaf( nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -1124,7 +1115,7 @@ xfs_attr3_leaf_to_shortform( nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1449,8 +1440,9 @@ xfs_attr3_leaf_add_work( entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); - entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + entry->flags = args->attr_filter; + if (tmp) + entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -2346,7 +2338,7 @@ xfs_attr3_leaf_lookup_int( xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2365,11 +2357,11 @@ xfs_attr3_leaf_lookup_int( break; } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2399,33 +2391,17 @@ xfs_attr3_leaf_lookup_int( /* * GROT: Add code to remove incomplete entries. */ - /* - * If we are looking for INCOMPLETE entries, show only those. - * If we are looking for complete entries, show only those. 
- */ - if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != - !!(entry->flags & XFS_ATTR_INCOMPLETE)) { - continue; - } if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (name_loc->namelen != args->namelen) - continue; - if (memcmp(args->name, name_loc->nameval, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_loc->namelen, + name_loc->nameval, entry->flags)) continue; args->index = probe; return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (name_rmt->namelen != args->namelen) - continue; - if (memcmp(args->name, name_rmt->name, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_rmt->namelen, + name_rmt->name, entry->flags)) continue; args->index = probe; args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); @@ -2444,9 +2420,9 @@ xfs_attr3_leaf_lookup_int( * Get the value associated with an attribute name from a leaf attribute * list structure. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr3_leaf_getvalue( diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 73615b1dd1a8..6dd2d937a42a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -8,7 +8,6 @@ #define __XFS_ATTR_LEAF_H__ struct attrlist; -struct attrlist_cursor_kern; struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 8b7f74b3bea2..01ad7f353e08 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -397,7 +397,7 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); - ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->valuelen != 0); ASSERT(args->rmtvaluelen == args->valuelen); valuelen = args->rmtvaluelen; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9a6d7a84689a..fda13cd7add0 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -193,14 +193,12 @@ xfs_default_attroffset( struct xfs_mount *mp = ip->i_mount; uint offset; - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { + if (mp->m_sb.sb_inodesize == 256) + offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + else offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + ASSERT(offset < XFS_LITINO(mp)); return offset; } @@ -690,7 +688,7 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. 
*/ @@ -727,7 +725,7 @@ xfs_bmap_extents_to_btree( ASSERT(tp->t_firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; + cur->bc_ino.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -980,7 +978,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1178,13 +1176,13 @@ xfs_iread_bmbt_block( { struct xfs_iread_state *ir = priv; struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_buf *bp; struct xfs_bmbt_rec *frp; xfs_extnum_t num_recs; xfs_extnum_t j; - int whichfork = cur->bc_private.b.whichfork; + int whichfork = cur->bc_ino.whichfork; block = xfs_btree_get_block(cur, level, &bp); @@ -1528,7 +1526,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || - (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1818,7 +1816,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1904,7 +1902,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -2025,8 +2023,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_private.b.allocated; - bma->cur->bc_private.b.allocated = 0; + da_new += bma->cur->bc_ino.allocated; + bma->cur->bc_ino.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2573,7 +2571,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; *curp = cur; } @@ -2752,7 +2750,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2955,7 +2953,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -4187,8 +4185,8 @@ xfs_bmapi_allocate( bma->nallocs++; if (bma->cur) - bma->cur->bc_private.b.flags = - bma->wasdel ? 
XFS_BTCUR_BPRV_WASDEL : 0; + bma->cur->bc_ino.flags = + bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4709,7 +4707,7 @@ xfs_bmapi_remap( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } got.br_startoff = bno; @@ -5364,7 +5362,7 @@ __xfs_bunmapi( if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } else cur = NULL; @@ -5620,7 +5618,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5839,7 +5837,7 @@ xfs_bmap_collapse_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { @@ -5956,7 +5954,7 @@ xfs_bmap_insert_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (*next_fsb == NULLFSBLOCK) { @@ -6025,8 +6023,8 @@ del_cursor: * @split_fsb is a block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) @@ -6074,7 +6072,7 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; @@ -6133,7 +6131,7 @@ xfs_bmap_split_extent_at( del_cursor: if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6142,34 +6140,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. 
*/ static bool xfs_bmap_is_update_needed( diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 14d25e0b7d9c..f3259ad5c22c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -222,7 +222,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ffe608d2a2d9..295a59cf8840 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -166,13 +166,13 @@ xfs_bmbt_dup_cursor( struct xfs_btree_cur *new; new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.b.ip, cur->bc_private.b.whichfork); + cur->bc_ino.ip, cur->bc_ino.whichfork); /* * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. */ - new->bc_private.b.flags = cur->bc_private.b.flags; + new->bc_ino.flags = cur->bc_ino.flags; return new; } @@ -183,12 +183,12 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; - src->bc_private.b.allocated = 0; + src->bc_ino.allocated = 0; } STATIC int @@ -205,8 +205,8 @@ xfs_bmbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.fsbno = cur->bc_tp->t_firstblock; - xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -230,7 +230,7 @@ xfs_bmbt_alloc_block( } args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; @@ -259,10 +259,10 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + cur->bc_ino.allocated++; + cur->bc_ino.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); new->l = cpu_to_be64(args.fsbno); @@ -280,12 +280,12 @@ xfs_bmbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); struct xfs_owner_info oinfo; - xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + 
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; @@ -302,8 +302,8 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -320,8 +320,8 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -347,7 +347,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); + return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void @@ -566,11 +566,11 @@ xfs_bmbt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; + cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.ip = ip; + cur->bc_ino.allocated = 0; + cur->bc_ino.flags = 0; + cur->bc_ino.whichfork = whichfork; return cur; } @@ -644,7 +644,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; - cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fd300dc93ca4..2d25bab68764 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" +#include "xfs_btree_staging.h" /* * Cursor allocation zone. @@ -214,7 +215,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); } /* @@ -234,8 +235,8 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork, cur->bc_btnum, + cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork, cur->bc_btnum, level, index); } else { if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), @@ -243,7 +244,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.a.agno, cur->bc_btnum, + cur->bc_ag.agno, cur->bc_btnum, level, index); } @@ -378,10 +379,12 @@ xfs_btree_del_cursor( * allocated indirect blocks' accounting. */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); + cur->bc_ino.allocated == 0); /* * Free the cursor. 
*/ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free((void *)cur->bc_ops); kmem_cache_free(xfs_btree_cur_zone, cur); } @@ -642,6 +645,17 @@ xfs_btree_ptr_addr( ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } +struct xfs_ifork * +xfs_btree_ifork_ptr( + struct xfs_btree_cur *cur) +{ + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + if (cur->bc_flags & XFS_BTREE_STAGING) + return cur->bc_ino.ifake->if_fork; + return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); +} + /* * Get the root block which is stored in the inode. * @@ -652,9 +666,8 @@ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); return (struct xfs_btree_block *)ifp->if_broot; } @@ -881,13 +894,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -945,7 +958,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, agbno); } @@ -1014,7 +1027,7 @@ xfs_btree_ptr_is_null( return ptr->s == cpu_to_be32(NULLAGBLOCK); } -STATIC void +void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1050,7 +1063,7 @@ xfs_btree_get_sibling( } } -STATIC void +void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -1128,7 +1141,7 @@ xfs_btree_init_block( btnum, level, numrecs, owner, 0); } -STATIC void +void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, @@ -1144,9 +1157,9 @@ xfs_btree_init_block_cur( * code. */ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_private.b.ip->i_ino; + owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_private.a.agno; + owner = cur->bc_ag.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, @@ -1220,7 +1233,7 @@ xfs_btree_set_refs( } } -STATIC int +int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, @@ -1280,7 +1293,7 @@ xfs_btree_read_buf_block( /* * Copy keys from one btree block to another. */ -STATIC void +void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, @@ -1308,11 +1321,11 @@ xfs_btree_copy_recs( /* * Copy block pointers from one btree block to another. 
*/ -STATIC void +void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, - union xfs_btree_ptr *src_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); @@ -1393,8 +1406,8 @@ xfs_btree_log_keys( xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1436,8 +1449,8 @@ xfs_btree_log_ptrs( xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1505,8 +1518,8 @@ xfs_btree_log_block( xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1743,10 +1756,10 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && - !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_private.b.ip->i_ino) + cur->bc_ino.ip->i_ino) goto out_bad; /* Did we get the level we were looking for? */ @@ -1762,7 +1775,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -2938,9 +2951,9 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_private.b.ip, + xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), - cur->bc_private.b.whichfork); + cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); @@ -2953,7 +2966,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3105,11 +3118,11 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. 
*/ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ @@ -3455,8 +3468,8 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_private.b.whichfork; - struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_ino.whichfork; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; @@ -3514,8 +3527,8 @@ xfs_btree_kill_iroot( index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { - xfs_iroot_realloc(cur->bc_private.b.ip, index, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, index, + cur->bc_ino.whichfork); block = ifp->if_broot; } @@ -3544,7 +3557,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; @@ -3712,8 +3725,8 @@ xfs_btree_delrec( */ if (level == cur->bc_nlevels - 1) { if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, -1, + cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3eff7c321d43..8626c5a81aad 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_ifork; extern kmem_zone_t *xfs_btree_cur_zone; @@ -177,15 +178,37 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree private information. */ -union xfs_btree_cur_private { - struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; +/* Per-AG btree information. 
*/ +struct xfs_btree_cur_ag { + union { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + }; + xfs_agnumber_t agno; + union { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; + struct { + bool active; /* allocation cursor state */ + } abt; + }; +}; + +/* Btree-in-inode cursor information */ +struct xfs_btree_cur_ino { + struct xfs_inode *ip; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + int allocated; + short forksize; + char whichfork; + char flags; +/* We are converting a delalloc reservation */ +#define XFS_BTCUR_BMBT_WASDEL (1 << 0) + +/* For extent swap, ignore owner check in verifier */ +#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; /* @@ -209,21 +232,9 @@ typedef struct xfs_btree_cur xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { - struct { /* needed for BNO, CNT, INO */ - struct xfs_buf *agbp; /* agf/agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - union xfs_btree_cur_private priv; - } a; - struct { /* needed for BMAP */ - struct xfs_inode *ip; /* pointer to our inode */ - int allocated; /* count of alloced */ - short forksize; /* fork's inode space */ - char whichfork; /* data or attr fork */ - char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ -#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ - } b; - } bc_private; /* per-btree type data */ + struct xfs_btree_cur_ag bc_ag; + struct xfs_btree_cur_ino bc_ino; + }; } xfs_btree_cur_t; /* cursor flags */ @@ -232,6 +243,12 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* + * The root of this btree is a fakeroot structure so that we can stage a btree + * rebuild without leaving it accessible via primary metadata. The ops struct + * is dynamically allocated and must be freed when the cursor is deleted. + */ +#define XFS_BTREE_STAGING (1<<5) #define XFS_BTREE_NOERROR 0 @@ -494,6 +511,7 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Does this cursor point to the last block in the given level? 
*/ static inline bool @@ -512,4 +530,20 @@ xfs_btree_islastblock( return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } +void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, struct xfs_buf **bpp); +void xfs_btree_set_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + int lr); +void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, int numrecs); +void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs); +void xfs_btree_copy_keys(struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, union xfs_btree_key *src_key, + int numkeys); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c new file mode 100644 index 000000000000..f464a7c7cf22 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_btree_staging.h" + +/* + * Staging Cursors and Fake Roots for Btrees + * ========================================= + * + * A staging btree cursor is a special type of btree cursor that callers must + * use to construct a new btree index using the btree bulk loader code. The + * bulk loading code uses the staging btree cursor to abstract the details of + * initializing new btree blocks and filling them with records or key/ptr + * pairs. Regular btree operations (e.g. queries and modifications) are not + * supported with staging cursors, and callers must not invoke them. + * + * Fake root structures contain all the information about a btree that is under + * construction by the bulk loading code. Staging btree cursors point to fake + * root structures instead of the usual AG header or inode structure. + * + * Callers are expected to initialize a fake root structure and pass it into + * the _stage_cursor function for a specific btree type. When bulk loading is + * complete, callers should call the _commit_staged_btree function for that + * specific btree type to commit the new btree into the filesystem. + */ + +/* + * Don't allow staging cursors to be duplicated because they're supposed to be + * kept private to a single thread. + */ +STATIC struct xfs_btree_cur * +xfs_btree_fakeroot_dup_cursor( + struct xfs_btree_cur *cur) +{ + ASSERT(0); + return NULL; +} + +/* + * Don't allow block allocation for a staging cursor, because staging cursors + * do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ +STATIC int +xfs_btree_fakeroot_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Don't allow block freeing for a staging cursor, because staging cursors + * do not support regular btree modifications. 
+ */ +STATIC int +xfs_btree_fakeroot_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* Initialize a pointer to the root block from the fakeroot. */ +STATIC void +xfs_btree_fakeroot_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xbtree_afakeroot *afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + afake = cur->bc_ag.afake; + ptr->s = cpu_to_be32(afake->af_root); +} + +/* + * Bulk Loading for AG Btrees + * ========================== + * + * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the + * staging cursor. Callers should initialize this to zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_afakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* Update the btree root information for a per-AG fake root. */ +STATIC void +xfs_btree_afakeroot_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + afake->af_root = be32_to_cpu(ptr->s); + afake->af_levels += inc; +} + +/* + * Initialize a AG-rooted btree cursor with the given AG btree fake root. + * The btree cursor's bc_ops will be overridden as needed to make the staging + * functionality work. + */ +void +xfs_btree_stage_afakeroot( + struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->set_root = xfs_btree_afakeroot_set_root; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ag.afake = afake; + cur->bc_nlevels = afake->af_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; +} + +/* + * Transform an AG-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_afakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_afakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ag.agbp = agbp; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading for Inode-Rooted Btrees + * ==================================== + * + * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to + * the staging cursor. This structure should be initialized as follows: + * + * - if_fork_size field should be set to the number of bytes available to the + * fork in the inode. + * + * - if_fork should point to a freshly allocated struct xfs_ifork. + * + * - if_format should be set to the appropriate fork type (e.g. + * XFS_DINODE_FMT_BTREE). 
+ * + * All other fields must be zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_ifakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* + * Initialize an inode-rooted btree cursor with the given inode btree fake + * root. The btree cursor's bc_ops will be overridden as needed to make the + * staging functionality work. If new_ops is not NULL, these new ops will be + * passed out to the caller for further overriding. + */ +void +xfs_btree_stage_ifakeroot( + struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ino.ifake = ifake; + cur->bc_nlevels = ifake->if_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; + + if (new_ops) + *new_ops = nops; +} + +/* + * Transform an inode-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_ifakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_ifakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ino.ifake = NULL; + cur->bc_ino.whichfork = whichfork; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading of Staged Btrees + * ============================= + * + * This interface is used with a staged btree cursor to create a totally new + * btree with a large number of records (i.e. more than what would fit in a + * single root block). When the creation is complete, the new root can be + * linked atomically into the filesystem by committing the staged cursor. + * + * Creation of a new btree proceeds roughly as follows: + * + * The first step is to initialize an appropriate fake btree root structure and + * then construct a staged btree cursor. Refer to the block comments about + * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for + * more information about how to do this. + * + * The second step is to initialize a struct xfs_btree_bload context as + * documented in the structure definition. + * + * The third step is to call xfs_btree_bload_compute_geometry to compute the + * height of and the number of blocks needed to construct the btree. See the + * section "Computing the Geometry of the New Btree" for details about this + * computation. + * + * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and + * save them for later use by ->claim_block(). 
Bulk loading requires all + * blocks to be allocated beforehand to avoid ENOSPC failures midway through a + * rebuild, and to minimize seek distances of the new btree. + * + * Step five is to call xfs_btree_bload() to start constructing the btree. + * + * The final step is to commit the staging btree cursor, which logs the new + * btree root and turns the staging cursor into a regular cursor. The caller + * is responsible for cleaning up the previous btree blocks, if any. + * + * Computing the Geometry of the New Btree + * ======================================= + * + * The number of items placed in each btree block is computed via the following + * algorithm: For leaf levels, the number of items for the level is nr_records + * in the bload structure. For node levels, the number of items for the level + * is the number of blocks in the next lower level of the tree. For each + * level, the desired number of items per block is defined as: + * + * desired = max(minrecs, maxrecs - slack factor) + * + * The number of blocks for the level is defined to be: + * + * blocks = floor(nr_items / desired) + * + * Note this is rounded down so that the npb calculation below will never fall + * below minrecs. The number of items that will actually be loaded into each + * btree block is defined as: + * + * npb = nr_items / blocks + * + * Some of the leftmost blocks in the level will contain one extra record as + * needed to handle uneven division. If the number of records in any block + * would exceed maxrecs for that level, blocks is incremented and npb is + * recalculated. + * + * In other words, we compute the number of blocks needed to satisfy a given + * loading level, then spread the items as evenly as possible. + * + * The height and number of fs blocks required to create the btree are computed + * and returned via btree_height and nr_blocks. + */ + +/* + * Put a btree block that we're loading onto the ordered list and release it. + * The btree blocks will be written to disk when bulk loading is finished. + */ +static void +xfs_btree_bload_drop_buf( + struct list_head *buffers_list, + struct xfs_buf **bpp) +{ + if (*bpp == NULL) + return; + + if (!xfs_buf_delwri_queue(*bpp, buffers_list)) + ASSERT(0); + + xfs_buf_relse(*bpp); + *bpp = NULL; +} + +/* + * Allocate and initialize one btree block for bulk loading. + * + * The new btree block will have its level and numrecs fields set to the values + * of the level and nr_this_block parameters, respectively. + * + * The caller should ensure that ptrp, bpp, and blockp refer to the left + * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp + * will all point to the new block. + */ +STATIC int +xfs_btree_bload_prep_block( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + unsigned int level, + unsigned int nr_this_block, + union xfs_btree_ptr *ptrp, /* in/out */ + struct xfs_buf **bpp, /* in/out */ + struct xfs_btree_block **blockp, /* in/out */ + void *priv) +{ + union xfs_btree_ptr new_ptr; + struct xfs_buf *new_bp; + struct xfs_btree_block *new_block; + int ret; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + size_t new_size; + + ASSERT(*bpp == NULL); + + /* Allocate a new incore btree root block. 
*/ + new_size = bbl->iroot_size(cur, nr_this_block, priv); + ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot_bytes = (int)new_size; + ifp->if_flags |= XFS_IFBROOT; + + /* Initialize it and send it out. */ + xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + XFS_BUF_DADDR_NULL, cur->bc_btnum, level, + nr_this_block, cur->bc_ino.ip->i_ino, + cur->bc_flags); + + *bpp = NULL; + *blockp = ifp->if_broot; + xfs_btree_set_ptr_null(cur, ptrp); + return 0; + } + + /* Claim one of the caller's preallocated blocks. */ + xfs_btree_set_ptr_null(cur, &new_ptr); + ret = bbl->claim_block(cur, &new_ptr, priv); + if (ret) + return ret; + + ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); + + ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp); + if (ret) + return ret; + + /* + * The previous block (if any) is the left sibling of the new block, + * so set its right sibling pointer to the new block and drop it. + */ + if (*blockp) + xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); + xfs_btree_bload_drop_buf(buffers_list, bpp); + + /* Initialize the new btree block. */ + xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); + xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB); + + /* Set the out parameters. */ + *bpp = new_bp; + *blockp = new_block; + xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1); + return 0; +} + +/* Load one leaf block. */ +STATIC int +xfs_btree_bload_leaf( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + xfs_btree_bload_get_record_fn get_record, + struct xfs_btree_block *block, + void *priv) +{ + unsigned int j; + int ret; + + /* Fill the leaf block with records. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_rec *block_rec; + + ret = get_record(cur, priv); + if (ret) + return ret; + block_rec = xfs_btree_rec_addr(cur, j, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return 0; +} + +/* + * Load one node block with key/ptr pairs. + * + * child_ptr must point to a block within the next level down in the tree. A + * key/ptr entry will be created in the new node block to the block pointed to + * by child_ptr. On exit, child_ptr points to the next block on the child + * level that needs processing. + */ +STATIC int +xfs_btree_bload_node( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + union xfs_btree_ptr *child_ptr, + struct xfs_btree_block *block) +{ + unsigned int j; + int ret; + + /* Fill the node block with keys and pointers. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_key child_key; + union xfs_btree_ptr *block_ptr; + union xfs_btree_key *block_key; + struct xfs_btree_block *child_block; + struct xfs_buf *child_bp; + + ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); + + ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + &child_bp); + if (ret) + return ret; + + block_ptr = xfs_btree_ptr_addr(cur, j, block); + xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1); + + block_key = xfs_btree_key_addr(cur, j, block); + xfs_btree_get_keys(cur, child_block, &child_key); + xfs_btree_copy_keys(cur, block_key, &child_key, 1); + + xfs_btree_get_sibling(cur, child_block, child_ptr, + XFS_BB_RIGHTSIB); + xfs_buf_relse(child_bp); + } + + return 0; +} + +/* + * Compute the maximum number of records (or keyptrs) per block that we want to + * install at this level in the btree. Caller is responsible for having set + * @cur->bc_ino.forksize to the desired fork size, if appropriate. 
+ */ +STATIC unsigned int +xfs_btree_bload_max_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int ret; + + if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) + return cur->bc_ops->get_dmaxrecs(cur, level); + + ret = cur->bc_ops->get_maxrecs(cur, level); + if (level == 0) + ret -= bbl->leaf_slack; + else + ret -= bbl->node_slack; + return ret; +} + +/* + * Compute the desired number of records (or keyptrs) per block that we want to + * install at this level in the btree, which must be somewhere between minrecs + * and max_npb. The caller is free to install fewer records per block. + */ +STATIC unsigned int +xfs_btree_bload_desired_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); + + /* Root blocks are not subject to minrecs rules. */ + if (level == cur->bc_nlevels - 1) + return max(1U, npb); + + return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); +} + +/* + * Compute the number of records to be stored in each block at this level and + * the number of blocks for this level. For leaf levels, we must populate an + * empty root block even if there are no records, so we have to have at least + * one block. + */ +STATIC void +xfs_btree_bload_level_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level, + uint64_t nr_this_level, + unsigned int *avg_per_block, + uint64_t *blocks, + uint64_t *blocks_with_extra) +{ + uint64_t npb; + uint64_t dontcare; + unsigned int desired_npb; + unsigned int maxnr; + + maxnr = cur->bc_ops->get_maxrecs(cur, level); + + /* + * Compute the number of blocks we need to fill each block with the + * desired number of records/keyptrs per block. Because desired_npb + * could be minrecs, we use regular integer division (which rounds + * the block count down) so that in the next step the effective # of + * items per block will never be less than desired_npb. + */ + desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); + *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); + *blocks = max(1ULL, *blocks); + + /* + * Compute the number of records that we will actually put in each + * block, assuming that we want to spread the records evenly between + * the blocks. Take care that the effective # of items per block (npb) + * won't exceed maxrecs even for the blocks that get an extra record, + * since desired_npb could be maxrecs, and in the previous step we + * rounded the block count down. + */ + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { + (*blocks)++; + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + } + + *avg_per_block = min_t(uint64_t, npb, nr_this_level); + + trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, + *avg_per_block, desired_npb, *blocks, + *blocks_with_extra); +} + +/* + * Ensure a slack value is appropriate for the btree. + * + * If the slack value is negative, set slack so that we fill the block to + * halfway between minrecs and maxrecs. Make sure the slack is never so large + * that we can underflow minrecs. 
+ */ +static void +xfs_btree_bload_ensure_slack( + struct xfs_btree_cur *cur, + int *slack, + int level) +{ + int maxr; + int minr; + + maxr = cur->bc_ops->get_maxrecs(cur, level); + minr = cur->bc_ops->get_minrecs(cur, level); + + /* + * If slack is negative, automatically set slack so that we load the + * btree block approximately halfway between minrecs and maxrecs. + * Generally, this will net us 75% loading. + */ + if (*slack < 0) + *slack = maxr - ((maxr + minr) >> 1); + + *slack = min(*slack, maxr - minr); +} + +/* + * Prepare a btree cursor for a bulk load operation by computing the geometry + * fields in bbl. Caller must ensure that the btree cursor is a staging + * cursor. This function can be called multiple times. + */ +int +xfs_btree_bload_compute_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + uint64_t nr_records) +{ + uint64_t nr_blocks = 0; + uint64_t nr_this_level; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Make sure that the slack values make sense for traditional leaf and + * node blocks. Inode-rooted btrees will return different minrecs and + * maxrecs values for the root block (level == bc_nlevels - 1). We're + * checking levels 0 and 1 here, so set bc_nlevels such that the btree + * code doesn't interpret either as the root level. + */ + cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1; + xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0); + xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); + + bbl->nr_records = nr_this_level = nr_records; + for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { + uint64_t level_blocks; + uint64_t dontcare64; + unsigned int level = cur->bc_nlevels - 1; + unsigned int avg_per_block; + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &level_blocks, &dontcare64); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * If all the items we want to store at this level + * would fit in the inode root block, then we have our + * btree root and are done. + * + * Note that bmap btrees forbid records in the root. + */ + if (level != 0 && nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* + * Otherwise, we have to store all the items for this + * level in traditional btree blocks and therefore need + * another level of btree to point to those blocks. + * + * We have to re-compute the geometry for each level of + * an inode-rooted btree because the geometry differs + * between a btree root in an inode fork and a + * traditional btree block. + * + * This distinction is made in the btree code based on + * whether level == bc_nlevels - 1. Based on the + * previous root block size check against the root + * block geometry, we know that we aren't yet ready to + * populate the root. Increment bc_nlevels and + * recalculate the geometry for a traditional + * block-based btree level. + */ + cur->bc_nlevels++; + xfs_btree_bload_level_geometry(cur, bbl, level, + nr_this_level, &avg_per_block, + &level_blocks, &dontcare64); + } else { + /* + * If all the items we want to store at this level + * would fit in a single root block, we're done. + */ + if (nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* Otherwise, we need another level of btree.
*/ + cur->bc_nlevels++; + } + + nr_blocks += level_blocks; + nr_this_level = level_blocks; + } + + if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + bbl->nr_blocks = nr_blocks - 1; + else + bbl->nr_blocks = nr_blocks; + return 0; +} + +/* Bulk load a btree given the parameters and geometry established in bbl. */ +int +xfs_btree_bload( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + void *priv) +{ + struct list_head buffers_list; + union xfs_btree_ptr child_ptr; + union xfs_btree_ptr ptr; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block = NULL; + uint64_t nr_this_level = bbl->nr_records; + uint64_t blocks; + uint64_t i; + uint64_t blocks_with_extra; + uint64_t total_blocks = 0; + unsigned int avg_per_block; + unsigned int level = 0; + int ret; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + INIT_LIST_HEAD(&buffers_list); + cur->bc_nlevels = bbl->btree_height; + xfs_btree_set_ptr_null(cur, &child_ptr); + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each leaf block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + /* + * Due to rounding, btree blocks will not be evenly populated + * in most cases. blocks_with_extra tells us how many blocks + * will receive an extra record to distribute the excess across + * the current level as evenly as possible. + */ + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, + nr_this_block, &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, + nr_this_block); + + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + block, priv); + if (ret) + goto out; + + /* + * Record the leftmost leaf pointer so we know where to start + * with the first node level. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + + /* Populate the internal btree nodes. */ + for (level = 1; level < cur->bc_nlevels; level++) { + union xfs_btree_ptr first_ptr; + + nr_this_level = blocks; + block = NULL; + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each node block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, + &buffers_list, level, nr_this_block, + &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, + &ptr, nr_this_block); + + ret = xfs_btree_bload_node(cur, nr_this_block, + &child_ptr, block); + if (ret) + goto out; + + /* + * Record the leftmost node pointer so that we know + * where to start the next node level above this one. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); + } + + /* Initialize the new root. 
*/ + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + cur->bc_ino.ifake->if_levels = cur->bc_nlevels; + cur->bc_ino.ifake->if_blocks = total_blocks - 1; + } else { + cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); + cur->bc_ag.afake->af_levels = cur->bc_nlevels; + cur->bc_ag.afake->af_blocks = total_blocks; + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + ret = xfs_buf_delwri_submit(&buffers_list); + if (ret) + goto out; + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + ret = -EIO; + } + +out: + xfs_buf_delwri_cancel(&buffers_list); + if (bp) + xfs_buf_relse(bp); + return ret; +} diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h new file mode 100644 index 000000000000..643f0f9b2994 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_BTREE_STAGING_H__ +#define __XFS_BTREE_STAGING_H__ + +/* Fake root for an AG-rooted btree. */ +struct xbtree_afakeroot { + /* AG block number of the new btree root. */ + xfs_agblock_t af_root; + + /* Height of the new btree. */ + unsigned int af_levels; + + /* Number of blocks used by the btree. */ + unsigned int af_blocks; +}; + +/* Cursor interactions with fake roots for AG-rooted btrees. */ +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake); +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + +/* Fake root for an inode-rooted btree. */ +struct xbtree_ifakeroot { + /* Fake inode fork. */ + struct xfs_ifork *if_fork; + + /* Number of blocks used by the btree. */ + int64_t if_blocks; + + /* Height of the new btree. */ + unsigned int if_levels; + + /* Number of bytes available for this fork in the inode. */ + unsigned int if_fork_size; + + /* Fork format. */ + unsigned int if_format; + + /* Number of records. */ + unsigned int if_extents; +}; + +/* Cursor interactions with fake roots for inode-rooted btrees. */ +void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops); +void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + int whichfork, const struct xfs_btree_ops *ops); + +/* Bulk loading of staged btrees. */ +typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, void *priv); +typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, + unsigned int nr_this_level, void *priv); + +struct xfs_btree_bload { + /* + * This function will be called nr_records times to load records into + * the btree. The function does this by setting the cursor's bc_rec + * field in in-core format. Records must be returned in sort order. + */ + xfs_btree_bload_get_record_fn get_record; + + /* + * This function will be called nr_blocks times to obtain a pointer + * to a new btree block on disk. Callers must preallocate all space + * for the new btree before calling xfs_btree_bload, and this function + * is what claims that reservation.
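The claim_block contract documented above lends itself to a similar sketch (hypothetical, not part of this patch): the callback simply hands out the next block of the space the caller reserved before starting the load, formatted here as an AG block pointer for an AG-rooted btree:

/* Hypothetical preallocation context; illustrative only. */
struct xrep_newbt {
	xfs_agblock_t		next_agbno;	/* next reserved block */
	xfs_extlen_t		nr_reserved;	/* reserved blocks remaining */
};

/* Sketch of an xfs_btree_bload_claim_block_fn for an AG-rooted btree. */
STATIC int
xrep_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_newbt	*xnr = priv;

	if (xnr->nr_reserved == 0)
		return -ENOSPC;

	/* Hand out the next block of the space reserved up front. */
	ptr->s = cpu_to_be32(xnr->next_agbno++);
	xnr->nr_reserved--;
	return 0;
}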
+ */ + xfs_btree_bload_claim_block_fn claim_block; + + /* + * This function should return the size of the in-core btree root + * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * types. + */ + xfs_btree_bload_iroot_size_fn iroot_size; + + /* + * The caller should set this to the number of records that will be + * stored in the new btree. + */ + uint64_t nr_records; + + /* + * Number of free records to leave in each leaf block. If the caller + * sets this to -1, the slack value will be calculated to be halfway + * between maxrecs and minrecs. This typically leaves the block 75% + * full. Note that slack values are not enforced on inode root blocks. + */ + int leaf_slack; + + /* + * Number of free key/ptr pairs to leave in each node block. This + * field has the same semantics as leaf_slack. + */ + int node_slack; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * number of btree blocks needed to store nr_records records. + */ + uint64_t nr_blocks; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * height of the new btree. + */ + unsigned int btree_height; +}; + +int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, uint64_t nr_records); +int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl, + void *priv); + +#endif /* __XFS_BTREE_STAGING_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 875e04f82541..897749c41f36 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -590,7 +590,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -603,7 +603,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -1624,7 +1624,7 @@ xfs_da3_node_lookup_int( } if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1639,7 +1639,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out!
*/ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1647,7 +1647,7 @@ xfs_da3_node_lookup_int( if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } else expected_level--; @@ -1986,7 +1986,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, blk->bp->b_addr); @@ -2520,8 +2521,10 @@ xfs_dabuf_map( */ if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); - if (!map) + if (!map) { + error = -ENOMEM; goto out_free_irecs; + } *mapp = map; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0f4fbb0889ff..53e503b6f186 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -57,9 +57,10 @@ typedef struct xfs_da_args { const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ - uint8_t *value; /* set of bytes (maybe contain NULLs) */ + void *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ - int flags; /* argument flags (eg: ATTR_NOCREATE) */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ xfs_dahash_t hashval; /* hash value of name */ xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ @@ -88,8 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ -#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ +#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -97,8 +97,7 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ - { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } + { XFS_DA_OP_NOTIME, "NOTIME" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 734837a9b51a..08c0a4d98b89 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -692,19 +692,7 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. 
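With the old catch-all flags word gone, attribute callers now describe the on-disk namespace and the userspace create/replace semantics separately, via attr_filter and attr_flags. A hedged illustration of how a set operation might populate the new fields (the surrounding ip/name/value variables are hypothetical; this is not code from the patch):

	struct xfs_da_args	args = {
		.dp		= ip,			/* inode being modified */
		.geo		= ip->i_mount->m_attr_geo,
		.whichfork	= XFS_ATTR_FORK,
		.name		= name,
		.namelen	= namelen,
		.value		= value,		/* now a void pointer */
		.valuelen	= valuelen,
		.attr_filter	= XFS_ATTR_ROOT,	/* on-disk namespace bit */
		.attr_flags	= XATTR_CREATE,		/* fail if it already exists */
	};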
- */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) /* * Alignment for namelist and valuelist entries (since they are mixed diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index d6ced59b9567..1dbf2f980a26 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index b9eba8213180..375b3edb2ad2 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( @@ -403,12 +419,24 @@ xfs_dir3_data_read( unsigned int flags, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. 
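For context, the owner value checked by the new helpers lives in the common v5 directory block header; the generic buffer verifier cannot validate it because it has no inode in hand, which is why the check is done in the read helpers instead. The header layout is roughly the following (paraphrased from xfs_da_format.h for reference, not part of this patch):

struct xfs_dir3_blk_hdr {
	__be32		magic;	/* magic number */
	__be32		crc;	/* CRC of block */
	__be64		blkno;	/* first block of the buffer */
	__be64		lsn;	/* sequence number of last write */
	uuid_t		uuid;	/* filesystem we belong to */
	__be64		owner;	/* inode that owns the block */
};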
*/ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a131b520aac7..95d2a3f92d75 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1383,7 +1383,7 @@ xfs_dir2_leaf_removename( ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index a0cc5e240306..6ac4aad98cd7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -194,6 +194,8 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -226,8 +228,9 @@ __xfs_dir3_free_read( /* Check things that we can't do in the verifier. */ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; } @@ -439,7 +442,7 @@ xfs_dir2_leaf_to_node( ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_d.di_size / args->geo->blksize) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } @@ -513,7 +516,7 @@ xfs_dir2_leafn_add( * into other peoples memory */ if (index < 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 77e9fa385980..045556e78ee2 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } +/* + * v5 file systems support V3 inodes only, earlier file systems support + * v2 and v1 inodes. 
+ */ +static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, + uint8_t version) +{ + if (xfs_sb_version_has_v3inode(sbp)) + return version == 3; + return version == 1 || version == 2; +} + static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; @@ -560,7 +577,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -707,7 +723,6 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) /* * Size of the unlinked inode hash table in the agi. @@ -775,7 +790,6 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) /* * The third a.g. block contains the a.g. freelist, an array @@ -783,21 +797,15 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) +#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) - -typedef struct xfs_agfl { +struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ -} __attribute__((packed)) xfs_agfl_t; +} __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -946,8 +954,12 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(sbp) \ + (xfs_sb_version_has_v3inode(sbp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) /* * Inode data & attribute fork sizes, per inode. @@ -956,13 +968,9 @@ enum xfs_dinode_fmt { #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? 
\ XFS_DFORK_DSIZE(dip, mp) : \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ef95ca07d084..245188e4f6d3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -568,10 +568,40 @@ typedef struct xfs_fsop_setdm_handlereq { struct fsdmidata __user *data; /* DMAPI data */ } xfs_fsop_setdm_handlereq_t; +/* + * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. + * + * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix. + */ +#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */ +#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */ +#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */ + typedef struct xfs_attrlist_cursor { __u32 opaque[4]; } xfs_attrlist_cursor_t; +/* + * Define how lists of attribute names are returned to userspace from the + * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the + * beginning of the returned buffer, and a each entry in al_offset contains the + * relative offset of an xfs_attrlist_ent containing the actual entry. + * + * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and + * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr. + */ +struct xfs_attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +}; + +struct xfs_attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +}; + typedef struct xfs_fsop_attrlist_handlereq { struct xfs_fsop_handlereq hreq; /* handle interface structure */ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ @@ -589,7 +619,7 @@ typedef struct xfs_attr_multiop { void __user *am_attrname; void __user *am_attrvalue; __u32 am_length; - __u32 am_flags; + __u32 am_flags; /* XFS_IOC_ATTR_* */ } xfs_attr_multiop_t; typedef struct xfs_fsop_attrmulti_handlereq { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index bf161e930f1d..7fcf62b324b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,7 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -177,7 +177,7 @@ xfs_inobt_insert( xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; @@ -304,7 +304,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -339,7 +339,7 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = xfs_dinode_size(version); + uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -525,7 +525,7 @@ xfs_inobt_insert_sprec( bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; @@ -658,7 +658,7 @@ xfs_ialloc_ag_alloc( * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ - agi = XFS_BUF_TO_AGI(agbp); + agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + @@ -1130,7 +1130,7 @@ xfs_dialloc_ag_inobt( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1583,7 +1583,7 @@ xfs_dialloc_ag( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1943,7 +1943,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_perag *pag; struct xfs_btree_cur *cur; @@ -2079,7 +2079,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2489,9 +2489,8 @@ xfs_ialloc_log_agi( sizeof(xfs_agi_t) }; #ifdef DEBUG - xfs_agi_t *agi; /* allocation group header */ + struct xfs_agi *agi = bp->b_addr; - agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif @@ -2523,14 +2522,13 @@ xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int i; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } @@ -2593,6 +2591,7 @@ xfs_agi_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); @@ -2605,7 +2604,7 @@ xfs_agi_write_verify( return; if (bip) - XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } @@ -2661,7 +2660,7 @@ 
xfs_ialloc_read_agi( if (error) return error; - agi = XFS_BUF_TO_AGI(*bpp); + agi = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); @@ -2873,7 +2872,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b82992f795aa..b2c122ad8f0e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" @@ -20,7 +21,6 @@ #include "xfs_trans.h" #include "xfs_rmap.h" - STATIC int xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, @@ -34,7 +34,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -44,8 +44,8 @@ xfs_inobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_root = nptr->s; be32_add_cpu(&agi->agi_level, inc); @@ -58,8 +58,8 @@ xfs_finobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_free_root = nptr->s; be32_add_cpu(&agi->agi_free_level, inc); @@ -83,7 +83,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -212,9 +212,9 @@ xfs_inobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -224,9 +224,9 @@ xfs_finobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Allocate a new inode btree cursor. + * Initialize a new inode btree cursor. 
*/ -struct xfs_btree_cur * /* new inode btree cursor */ -xfs_inobt_init_cursor( +static struct xfs_btree_cur * +xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agi structure */ xfs_agnumber_t agno, /* allocation group number */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; if (btnum == XFS_BTNUM_INO) { - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_ops = &xfs_inobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); + cur->bc_ops = &xfs_inobt_ops; } else { - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); - cur->bc_ops = &xfs_finobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); + cur->bc_ops = &xfs_finobt_ops; } cur->bc_blocklog = mp->m_sb.sb_blocklog; @@ -433,12 +428,75 @@ xfs_inobt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; + return cur; +} + +/* Create an inode btree cursor. */ +struct xfs_btree_cur * +xfs_inobt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agbp->b_addr; + cur = xfs_inobt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_INO) + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + else + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create an inode btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_inobt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new inobt btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_inobt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agi *agi = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + if (cur->bc_btnum == XFS_BTNUM_INO) { + agi->agi_root = cpu_to_be32(afake->af_root); + agi->agi_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + } else { + agi->agi_free_root = cpu_to_be32(afake->af_root); + agi->agi_free_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | + XFS_AGI_FREE_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + } +} + /* * Calculate number of records in an inobt btree block. 
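Putting the staging pieces together, a rebuild of the inode btree would roughly follow the sequence below. This is a hedged sketch of the calling convention only: the xrep_ibt_* callbacks and the space reservation step are hypothetical, and error handling is abbreviated.

STATIC int
xrep_rebuild_inobt(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agnumber_t		agno,
	uint64_t		nr_records,
	void			*priv)		/* callback context */
{
	struct xbtree_afakeroot	afake = { 0 };
	struct xfs_btree_bload	bbl = {
		.get_record	= xrep_ibt_get_record,	/* hypothetical */
		.claim_block	= xrep_ibt_claim_block,	/* hypothetical */
		.leaf_slack	= -1,	/* default: ~75% full blocks */
		.node_slack	= -1,
	};
	struct xfs_btree_cur	*cur;
	int			error;

	/* Stage a cursor against the fake root; no AGI fields touched yet. */
	cur = xfs_inobt_stage_cursor(mp, &afake, agno, XFS_BTNUM_INO);

	/* Work out the height and block count for nr_records records. */
	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
	if (error)
		goto out_cur;

	/* The caller would reserve bbl.nr_blocks blocks of space here. */

	/* Write the new btree; claim_block hands out the reserved blocks. */
	error = xfs_btree_bload(cur, &bbl, priv);
	if (error)
		goto out_cur;

	/* Swap the new root into the AGI and log it. */
	xfs_inobt_commit_staged_btree(cur, tp, agbp);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}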
*/ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 951305ecaae1..35bbd978c272 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -48,6 +48,9 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8afacfe4be0a..39c5a6e24915 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -44,17 +44,6 @@ xfs_inobp_check( } #endif -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -93,7 +82,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(mp, dip->di_version) && + xfs_dinode_good_version(&mp->m_sb, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -205,16 +194,14 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. 
*/ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); to->di_projid = 0; - to->di_version = 2; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | @@ -222,8 +209,8 @@ xfs_inode_from_disk( } to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -252,7 +239,7 @@ xfs_inode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - if (to->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); @@ -274,10 +261,9 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); @@ -303,7 +289,8 @@ xfs_inode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); @@ -315,6 +302,7 @@ xfs_inode_to_disk( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -428,7 +416,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -454,7 +442,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -629,10 +617,9 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && + xfs_sb_version_has_v3inode(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; return 0; } @@ -674,7 +661,6 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. 
*/ - ip->i_d.di_version = dip->di_version; VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); @@ -688,7 +674,6 @@ xfs_iread( VFS_I(ip)->i_mode = 0; } - ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index fd94b1078722..9b373dcf9e34 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,11 +16,8 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ @@ -61,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index ad2b9c313fd2..518c6f0ec3a6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -183,7 +183,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %Lu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 500333d0101e..668ee942be22 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -46,14 +46,9 @@ struct xfs_ifork { (ip)->i_afp : \ (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) + (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) + (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 9bac0d2e56dc..e3400c9c71cd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -424,12 +424,10 @@ struct xfs_log_dinode { /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? 
\ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* * Buffer Log Format definitions diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6e1665f2cb67..2076627243b0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,7 +46,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -63,7 +63,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -80,7 +80,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -108,7 +108,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -119,7 +119,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_private.a.agno; + agno = cur->bc_ag.agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -144,7 +144,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); return 0; out_bad_rec: @@ -169,14 +169,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -193,7 +193,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; @@ -208,7 +208,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -234,7 +234,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -246,7 +246,7 @@ xfs_refcount_delete( out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); 
return error; } @@ -366,7 +366,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, &rcext, agbno); /* Establish the right extent. */ @@ -391,7 +391,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -411,7 +411,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_private.a.agno, left, center, right); + cur->bc_ag.agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -468,7 +468,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -487,7 +487,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_private.a.agno, left, cleft); + cur->bc_ag.agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { @@ -530,7 +530,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -548,7 +548,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_private.a.agno, cright, right); + cur->bc_ag.agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -594,7 +594,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -679,13 +679,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -768,13 +768,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -883,7 +883,7 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_private.a.priv.refc.shape_changes * + overhead = cur->bc_ag.refc.shape_changes * xfs_allocfree_log_count(cur->bc_mp, 1); overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -891,17 +891,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. 
*/ - if (cur->bc_private.a.priv.refc.nr_ops > 2 && + if (cur->bc_ag.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_private.a.priv.refc.nr_ops == 0) + if (cur->bc_ag.refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -952,7 +952,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); /* * Either cover the hole (increment) or @@ -968,10 +968,10 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -998,12 +998,12 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1012,11 +1012,11 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -1035,7 +1035,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1057,10 +1057,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* @@ -1088,7 +1088,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_private.a.priv.refc.shape_changes++; + cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, @@ -1099,7 +1099,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1115,7 +1115,7 @@ xfs_refcount_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -1165,9 +1165,9 @@ xfs_refcount_finish_one( * the startblock, get one now. 
*/ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { - nr_ops = rcur->bc_private.a.priv.refc.nr_ops; - shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + if (rcur != NULL && rcur->bc_ag.agno != agno) { + nr_ops = rcur->bc_ag.refc.nr_ops; + shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1183,8 +1183,8 @@ xfs_refcount_finish_one( error = -ENOMEM; goto out_cur; } - rcur->bc_private.a.priv.refc.nr_ops = nr_ops; - rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + rcur->bc_ag.refc.nr_ops = nr_ops; + rcur->bc_ag.refc.shape_changes = shape_changes; } *pcur = rcur; @@ -1303,7 +1303,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* By default, skip the whole range */ @@ -1383,12 +1383,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_private.a.agno, *fbno, *flen); + cur->bc_ag.agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1485,7 +1485,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1513,7 +1513,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1529,7 +1529,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1575,7 +1575,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1589,7 +1589,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1606,7 +1606,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Remove refcount btree reservation */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 38529dbacd55..7fd6044a4f78 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -25,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -34,8 +35,8 @@ xfs_refcountbt_set_root( union xfs_btree_ptr *ptr, int inc) { - 
struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -57,8 +58,8 @@ xfs_refcountbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ @@ -66,7 +67,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -75,13 +76,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.agno == cur->bc_ag.agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -101,12 +102,12 @@ xfs_refcountbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -169,9 +170,9 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -311,41 +312,90 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Allocate a new refcount btree cursor. + * Initialize a new refcount btree cursor. 
*/ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( +static struct xfs_btree_cur * +xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(agno != NULLAGNUMBER); ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_refcountbt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.priv.refc.nr_ops = 0; - cur->bc_private.a.priv.refc.shape_changes = 0; + cur->bc_ag.refc.nr_ops = 0; + cur->bc_ag.refc.shape_changes = 0; + cur->bc_ops = &xfs_refcountbt_ops; + return cur; +} +/* Create a btree cursor. */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, tp, agno); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create a btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_refcountbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Swap in the new btree root. Once we pass this point the newly rebuilt btree + * is in place and we have to kill off all the old btree blocks. + */ +void +xfs_refcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_refcount_root = cpu_to_be32(afake->af_root); + agf->agf_refcount_level = cpu_to_be32(afake->af_levels); + agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | + XFS_AGF_REFCOUNT_ROOT | + XFS_AGF_REFCOUNT_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); +} + /* * Calculate the number of records in a refcount btree block. 
*/ @@ -420,7 +470,7 @@ xfs_refcountbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index ba416f71c824..69dc515db671 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size @@ -46,6 +47,8 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -58,4 +61,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ff9412f113c4..27c39268c31f 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -79,7 +79,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +91,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -107,7 +107,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -133,7 +133,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -149,7 +149,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -170,7 +170,7 @@ xfs_rmap_delete( done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -197,7 +197,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -260,7 +260,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -312,7 +312,7 @@ xfs_rmap_find_left_neighbor( info.stat = stat; 
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -320,7 +320,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -336,7 +336,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -385,14 +385,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -498,7 +498,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -522,7 +522,7 @@ xfs_rmap_unmap( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -588,7 +588,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -666,7 +666,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -678,11 +678,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -773,7 +773,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -795,7 +795,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -831,7 +831,7 @@ 
xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(&gtrec, owner, flags)) @@ -870,7 +870,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -921,7 +921,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -932,11 +932,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1010,7 +1010,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1034,7 +1034,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, PREV.rm_startblock, + cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1076,7 +1076,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, LEFT.rm_startblock, + cur->bc_ag.agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1114,7 +1114,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1132,7 +1132,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* reset the cursor back to PREV */ @@ -1162,7 +1162,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1180,7 +1180,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1210,7 +1210,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not.
*/ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1247,7 +1247,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1326,7 +1326,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1383,7 +1383,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1414,7 +1414,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); @@ -1441,7 +1441,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1465,12 +1465,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1506,7 +1506,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1573,7 +1573,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1589,7 +1589,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. 
@@ -1880,12 +1880,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1923,7 +1923,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -2072,12 +2072,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2112,7 +2112,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ @@ -2138,7 +2138,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2231,12 +2231,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2336,7 +2336,7 @@ xfs_rmap_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -2386,7 +2386,7 @@ xfs_rmap_finish_one( * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { + if (rcur != NULL && rcur->bc_ag.agno != agno) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2694,7 +2694,6 @@ struct xfs_rmap_key_state { uint64_t owner; uint64_t offset; unsigned int flags; - bool has_rmap; }; /* For each rmap given, figure out if it doesn't match the key we want. 
*/ @@ -2709,7 +2708,6 @@ xfs_rmap_has_other_keys_helper( if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; - rks->has_rmap = true; return -ECANCELED; } @@ -2731,7 +2729,7 @@ xfs_rmap_has_other_keys( int error; xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - rks.has_rmap = false; + *has_rmap = false; low.rm_startblock = bno; memset(&high, 0xFF, sizeof(high)); @@ -2739,11 +2737,12 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); - if (error < 0) - return error; + if (error == -ECANCELED) { + *has_rmap = true; + return 0; + } - *has_rmap = rks.has_rmap; - return 0; + return error; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fc78efa52c94..b7c05314d07c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -14,6 +14,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" @@ -51,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -60,8 +61,8 @@ xfs_rmapbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -83,25 +84,25 @@ xfs_rmapbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. 
*/ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); @@ -109,7 +110,7 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); *stat = 1; return 0; @@ -120,13 +121,13 @@ xfs_rmapbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -138,7 +139,7 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); return 0; } @@ -215,9 +216,9 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -448,17 +449,12 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .recs_inorder = xfs_rmapbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * -xfs_rmapbt_init_cursor( +static struct xfs_btree_cur * +xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); @@ -468,17 +464,68 @@ xfs_rmapbt_init_cursor( cur->bc_btnum = XFS_BTNUM_RMAP; cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; + + return cur; +} + +/* Create a new reverse mapping btree cursor. */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_rmapbt_init_common(mp, tp, agno); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agbp = agbp; + return cur; +} - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; +/* Create a new reverse mapping btree cursor with a fake root for staging. 
*/ +struct xfs_btree_cur * +xfs_rmapbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + cur = xfs_rmapbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | + XFS_AGF_RMAP_BLOCKS); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); +} + +/* * Calculate number of records in an rmap btree block. */ int @@ -569,7 +616,7 @@ xfs_rmapbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 820d668b063d..115c3455a734 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -9,6 +9,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -43,6 +44,10 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); +void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 2f60fc3c99a0..c526c5e5ab76 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -220,7 +220,7 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; @@ -328,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if 
(sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || @@ -681,7 +713,7 @@ xfs_sb_read_verify( { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; int error; /* @@ -707,7 +739,7 @@ xfs_sb_read_verify( * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -730,7 +762,7 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ @@ -748,13 +780,14 @@ xfs_sb_write_verify( struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -766,7 +799,7 @@ xfs_sb_write_verify( return; if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; @@ -927,7 +960,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } @@ -1007,7 +1040,7 @@ xfs_update_secondary_sbs( bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 7a9c04920505..d1a0848cb52e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_has_v3inode(&mp->m_sb)) return res; size = XFS_FSB_TO_B(mp, 1); } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ba0f747c82e8..e9bcf1faa183 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -92,7 +92,7 @@ xchk_superblock( if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; - sb = XFS_BUF_TO_SBP(bp); + sb = bp->b_addr; /* * Verify the geometries match. 
Fields that are permanently @@ -358,7 +358,7 @@ static inline void xchk_agf_xref_freeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_extlen_t blocks = 0; int error; @@ -378,7 +378,7 @@ static inline void xchk_agf_xref_cntbt( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t agbno; xfs_extlen_t blocks; int have; @@ -410,7 +410,7 @@ STATIC void xchk_agf_xref_btreeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t blocks; xfs_agblock_t btreeblks; @@ -456,7 +456,7 @@ static inline void xchk_agf_xref_refcblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t blocks; int error; @@ -525,7 +525,7 @@ xchk_agf( goto out; xchk_buffer_recheck(sc, sc->sa.agf_bp); - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -711,7 +711,7 @@ xchk_agfl( goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; agflcount = be32_to_cpu(agf->agf_flcount); if (agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -728,7 +728,7 @@ xchk_agfl( } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; @@ -765,7 +765,7 @@ static inline void xchk_agi_xref_icounts( struct xfs_scrub *sc) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; xfs_agino_t icount; xfs_agino_t freecount; int error; @@ -834,7 +834,7 @@ xchk_agi( goto out; xchk_buffer_recheck(sc, sc->sa.agi_bp); - agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + agi = sc->sa.agi_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d5e6db9af434..bca2ab1d4be9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -49,7 +49,7 @@ xrep_superblock( /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); @@ -140,7 +140,7 @@ xrep_agf_find_btrees( struct xrep_find_ag_btree *fab, struct xfs_buf *agfl_bp) { - struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *old_agf = agf_bp->b_addr; int error; /* Go find the root data. 
*/ @@ -181,7 +181,7 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); @@ -238,7 +238,7 @@ xrep_agf_calc_from_btrees( { struct xrep_agf_allocbt raa = { .sc = sc }; struct xfs_btree_cur *cur = NULL; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; xfs_agblock_t blocks; @@ -302,7 +302,7 @@ xrep_agf_commit_new( struct xfs_buf *agf_bp) { struct xfs_perag *pag; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; /* Trigger fdblocks recalculation */ xfs_force_summary_recalc(sc->mp); @@ -376,7 +376,7 @@ xrep_agf( if (error) return error; agf_bp->b_ops = &xfs_agf_buf_ops; - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; /* * Load the AGFL so that we can screen out OWN_AG blocks that are on @@ -395,7 +395,7 @@ xrep_agf( * Spot-check the AGFL blocks; if they're obviously corrupt then * there's nothing we can do but bail out. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp, + error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp, xrep_agf_check_agfl_block, sc); if (error) return error; @@ -429,10 +429,10 @@ out_revert: struct xrep_agfl { /* Bitmap of other OWN_AG metadata blocks. */ - struct xfs_bitmap agmetablocks; + struct xbitmap agmetablocks; /* Bitmap of free space. */ - struct xfs_bitmap *freesp; + struct xbitmap *freesp; struct xfs_scrub *sc; }; @@ -453,14 +453,14 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, rec->rm_startblock); - error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) return error; } - return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur); + return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } /* @@ -476,19 +476,17 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; int error; ra.sc = sc; ra.freesp = agfl_extents; - xfs_bitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); @@ -500,7 +498,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_BNO); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; xfs_btree_del_cursor(cur, error); @@ -508,7 +506,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_CNT); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -518,8 +516,8 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. 
*/ - error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks); - xfs_bitmap_destroy(&ra.agmetablocks); + error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); if (error) return error; @@ -527,18 +525,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = 0; - for_each_xfs_bitmap_extent(br, n, agfl_extents) { - *flcount += br->len; - if (*flcount > xfs_agfl_size(mp)) - break; - } - if (*flcount > xfs_agfl_size(mp)) - *flcount = xfs_agfl_size(mp); + *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + xfs_agfl_size(mp)); return 0; err: - xfs_bitmap_destroy(&ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); xfs_btree_del_cursor(cur, error); return error; } @@ -550,7 +542,7 @@ xrep_agfl_update_agf( struct xfs_buf *agf_bp, xfs_agblock_t flcount) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; ASSERT(flcount <= xfs_agfl_size(sc->mp)); @@ -573,13 +565,13 @@ STATIC void xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t flcount) { struct xfs_mount *mp = sc->mp; __be32 *agfl_bno; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; + struct xbitmap_range *br; + struct xbitmap_range *n; struct xfs_agfl *agfl; xfs_agblock_t agbno; unsigned int fl_off; @@ -602,8 +594,8 @@ xrep_agfl_init_header( * step. */ fl_off = 0; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp); - for_each_xfs_bitmap_extent(br, n, agfl_extents) { + agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); + for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); @@ -637,7 +629,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_bitmap agfl_extents; + struct xbitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -649,7 +641,7 @@ xrep_agfl( return -EOPNOTSUPP; xchk_perag_get(sc->mp, &sc->sa); - xfs_bitmap_init(&agfl_extents); + xbitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -696,10 +688,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xfs_bitmap_destroy(&agfl_extents); + xbitmap_destroy(&agfl_extents); return error; } @@ -761,7 +753,7 @@ xrep_agi_init_header( struct xfs_buf *agi_bp, struct xfs_agi *old_agi) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); @@ -807,7 +799,7 @@ xrep_agi_calc_from_btrees( struct xfs_buf *agi_bp) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agino_t count; xfs_agino_t freecount; @@ -835,7 +827,7 @@ xrep_agi_commit_new( struct xfs_buf *agi_bp) { struct xfs_perag *pag; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; /* Trigger inode count recalculation */ xfs_force_summary_recalc(sc->mp); @@ -892,7 +884,7 @@ xrep_agi( if (error) return error; agi_bp->b_ops = &xfs_agi_buf_ops; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; /* Find the AGI btree roots. 
*/ error = xrep_agi_find_btrees(sc, fab); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 5533e48e605d..73d924e47565 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -94,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index d9f0dd444b80..9faddb334a2c 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ struct xchk_xattr { /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) * to call this function for every attribute key in an inode. Once * we're here, we load the attribute value to see if any errors happen, * or if we get more or less data than we expected. @@ -147,11 +147,8 @@ xchk_xattr_listent( return; } - args.flags = ATTR_KERNOTIME; - if (flags & XFS_ATTR_ROOT) - args.flags |= ATTR_ROOT; - else if (flags & XFS_ATTR_SECURE) - args.flags |= ATTR_SECURE; + args.op_flags = XFS_DA_OP_NOTIME; + args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; args.geo = context->dp->i_mount->m_attr_geo; args.whichfork = XFS_ATTR_FORK; args.dp = context->dp; @@ -162,7 +159,10 @@ xchk_xattr_listent( args.value = xchk_xattr_valuebuf(sx->sc); args.valuelen = valuelen; - error = xfs_attr_get_ilocked(context->dp, &args); + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -474,7 +474,6 @@ xchk_xattr( struct xfs_scrub *sc) { struct xchk_xattr sx; - struct attrlist_cursor_kern cursor = { 0 }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -493,11 +492,10 @@ xchk_xattr( /* Check that every attr key can also be looked up by hash. */ sx.context.dp = sc->ip; - sx.context.cursor = &cursor; sx.context.resynch = 1; sx.context.put_listent = xchk_xattr_listent; sx.context.tp = sc->tp; - sx.context.flags = ATTR_INCOMPLETE; + sx.context.allow_incomplete = true; sx.sc = sc; /* @@ -516,7 +514,7 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_int_ilocked(&sx.context); + error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 18a684e18a69..f88694f22d05 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -18,14 +18,14 @@ * This is the logical equivalent of bitmap |= mask(start, len). */ int -xfs_bitmap_set( - struct xfs_bitmap *bitmap, +xbitmap_set( + struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xfs_bitmap_range *bmr; + struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL); + bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!bmr) return -ENOMEM; @@ -39,13 +39,13 @@ xfs_bitmap_set( /* Free everything related to this bitmap. 
*/ void -xfs_bitmap_destroy( - struct xfs_bitmap *bitmap) +xbitmap_destroy( + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; - for_each_xfs_bitmap_extent(bmr, n, bitmap) { + for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); kmem_free(bmr); } @@ -53,24 +53,24 @@ xfs_bitmap_destroy( /* Set up a per-AG block bitmap. */ void -xfs_bitmap_init( - struct xfs_bitmap *bitmap) +xbitmap_init( + struct xbitmap *bitmap) { INIT_LIST_HEAD(&bitmap->list); } /* Compare two btree extents. */ static int -xfs_bitmap_range_cmp( +xbitmap_range_cmp( void *priv, struct list_head *a, struct list_head *b) { - struct xfs_bitmap_range *ap; - struct xfs_bitmap_range *bp; + struct xbitmap_range *ap; + struct xbitmap_range *bp; - ap = container_of(a, struct xfs_bitmap_range, list); - bp = container_of(b, struct xfs_bitmap_range, list); + ap = container_of(a, struct xbitmap_range, list); + bp = container_of(b, struct xbitmap_range, list); if (ap->start > bp->start) return 1; @@ -96,14 +96,14 @@ xfs_bitmap_range_cmp( #define LEFT_ALIGNED (1 << 0) #define RIGHT_ALIGNED (1 << 1) int -xfs_bitmap_disunion( - struct xfs_bitmap *bitmap, - struct xfs_bitmap *sub) +xbitmap_disunion( + struct xbitmap *bitmap, + struct xbitmap *sub) { struct list_head *lp; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *new_br; - struct xfs_bitmap_range *sub_br; + struct xbitmap_range *br; + struct xbitmap_range *new_br; + struct xbitmap_range *sub_br; uint64_t sub_start; uint64_t sub_len; int state; @@ -113,8 +113,8 @@ xfs_bitmap_disunion( return 0; ASSERT(!list_empty(&sub->list)); - list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp); - list_sort(NULL, &sub->list, xfs_bitmap_range_cmp); + list_sort(NULL, &bitmap->list, xbitmap_range_cmp); + list_sort(NULL, &sub->list, xbitmap_range_cmp); /* * Now that we've sorted both lists, we iterate bitmap once, rolling @@ -124,11 +124,11 @@ xfs_bitmap_disunion( * list traversal is similar to merge sort, but we're deleting * instead. In this manner we avoid O(n^2) operations. */ - sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range, + sub_br = list_first_entry(&sub->list, struct xbitmap_range, list); lp = bitmap->list.next; while (lp != &bitmap->list) { - br = list_entry(lp, struct xfs_bitmap_range, list); + br = list_entry(lp, struct xbitmap_range, list); /* * Advance sub_br and/or br until we find a pair that @@ -181,7 +181,7 @@ xfs_bitmap_disunion( * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xfs_bitmap_range), + new_br = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!new_br) { error = -ENOMEM; @@ -247,8 +247,8 @@ out: * blocks going from the leaf towards the root. */ int -xfs_bitmap_set_btcur_path( - struct xfs_bitmap *bitmap, +xbitmap_set_btcur_path( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { struct xfs_buf *bp; @@ -261,7 +261,7 @@ xfs_bitmap_set_btcur_path( if (!bp) continue; fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - error = xfs_bitmap_set(bitmap, fsb, 1); + error = xbitmap_set(bitmap, fsb, 1); if (error) return error; } @@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path( /* Collect a btree's block in the bitmap. 
*/ STATIC int -xfs_bitmap_collect_btblock( +xbitmap_collect_btblock( struct xfs_btree_cur *cur, int level, void *priv) { - struct xfs_bitmap *bitmap = priv; + struct xbitmap *bitmap = priv; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -285,15 +285,30 @@ xfs_bitmap_collect_btblock( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - return xfs_bitmap_set(bitmap, fsbno, 1); + return xbitmap_set(bitmap, fsbno, 1); } /* Walk the btree and mark the bitmap wherever a btree block is found. */ int -xfs_bitmap_set_btblocks( - struct xfs_bitmap *bitmap, +xbitmap_set_btblocks( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { - return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, + return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, XFS_BTREE_VISIT_ALL, bitmap); } + +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap_hweight( + struct xbitmap *bitmap) +{ + struct xbitmap_range *bmr; + struct xbitmap_range *n; + uint64_t ret = 0; + + for_each_xbitmap_extent(bmr, n, bitmap) + ret += bmr->len; + + return ret; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index ae8ecbce6fa6..900646b72de1 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,31 +6,32 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xfs_bitmap_range { +struct xbitmap_range { struct list_head list; uint64_t start; uint64_t len; }; -struct xfs_bitmap { +struct xbitmap { struct list_head list; }; -void xfs_bitmap_init(struct xfs_bitmap *bitmap); -void xfs_bitmap_destroy(struct xfs_bitmap *bitmap); +void xbitmap_init(struct xbitmap *bitmap); +void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xfs_bitmap_extent(bex, n, bitmap) \ +#define for_each_xbitmap_extent(bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) -#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \ +#define for_each_xbitmap_block(b, bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = bex->start; (b) < bex->start + bex->len; (b)++) + for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) -int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len); -int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub); -int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap, +int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); +int xbitmap_set_btcur_path(struct xbitmap *bitmap, struct xfs_btree_cur *cur); -int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap, +int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); +uint64_t xbitmap_hweight(struct xbitmap *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa6ea6407992..add8598eacd5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -374,7 +374,7 @@ xchk_bmapbt_rec( struct xfs_bmbt_irec iext_irec; struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; - struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); @@ -501,7 +501,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_private.a.agno, rec->rm_startblock)) + cur->bc_ag.agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, 
sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 97a15b6f2865..9a2e27ac1300 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -219,19 +219,21 @@ xchk_da_btree_block_check_sibling( int direction, xfs_dablk_t sibling) { + struct xfs_da_state_path *path = &ds->state->path; + struct xfs_da_state_path *altpath = &ds->state->altpath; int retval; + int plevel; int error; - memcpy(&ds->state->altpath, &ds->state->path, - sizeof(ds->state->altpath)); + memcpy(altpath, path, sizeof(ds->state->altpath)); /* * If the pointer is null, we shouldn't be able to move the upper * level pointer anywhere. */ if (sibling == 0) { - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, + false, &retval); if (error == 0 && retval == 0) xchk_da_set_corrupt(ds, level); error = 0; @@ -239,27 +241,33 @@ xchk_da_btree_block_check_sibling( } /* Move the alternate cursor one block in the direction given. */ - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, false, + &retval); if (!xchk_da_process_error(ds, level, &error)) - return error; + goto out; if (retval) { xchk_da_set_corrupt(ds, level); - return error; + goto out; } - if (ds->state->altpath.blk[level].bp) - xchk_buffer_recheck(ds->sc, - ds->state->altpath.blk[level].bp); + if (altpath->blk[level].bp) + xchk_buffer_recheck(ds->sc, altpath->blk[level].bp); /* Compare upper level pointer to sibling pointer. */ - if (ds->state->altpath.blk[level].blkno != sibling) + if (altpath->blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - if (ds->state->altpath.blk[level].bp) { - xfs_trans_brelse(ds->dargs.trans, - ds->state->altpath.blk[level].bp); - ds->state->altpath.blk[level].bp = NULL; - } + out: + /* Free all buffers in the altpath that aren't referenced from path. */ + for (plevel = 0; plevel < altpath->active; plevel++) { + if (altpath->blk[plevel].bp == NULL || + (plevel < path->active && + altpath->blk[plevel].bp == path->blk[plevel].bp)) + continue; + + xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp); + altpath->blk[plevel].bp = NULL; + } + return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 266da4e4bde6..fe2a6e030c8a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -155,6 +155,9 @@ xchk_dir_actor( xname.type = XFS_DIR3_FT_UNKNOWN; error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -500,7 +503,7 @@ xchk_directory_leaf1_bestfree( /* Read the free space block. 
*/ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); leaf = bp->b_addr; @@ -565,9 +568,10 @@ xchk_directory_leaf1_bestfree( xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + break; } out: + xfs_trans_brelse(sc->tp, bp); return error; } @@ -589,7 +593,7 @@ xchk_directory_free_bestfree( /* Read the free space block */ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { @@ -612,7 +616,7 @@ xchk_directory_free_bestfree( 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - break; + goto out; xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -620,6 +624,7 @@ xchk_directory_free_bestfree( if (freehdr.nused + stale != freehdr.nvalid) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); out: + xfs_trans_brelse(sc->tp, bp); return error; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 681758704fda..64c217eb06a7 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -164,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. */ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -215,7 +215,7 @@ xchk_iallocbt_check_cluster( struct xfs_dinode *dip; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -426,7 +426,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 0cab11a5d390..beaeb6fa3119 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -336,7 +336,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index e489d7a8446a..db3cfd12803d 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -208,8 +208,10 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. 
*/ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); if (!error) { - aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); - freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks); + struct xfs_agf *agf = bp->b_addr; + + aglen = be32_to_cpu(agf->agf_length); + freelen = be32_to_cpu(agf->agf_freeblks); usedlen = aglen - freelen; xfs_buf_relse(bp); } @@ -434,10 +436,10 @@ xrep_init_btblock( int xrep_invalidate_blocks( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap) + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -449,7 +451,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; @@ -595,18 +597,18 @@ out_free: int xrep_reap_extents( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, + struct xbitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; xfs_fsblock_t fsbno; int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); trace_xrep_dispose_btree_extent(sc->mp, @@ -615,11 +617,9 @@ xrep_reap_extents( error = xrep_reap_block(sc, fsbno, oinfo, type); if (error) - goto out; + break; } -out: - xfs_bitmap_destroy(bitmap); return error; } @@ -879,7 +879,7 @@ xrep_find_ag_btree_roots( ri.sc = sc; ri.btree_info = btree_info; - ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agf = agf_bp->b_addr; ri.agfl_bp = agfl_bp; for (fab = btree_info; fab->buf_ops; fab++) { ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index c3422403b169..04a47d45605b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,11 +28,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); -struct xfs_bitmap; +struct xbitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, +int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); +int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8d4cefd761c1..f4fcb4719f41 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f1775bb19313..8ebf35b115ce 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -168,6 +168,7 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } + sb_end_write(sc->mp->m_super); if (sc->flags & 
XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -490,6 +491,14 @@ xfs_scrub_metadata( sc.ops = &meta_scrub_ops[sm->sm_type]; sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: + /* + * If freeze runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the filesystem and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while checking is running. + */ + sb_start_write(mp->m_super); + /* Set up for the operation. */ error = sc.ops->setup(&sc, ip); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 9eaab2eb5ed3..2c6c248be823 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -24,9 +24,9 @@ xchk_btree_cur_fsbno( return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); else if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) - return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 096203119934..e46f5cef90da 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -379,7 +379,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; @@ -459,7 +459,7 @@ TRACE_EVENT(xchk_ifork_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cd743fad8478..d4c687b5cd06 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -14,6 +14,8 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> @@ -67,10 +69,12 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + acl_e->e_uid = make_kuid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: - acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + acl_e->e_gid = make_kgid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: @@ -103,10 +107,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) ace->ae_tag = cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: - ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + ace->ae_id = cpu_to_be32( + from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: - ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + ace->ae_id = cpu_to_be32( + from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); @@ -120,102 +126,86 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) struct posix_acl * xfs_get_acl(struct inode *inode, int type) 
{ - struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl = NULL; - unsigned char *ea_name; - int error; - int len; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct posix_acl *acl = NULL; + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + .valuelen = XFS_ACL_MAX_SIZE(mp), + }; + int error; trace_xfs_get_acl(ip); switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: BUG(); } + args.namelen = strlen(args.name); /* - * If we have a cached ACLs value just return it, not need to - * go out to the disk. + * If the attribute doesn't exist make sure we have a negative cache + * entry, for any other error assume it is transient. */ - len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, strlen(ea_name), - (unsigned char **)&xfs_acl, &len, - ATTR_ALLOC | ATTR_ROOT); - if (error) { - /* - * If the attribute doesn't exist make sure we have a negative - * cache entry, for any other error assume it is transient. - */ - if (error != -ENOATTR) - acl = ERR_PTR(error); - } else { - acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len, - XFS_ACL_MAX_ENTRIES(ip->i_mount)); - kmem_free(xfs_acl); + error = xfs_attr_get(&args); + if (!error) { + acl = xfs_acl_from_disk(mp, args.value, args.valuelen, + XFS_ACL_MAX_ENTRIES(mp)); + } else if (error != -ENOATTR) { + acl = ERR_PTR(error); } + + kmem_free(args.value); return acl; } int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct xfs_inode *ip = XFS_I(inode); - unsigned char *ea_name; - int error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + }; + int error; switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: return -EINVAL; } + args.namelen = strlen(args.name); if (acl) { - struct xfs_acl *xfs_acl; - int len = XFS_ACL_MAX_SIZE(ip->i_mount); - - xfs_acl = kmem_zalloc_large(len, 0); - if (!xfs_acl) + args.valuelen = XFS_ACL_SIZE(acl->a_count); + args.value = kmem_zalloc_large(args.valuelen, 0); + if (!args.value) return -ENOMEM; - - xfs_acl_to_disk(xfs_acl, acl); - - /* subtract away the unused acl entries */ - len -= sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - - error = xfs_attr_set(ip, ea_name, strlen(ea_name), - (unsigned char *)xfs_acl, len, ATTR_ROOT); - - kmem_free(xfs_acl); - } else { - /* - * A NULL ACL argument means we want to remove the ACL. - */ - error = xfs_attr_remove(ip, ea_name, - strlen(ea_name), - ATTR_ROOT); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (error == -ENOATTR) - error = 0; + xfs_acl_to_disk(args.value, acl); } + error = xfs_attr_set(&args); + kmem_free(args.value); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (!acl && error == -ENOATTR) + error = 0; if (!error) set_cached_acl(inode, type, acl); return error; @@ -275,3 +265,19 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } + +/* + * Invalidate any cached ACLs if the user has bypassed the ACL interface. 
+ * We don't validate the content whatsoever so it is caller responsibility to + * provide data in valid format and ensure i_mode is consistent. + */ +void +xfs_forget_acl( + struct inode *inode, + const char *name) +{ + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +} diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 94615e34bc86..c042c0868016 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -13,14 +13,16 @@ struct posix_acl; extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +void xfs_forget_acl(struct inode *inode, const char *name); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL +static inline void xfs_forget_acl(struct inode *inode, const char *name) +{ +} #endif /* CONFIG_XFS_POSIX_ACL */ -extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); - #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 58e937be24ce..9d9cebf18726 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -539,7 +539,7 @@ xfs_discard_page( if (XFS_FORCED_SHUTDOWN(mp)) goto out_invalidate; - xfs_alert(mp, + xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bbfa6ba84dcd..c42f90e16b4f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -145,8 +145,8 @@ xfs_attr3_node_inactive( * Since this code is recursive (gasp!) we must protect ourselves. 
*/ if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - xfs_buf_corruption_error(bp); return -EFSCORRUPTED; } @@ -194,7 +194,7 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - xfs_buf_corruption_error(child_bp); + xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); error = -EFSCORRUPTED; break; @@ -289,7 +289,7 @@ xfs_attr3_root_inactive( break; default: error = -EFSCORRUPTED; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); break; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d37743bdf274..5ff1d929d3b5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -52,24 +52,19 @@ static int xfs_attr_shortform_list( struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; + struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; struct xfs_attr_shortform *sf; struct xfs_attr_sf_entry *sfe; - struct xfs_inode *dp; int sbsize, nsbuf, count, i; int error = 0; - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); ASSERT(dp->i_afp != NULL); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; - cursor = context->cursor; - ASSERT(cursor != NULL); trace_xfs_attr_list_sf(context); @@ -205,7 +200,7 @@ out: STATIC int xfs_attr_node_list_lookup( struct xfs_attr_list_context *context, - struct attrlist_cursor_kern *cursor, + struct xfs_attrlist_cursor_kern *cursor, struct xfs_buf **pbp) { struct xfs_da3_icnode_hdr nodehdr; @@ -279,7 +274,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } @@ -288,8 +283,8 @@ STATIC int xfs_attr_node_list( struct xfs_attr_list_context *context) { + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr3_icleaf_hdr leafhdr; - struct attrlist_cursor_kern *cursor; struct xfs_attr_leafblock *leaf; struct xfs_da_intnode *node; struct xfs_buf *bp; @@ -299,7 +294,6 @@ xfs_attr_node_list( trace_xfs_attr_node_list(context); - cursor = context->cursor; cursor->initted = 1; /* @@ -394,7 +388,7 @@ xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; @@ -408,7 +402,6 @@ xfs_attr3_leaf_list_int( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - cursor = context->cursor; cursor->initted = 1; /* @@ -452,8 +445,8 @@ xfs_attr3_leaf_list_int( } if ((entry->flags & XFS_ATTR_INCOMPLETE) && - !(context->flags & ATTR_INCOMPLETE)) - continue; /* skip incomplete entries */ + !context->allow_incomplete) + continue; if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc; @@ -488,14 +481,15 @@ xfs_attr3_leaf_list_int( * Copy out attribute entries for attr_list(), for leaf attribute lists. 
*/ STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) +xfs_attr_leaf_list( + struct xfs_attr_list_context *context) { - int error; - struct xfs_buf *bp; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_list(context); - context->cursor->blkno = 0; + context->cursor.blkno = 0; error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); if (error) return error; @@ -506,7 +500,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) } int -xfs_attr_list_int_ilocked( +xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; @@ -526,12 +520,12 @@ xfs_attr_list_int_ilocked( } int -xfs_attr_list_int( - xfs_attr_list_context_t *context) +xfs_attr_list( + struct xfs_attr_list_context *context) { - int error; - xfs_inode_t *dp = context->dp; - uint lock_mode; + struct xfs_inode *dp = context->dp; + uint lock_mode; + int error; XFS_STATS_INC(dp->i_mount, xs_attr_list); @@ -539,130 +533,7 @@ xfs_attr_list_int( return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); - error = xfs_attr_list_int_ilocked(context); + error = xfs_attr_list_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ - & ~(sizeof(uint32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -STATIC void -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. 
- */ - if (cursor->pad1 || cursor->pad2) - return -EINVAL; - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return -EINVAL; - - /* Only internal consumers can retrieve incomplete attrs. */ - if (flags & ATTR_INCOMPLETE) - return -EINVAL; - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return -EFAULT; - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error <= 0); - return error; -} diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e62fb5216341..4f800f7fe888 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1062,7 +1062,6 @@ xfs_collapse_file_space( int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); - uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); bool done = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1078,32 +1077,34 @@ xfs_collapse_file_space( if (error) return error; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) - break; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; + if (done) + break; - error = xfs_trans_commit(tp); + /* finish any deferred frees and roll the transaction */ + error = xfs_defer_finish(&tp); + if (error) + goto out_trans_cancel; } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1146,35 +1147,41 @@ xfs_insert_file_space( if (error) return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. 
*/ - error = xfs_bmap_split_extent(ip, stop_fsb); + error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) - return error; + goto out_trans_cancel; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, - &tp); + do { + error = xfs_trans_roll_inode(&tp, ip); if (error) - break; + goto out_trans_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; + } while (!done); - error = xfs_trans_commit(tp); - } - + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1442,12 +1449,12 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*src_log_flags) |= XFS_ILOG_DOWNER; + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; + } /* * Swap the data forks of the inodes @@ -1482,7 +1489,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1494,7 +1501,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(tip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 217e4f82a44a..9ec3eaf1c618 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -327,6 +327,9 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); @@ -727,8 +730,9 @@ found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pagesn", __func__); + xfs_warn_ratelimited(target->bt_mount, + "%s: failed to map %u pages", __func__, + bp->b_page_count); xfs_buf_relse(bp); return error; } @@ -1238,7 +1242,7 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { - xfs_alert(bp->b_mount, + xfs_alert_ratelimited(bp->b_mount, "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); @@ -1573,6 +1577,28 @@ xfs_buf_zero( } /* + * Log a message about and stale a buffer that a caller has decided is corrupt. + * + * This function should be called for the kinds of metadata corruption that + * cannot be detect from a verifier, such as incorrect inter-block relationship + * data. Do /not/ call this function from a verifier function. + * + * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will + * be marked stale, but b_error will not be set. The caller is responsible for + * releasing the buffer or fixing it. 
+ */ +void +__xfs_buf_mark_corrupt( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + ASSERT(bp->b_flags & XBF_DONE); + + xfs_buf_corruption_error(bp, fa); + xfs_buf_stale(bp); +} + +/* * Handling of buffer targets (buftargs). */ @@ -2091,9 +2117,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", - sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d79a1fe5d738..9a04c53c2488 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -272,6 +272,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) } void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); +void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); +#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 663810e6cd59..1545657c3ca0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -345,7 +345,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 0d3b640cf1cc..871ec22c9aee 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -147,7 +147,7 @@ xfs_dir2_block_getdents( xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; - unsigned int offset; + unsigned int offset, next_offset; unsigned int end; /* @@ -173,9 +173,10 @@ xfs_dir2_block_getdents( * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ - offset = geo->data_entry_offset; end = xfs_dir3_data_end_offset(geo, bp->b_addr); - while (offset < end) { + for (offset = geo->data_entry_offset; + offset < end; + offset = next_offset) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; @@ -184,14 +185,15 @@ xfs_dir2_block_getdents( * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - offset += be16_to_cpu(dup->length); + next_offset = offset + be16_to_cpu(dup->length); continue; } /* * Bump pointer for the next iteration. */ - offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen); + next_offset = offset + + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); /* * The entry is before the desired starting point, skip it. 
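The xfs_dir2_readdir.c hunk above changes xfs_dir2_block_getdents() from bumping `offset` inside the loop body to computing a `next_offset` at the top of each iteration, so the copy-out code that follows can no longer disturb the cursor for the current entry. The following is a minimal, self-contained sketch of that loop shape only; the struct, helper, and callback names (demo_entry, demo_entsize, demo_walk, emit) are illustrative stand-ins and are not the XFS on-disk structures or kernel APIs.

    /*
     * Sketch of the "compute next_offset first" iteration pattern.
     * All names here are hypothetical; only the control flow mirrors
     * the readdir change above.
     */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdbool.h>

    struct demo_entry {
        uint16_t freetag;   /* DEMO_FREE_TAG marks unused space */
        uint16_t length;    /* length of an unused region */
        uint8_t  namelen;   /* name length of a used entry */
    };

    #define DEMO_FREE_TAG 0xffff

    static size_t demo_entsize(uint8_t namelen)
    {
        /* header bytes up to namelen, the namelen byte, then the name */
        return offsetof(struct demo_entry, namelen) + 1 + namelen;
    }

    static void demo_walk(char *block, size_t start, size_t end,
                          bool (*emit)(struct demo_entry *ep))
    {
        size_t offset, next_offset;

        for (offset = start; offset < end; offset = next_offset) {
            struct demo_entry *ep = (struct demo_entry *)(block + offset);

            /* Unused space: advance by its recorded length and move on. */
            if (ep->freetag == DEMO_FREE_TAG) {
                next_offset = offset + ep->length;
                continue;
            }

            /* Fix the next position before handing the entry out. */
            next_offset = offset + demo_entsize(ep->namelen);

            /* Stop early if the consumer cannot take any more entries. */
            if (!emit(ep))
                break;
        }
    }

The design point is simply that the cursor update no longer depends on anything the per-entry processing does, which is why the patch can keep the "skip entries before the starting point" and "fill the user buffer" logic unchanged below the renamed bump.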
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0b8350e84d28..f979d0d7e6cd 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -31,6 +31,7 @@ xfs_trim_extents( struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; + struct xfs_agf *agf; struct xfs_perag *pag; int error; int i; @@ -47,14 +48,14 @@ xfs_trim_extents( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) goto out_put_perag; + agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); if (error) goto out_del_cursor; @@ -75,7 +76,7 @@ xfs_trim_extents( error = -EFSCORRUPTED; goto out_del_cursor; } - ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + ASSERT(flen <= be32_to_cpu(agf->agf_longest)); /* * use daddr format for all range/len calculations as that is diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d223e1ae90a6..af2c8e5ceea0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -829,9 +829,9 @@ xfs_qm_id_for_quotatype( { switch (type) { case XFS_DQ_USER: - return ip->i_d.di_uid; + return i_uid_read(VFS_I(ip)); case XFS_DQ_GROUP: - return ip->i_d.di_gid; + return i_gid_read(VFS_I(ip)); case XFS_DQ_PROJ: return ip->i_d.di_projid; } @@ -1105,8 +1105,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1177,7 +1177,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d60647d7197b..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed( { struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - struct xfs_ail *ailp = qfs->qql_item.li_ailp; - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_ail_delete() drops the AIL lock. 
- */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + xfs_qm_qoff_logitem_relse(qfs); - kmem_free(qfs->qql_item.li_lv_shadow); kmem_free(lip->li_lv_shadow); - kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; } +STATIC void +xfs_qm_qoff_logitem_release( + struct xfs_log_item *lip) +{ + struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); + + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { + if (qoff->qql_start_lip) + xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); + xfs_qm_qoff_logitem_relse(qoff); + } +} + static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; /* + * Delete the quotaoff intent from the AIL and free it. On success, + * this should only be called for the start item. It can be used for + * either on shutdown or abort. + */ +void +xfs_qm_qoff_logitem_relse( + struct xfs_qoff_logitem *qoff) +{ + struct xfs_log_item *lip = &qoff->qql_item; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || + test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + kmem_free(lip->li_lv_shadow); + kmem_free(qoff); +} + +/* * Allocate and initialize an quotaoff item of the correct quota type(s). */ struct xfs_qoff_logitem * diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 3bb19e556ade..2b86a43d7ce2 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -28,6 +28,7 @@ void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, struct xfs_qoff_logitem *start, uint flags); +void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, struct xfs_qoff_logitem *startqoff, uint flags); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 331765afc53e..a21e9cc6516a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -345,16 +345,19 @@ xfs_corruption_error( * Complain about the kinds of metadata corruption that we can't detect from a * verifier, such as incorrect inter-block relationship data. Does not set * bp->b_error. + * + * Call xfs_buf_mark_corrupt, not this function. 
*/ void xfs_buf_corruption_error( - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_failaddr_t fa) { struct xfs_mount *mp = bp->b_mount; xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, %s block 0x%llx", - __return_address, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 31a5d321ba9a..1717b7508356 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -15,7 +15,7 @@ extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); -void xfs_buf_corruption_error(struct xfs_buf *bp); +void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b8a4a3f29b36..4b8bdecc3863 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -1069,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 918456ca29e1..4eebcec4aae6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -344,7 +344,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -362,7 +362,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; @@ -896,6 +896,14 @@ xfs_getfsmap( info.format_arg = arg; info.head = head; + /* + * If fsmap runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we 
walk the rmapbt and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while we're reading buffers. + */ + sb_start_write(mp->m_super); + /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ @@ -935,6 +943,7 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); + sb_end_write(mp->m_super); head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8dc2e5414276..a7be7a9e5c1a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -289,6 +289,8 @@ xfs_reinit_inode( uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -297,6 +299,8 @@ xfs_reinit_inode( inode_set_iversion_queried(inode, version); inode->i_mode = mode; inode->i_rdev = dev; + inode->i_uid = uid; + inode->i_gid = gid; return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c5077e6326c7..d1772786af29 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -801,26 +801,18 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; + inode->i_gid = VFS_I(pip)->i_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current_fsgid(); } /* @@ -828,9 +820,8 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). 
*/ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -847,14 +838,13 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -907,20 +897,13 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: @@ -1122,7 +1105,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1158,8 +1140,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1219,8 +1200,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1309,8 +1289,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -2119,7 +2098,7 @@ xfs_iunlink_update_bucket( unsigned int bucket_index, xfs_agino_t new_agino) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; @@ -2135,7 +2114,7 @@ xfs_iunlink_update_bucket( * head of the list. 
*/ if (old_value == new_agino) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2259,7 +2238,7 @@ xfs_iunlink( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2269,7 +2248,7 @@ xfs_iunlink( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || !xfs_verify_agino_or_null(mp, agno, next_agino)) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2443,7 +2422,7 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2524,6 +2503,88 @@ out: } /* + * Look up the inode number specified and mark it stale if it is found. If it is + * dirty, return the inode so it can be attached to the cluster buffer so it can + * be processed appropriately when the cluster free transaction completes. + */ +static struct xfs_inode * +xfs_ifree_get_one_inode( + struct xfs_perag *pag, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) + goto out_rcu_unlock; + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + goto out_rcu_unlock; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're racing with + * freeing in xfs_reclaim_inode(). See the comments in that + * function for more information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_rcu_unlock; + } + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * We don't need to attach clean inodes or those only with unlogged + * changes (which we throw away, anyway). + */ + if (!ip->i_itemp || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_no_inode; + } + return ip; + +out_rcu_unlock: + rcu_read_unlock(); +out_no_inode: + return NULL; +} + +/* * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. @@ -2623,77 +2684,11 @@ xfs_ifree_cluster( * even trying to lock them. 
*/ for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); + ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); + if (!ip) continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3807,7 +3802,6 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3868,7 +3862,7 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; /* Check the inline fork data before we write out. */ @@ -3951,3 +3945,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. 
+ */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..c6a63f6764a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 8bd5d0de6321..f779cca2346f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -125,7 +125,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_log_dinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -305,11 +305,9 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; + to->di_uid = i_uid_read(inode); + to->di_gid = i_gid_read(inode); to->di_projid_lo = from->di_projid & 0xffff; to->di_projid_hi = from->di_projid >> 16; @@ -339,7 +337,8 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.tv_sec; to->di_crtime.t_nsec = from->di_crtime.tv_nsec; @@ -351,6 +350,7 @@ xfs_inode_to_log_dinode( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = from->di_flushiter; } } @@ -370,7 +370,7 @@ xfs_inode_item_format_core( dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); - xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount)); } /* @@ -395,8 +395,6 @@ xfs_inode_item_format( struct xfs_log_iovec *vecp = NULL; struct xfs_inode_log_format *ilf; - ASSERT(ip->i_d.di_version > 1); - ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -554,7 +552,8 @@ xfs_inode_item_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -732,29 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. 
*/ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, tail_lsn); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42de92cb283..cdfb3cd9a25b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -35,6 +35,8 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/mount.h> #include <linux/namei.h> @@ -292,62 +294,173 @@ xfs_readlink_by_handle( return error; } -STATIC int -xfs_attrlist_by_handle( - struct file *parfilp, - void __user *arg) +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) { - int error = -ENOMEM; - attrlist_cursor_kern_t *cursor; - struct xfs_fsop_attrlist_handlereq __user *p = arg; - xfs_fsop_attrlist_handlereq_t al_hreq; - struct dentry *dentry; - char *kbuf; + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) - return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. 
+ */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} + +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; +} + +static unsigned int +xfs_attr_flags( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XATTR_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XATTR_REPLACE; + return 0; +} + +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + int bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; + + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) return -EINVAL; /* * Reject flags, only allow namespaces. */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) return -EINVAL; - dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + /* + * Validate the cursor. + */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; + buffer = kmem_zalloc_large(bufsize, 0); + if (!buffer) + return -ENOMEM; - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); + /* + * Initialize the output buffer. 
+ */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); if (error) - goto out_kfree; + goto out_free; - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; - goto out_kfree; - } +out_free: + kmem_free(buffer); + return error; +} - if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) - error = -EFAULT; +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p) +{ + struct xfs_fsop_attrlist_handlereq al_hreq; + struct dentry *dentry; + int error = -ENOMEM; -out_kfree: - kmem_free(kbuf); -out_dput: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) + return -EFAULT; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); dput(dentry); return error; } -int +static int xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, @@ -355,31 +468,33 @@ xfs_attrmulti_attr_get( uint32_t *len, uint32_t flags) { - unsigned char *kbuf; - int error = -EFAULT; - size_t namelen; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, 0); - if (!kbuf) - return -ENOMEM; - namelen = strlen(name); - error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, - flags); + error = xfs_attr_get(&args); if (error) goto out_kfree; - if (copy_to_user(ubuf, kbuf, *len)) + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) error = -EFAULT; out_kfree: - kmem_free(kbuf); + kmem_free(args.value); return error; } -int +static int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, @@ -387,42 +502,75 @@ xfs_attrmulti_attr_set( uint32_t len, uint32_t flags) { - unsigned char *kbuf; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + }; int error; - size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - kbuf = memdup_user(ubuf, len); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } - namelen = strlen(name); - error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); - if (!error) - xfs_forget_acl(inode, name, flags); - kfree(kbuf); + error = xfs_attr_set(&args); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); return error; } int -xfs_attrmulti_attr_remove( +xfs_ioc_attrmulti_one( + struct file *parfilp, struct inode *inode, - unsigned char *name, + 
uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, uint32_t flags) { + unsigned char *name; int error; - size_t namelen; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - namelen = strlen(name); - error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); - if (!error) - xfs_forget_acl(inode, name, flags); + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + /* fall through */ + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); return error; } @@ -436,7 +584,6 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -462,63 +609,17 @@ xfs_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, - ops[i].am_attrname, MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, &ops[i].am_length, - ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, ops[i].am_length, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(am_hreq.ops, ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); @@ -1162,7 +1263,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && ip->i_d.di_version < 3) + if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1372,8 +1473,7 @@ xfs_ioctl_setattr_check_cowextsize( if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || - ip->i_d.di_version != 3) + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) return -EINVAL; if (fa->fsx_cowextsize == 0) @@ -1434,9 +1534,9 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. 
*/ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, - ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + VFS_I(ip)->i_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } @@ -1501,7 +1601,6 @@ xfs_ioctl_setattr( olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ASSERT(ip->i_d.di_version > 1); ip->i_d.di_projid = fa->fsx_projid; } @@ -1514,7 +1613,7 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; - if (ip->i_d.di_version == 3 && + if (xfs_sb_version_has_v3inode(&mp->m_sb) && (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) ip->i_d.di_cowextsize = fa->fsx_cowextsize >> mp->m_sb.sb_blocklog; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 420bd95dc326..bab6a5a92407 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -6,6 +6,11 @@ #ifndef __XFS_IOCTL_H__ #define __XFS_IOCTL_H__ +struct xfs_bstat; +struct xfs_ibulk; +struct xfs_inogrp; + + extern int xfs_ioc_space( struct file *filp, @@ -30,27 +35,11 @@ xfs_readlink_by_handle( struct file *parfilp, xfs_fsop_handlereq_t *hreq); -extern int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t *len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_remove( - struct inode *inode, - unsigned char *name, - uint32_t flags); +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, + int flags, struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( @@ -70,10 +59,6 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); -struct xfs_ibulk; -struct xfs_bstat; -struct xfs_inogrp; - int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 769581a79c58..c1771e728117 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -352,56 +352,24 @@ xfs_compat_handlereq_to_dentry( STATIC int xfs_compat_attrlist_by_handle( struct file *parfilp, - void __user *arg) + compat_xfs_fsop_attrlist_handlereq_t __user *p) { - int error; - attrlist_cursor_kern_t *cursor; - compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; - char *kbuf; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&al_hreq, arg, - sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. 
- */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -EINVAL; dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); - if (error) - goto out_kfree; - - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { - error = -EFAULT; - goto out_kfree; - } - - if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) - error = -EFAULT; - -out_kfree: - kmem_free(kbuf); -out_dput: + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), + compat_ptr(al_hreq.buffer), al_hreq.buflen, + al_hreq.flags, &p->pos); dput(dentry); return error; } @@ -416,7 +384,6 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -443,64 +410,18 @@ xfs_compat_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, compat_ptr(ops[i].am_attrname), - MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - &ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - ops[i].am_length, ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f2f93caec0..f7a99b3bbcf7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -22,7 +22,6 @@ #include "xfs_iomap.h" #include "xfs_error.h" -#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/iversion.h> @@ -50,10 +49,15 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, - strlen(xattr->name), - xattr->value, xattr->value_len, - ATTR_SECURE); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_SECURE, + .name = xattr->name, + .namelen = strlen(xattr->name), + .value = xattr->value, + .valuelen = xattr->value_len, + }; + error = xfs_attr_set(&args); if 
(error < 0) break; } @@ -553,7 +557,7 @@ xfs_vn_getattr( stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = ip->i_d.di_crtime; @@ -692,9 +696,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), - xfs_kgid_to_gid(gid), - ip->i_d.di_projid, + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -763,7 +765,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { @@ -775,7 +776,6 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -1304,9 +1304,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); - inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 4b31c29b7e6b..ff2da28fed90 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -86,8 +86,8 @@ xfs_bulkstat_one_int( */ buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = dic->di_uid; - buf->bs_gid = dic->di_gid; + buf->bs_uid = i_uid_read(inode); + buf->bs_gid = i_gid_read(inode); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -110,7 +110,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (dic->di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = dic->di_cowextsize; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 8738bb03f253..9f70d2f68e05 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t; #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> +#include <linux/xattr.h> #include <asm/page.h> #include <asm/div64.h> @@ -163,32 +164,6 @@ struct xstats { extern struct xstats xfsstats; -/* Kernel uid/gid conversion. These are used to convert to/from the on disk - * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. - * The conversion here is type only, the value will remain the same since we - * are converting to the init_user_ns. The uid is later mapped to a particular - * user namespace value when crossing the kernel/user boundary. 
- */ -static inline uint32_t xfs_kuid_to_uid(kuid_t uid) -{ - return from_kuid(&init_user_ns, uid); -} - -static inline kuid_t xfs_uid_to_kuid(uint32_t uid) -{ - return make_kuid(&init_user_ns, uid); -} - -static inline uint32_t xfs_kgid_to_gid(kgid_t gid) -{ - return from_kgid(&init_user_ns, gid); -} - -static inline kgid_t xfs_gid_to_kgid(uint32_t gid) -{ - return make_kgid(&init_user_ns, gid); -} - static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6006d94a581..00fda2e8e738 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -47,8 +40,7 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted); + struct xlog_in_core *iclog); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -63,23 +55,10 @@ xlog_state_switch_iclogs( struct xlog_in_core *iclog, int eventual_size); STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog); @@ -484,73 +463,6 @@ out_error: return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. 
- */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -597,26 +509,21 @@ xlog_state_release_iclog( return 0; } -int +void xfs_log_release_iclog( - struct xfs_mount *mp, struct xlog_in_core *iclog) { - struct xlog *log = mp->m_log; - bool sync; - - if (iclog->ic_state == XLOG_STATE_IOERROR) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return -EIO; - } + struct xlog *log = iclog->ic_log; + bool sync = false; if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - sync = __xlog_state_release_iclog(log, iclog); + if (iclog->ic_state != XLOG_STATE_IOERROR) + sync = __xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); - if (sync) - xlog_sync(log, iclog); } - return 0; + + if (sync) + xlog_sync(log, iclog); } /* @@ -855,32 +762,69 @@ xfs_log_mount_cancel( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Wait for the iclog to be written to disk, or return an error if the log has been + * shut down. */ +static int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) +{ + struct xlog *log = iclog->ic_log; -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) + if (!XLOG_FORCED_SHUTDOWN(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = &reg, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. 
+ */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -891,23 +835,7 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -919,28 +847,32 @@ out_err: spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + /* * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). @@ -948,16 +880,11 @@ out_err: * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; /* * Don't write out unmount record on norecovery mounts or ro devices. @@ -966,57 +893,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_NORECOVERY || xfs_readonly_buftarg(log->l_targ)) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; + return; } - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + xfs_log_force(mp, XFS_LOG_SYNC); -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (iclog->ic_state != XLOG_STATE_IOERROR) { - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! 
(XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. - */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - case XLOG_STATE_IOERROR: - spin_unlock(&log->l_icloglock); - break; - default: - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1279,7 +1179,6 @@ xlog_ioend_work( struct xlog_in_core *iclog = container_of(work, struct xlog_in_core, ic_end_io_work); struct xlog *log = iclog->ic_log; - bool aborted = false; int error; error = blk_status_to_errno(iclog->ic_bio.bi_status); @@ -1295,17 +1194,9 @@ xlog_ioend_work( if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = true; - } else if (iclog->ic_state == XLOG_STATE_IOERROR) { - aborted = true; } - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); bio_uninit(&iclog->ic_bio); /* @@ -1551,20 +1442,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. 
*/ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1574,12 +1462,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = &reg, }; + int error; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -1739,7 +1630,7 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static void +static int xlog_map_iclog_data( struct bio *bio, void *data, @@ -1750,11 +1641,14 @@ xlog_map_iclog_data( unsigned int off = offset_in_page(data); size_t len = min_t(size_t, count, PAGE_SIZE - off); - WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len); + if (bio_add_page(bio, page, len, off) != len) + return -EIO; data += len; count -= len; } while (count); + + return 0; } STATIC void @@ -1784,7 +1678,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog, true); + xlog_state_done_syncing(iclog); up(&iclog->ic_sema); return; } @@ -1794,11 +1688,22 @@ xlog_write_iclog( iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count); + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return; + } if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -2011,7 +1916,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. @@ -2148,23 +2053,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 
1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2186,27 +2089,16 @@ xlog_write_calc_vec_length( return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. - */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2328,7 +2220,11 @@ xlog_write_copy_finish( *record_cnt = 0; *data_cnt = 0; - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); if (!commit_iclog) goto release_iclog; spin_unlock(&log->l_icloglock); @@ -2391,13 +2287,14 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2405,25 +2302,13 @@ xlog_write( int data_cnt = 0; int error = 0; - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. 
Need to up reservation"); @@ -2431,9 +2316,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2457,7 +2341,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2473,11 +2356,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. + */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2509,8 +2396,13 @@ xlog_write( xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, @@ -2567,119 +2459,106 @@ next_lv: return error; } +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) +{ + ASSERT(list_empty_careful(&iclog->ic_callbacks)); -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ + /* + * If the number of ops in this iclog indicate it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. + */ + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; + } + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; +} /* - * An iclog has just finished IO completion processing, so we need to update - * the iclog state and propagate that up into the overall log state. Hence we - * prepare the iclog for cleaning, and then clean all the pending dirty iclogs - * starting from the head, and then wake up any threads that are waiting for the - * iclog to be marked clean. - * - * The ordering of marking iclogs ACTIVE must be maintained, so an iclog - * doesn't become ACTIVE beyond one that is SYNCING. This is also required to - * maintain the notion that we use a ordered wait queue to hold off would be - * writers to the log when every iclog is trying to sync to disk. - * - * Caller must hold the icloglock before calling us. 
- * - * State Change: !IOERROR -> DIRTY -> ACTIVE + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. */ -STATIC void -xlog_state_clean_iclog( +static void +xlog_state_activate_iclogs( struct xlog *log, - struct xlog_in_core *dirty_iclog) + int *iclogs_changed) { - struct xlog_in_core *iclog; - int changed = 0; - - /* Prepare the completed iclog. */ - if (dirty_iclog->ic_state != XLOG_STATE_IOERROR) - dirty_iclog->ic_state = XLOG_STATE_DIRTY; + struct xlog_in_core *iclog = log->l_iclog; - /* Walk all the iclogs to update the ordered active state. */ - iclog = log->l_iclog; do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(list_empty_careful(&iclog->ic_callbacks)); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. - * We don't need to cover the dummy. - */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. - */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ /* - * Wake up threads waiting in xfs_log_force() for the dirty iclog - * to be cleaned. + * We usually go to NEED. But we go to NEED2 if the changed indicates we + * are done writing the dummy record. If we are done with the second + * dummy record (DONE2), then we go to IDLE. 
- */ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + return XLOG_STATE_COVER_NEED; +} - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + dirty_iclog->ic_state = XLOG_STATE_DIRTY; - default: - ASSERT(0); - } + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); } } @@ -2808,8 +2687,7 @@ xlog_state_iodone_process_iclog( static void xlog_state_do_iclog_callbacks( struct xlog *log, - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) __releases(&log->l_icloglock) __acquires(&log->l_icloglock) { @@ -2821,7 +2699,7 @@ xlog_state_do_iclog_callbacks( list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); + xlog_cil_process_committed(&tmp); spin_lock(&iclog->ic_callback_lock); } @@ -2836,8 +2714,7 @@ xlog_state_do_iclog_callbacks( STATIC void xlog_state_do_callback( - struct xlog *log, - bool aborted) + struct xlog *log) { struct xlog_in_core *iclog; struct xlog_in_core *first_iclog; @@ -2878,9 +2755,11 @@ xlog_state_do_callback( * we'll have to run at least one more complete loop. */ cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog, aborted); - - xlog_state_clean_iclog(log, iclog); + xlog_state_do_iclog_callbacks(log, iclog); + if (XLOG_FORCED_SHUTDOWN(log)) + wake_up_all(&iclog->ic_force_wait); + else + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2916,25 +2795,22 @@ xlog_state_do_callback( */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) { struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. 
*/ - if (iclog->ic_state == XLOG_STATE_SYNCING) + if (!XLOG_FORCED_SHUTDOWN(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; - else - ASSERT(iclog->ic_state == XLOG_STATE_IOERROR); + } /* * Someone could be sleeping prior to writing out the next @@ -2943,9 +2819,8 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -3064,21 +2939,21 @@ restart: *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ - -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +} + +/* + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one units worth. Also move grant reservation head + * forward. */ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3090,21 +2965,20 @@ xlog_regrant_reserve_log_space( ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); - - trace_xfs_log_regrant_reserve_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3120,18 +2994,19 @@ xlog_regrant_reserve_log_space( * space, the count will stay at zero and the only space remaining will be * in the current reservation field. 
*/ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3146,17 +3021,15 @@ xlog_ungrant_log_space( xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* - * This routine will mark the current iclog in the ring as WANT_SYNC - * and move the current iclog pointer to the next iclog in the ring. - * When this routine is called from xlog_state_get_iclog_space(), the - * exact size of the iclog has not yet been determined. All we know is - * that every data block. We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ STATIC void xlog_state_switch_iclogs( @@ -3165,6 +3038,8 @@ xlog_state_switch_iclogs( int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3199,7 +3074,7 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3259,9 +3134,6 @@ xfs_log_force( * previous iclog and go to sleep. */ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* @@ -3277,8 +3149,7 @@ xfs_log_force( if (xlog_state_release_iclog(log, iclog)) goto out_error; - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) goto out_unlock; } else { /* @@ -3298,17 +3169,8 @@ xfs_log_force( ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3339,9 +3201,6 @@ __xfs_log_force_lsn( goto out_unlock; } - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* * We sleep here if we haven't already slept (e.g. 
this is the @@ -3375,20 +3234,8 @@ __xfs_log_force_lsn( *log_flushed = 1; } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3435,33 +3282,6 @@ xfs_log_force_lsn( } /* - * Called when we want to mark the current iclog as being ready to sync to - * disk. - */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - -/* * Free a used ticket when its refcount falls to zero. */ void @@ -3609,7 +3429,6 @@ xlog_ticket_alloc( tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; @@ -3618,13 +3437,6 @@ xlog_ticket_alloc( return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3710,7 +3522,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. 
@@ -3813,7 +3625,7 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* @@ -3937,7 +3749,7 @@ xfs_log_force_umount( spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, true); + xlog_state_do_callback(log); /* return non-zero if log IOERROR transition had already happened */ return retval; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..1412d6993f1e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -105,10 +105,6 @@ struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); @@ -121,8 +117,7 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_release_iclog(struct xfs_mount *mp, - struct xlog_in_core *iclog); +void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, @@ -138,7 +133,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, bool regrant); -void xlog_cil_process_committed(struct list_head *list, bool aborted); +void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 48435cf2aa16..b43f0e8f43f2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -574,10 +574,10 @@ xlog_discard_busy_extents( */ static void xlog_cil_committed( - struct xfs_cil_ctx *ctx, - bool abort) + struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. @@ -613,37 +613,38 @@ xlog_cil_committed( void xlog_cil_process_committed( - struct list_head *list, - bool aborted) + struct list_head *list) { struct xfs_cil_ctx *ctx; while ((ctx = list_first_entry_or_null(list, struct xfs_cil_ctx, iclog_entry))) { list_del(&ctx->iclog_entry); - xlog_cil_committed(ctx, aborted); + xlog_cil_committed(ctx); } } /* - * Push the Committed Item List to the log. If @push_seq flag is zero, then it - * is a background flush and so we can chose to ignore it. Otherwise, if the - * current sequence is the same as @push_seq we need to do a flush. If - * @push_seq is less than the current sequence, then it has already been + * Push the Committed Item List to the log. + * + * If the current sequence is the same as xc_push_seq we need to do a flush. If + * xc_push_seq is less than the current sequence, then it has already been * flushed and we don't need to do anything - the caller will wait for it to * complete if necessary. * - * @push_seq is a value rather than a flag because that allows us to do an - * unlocked check of the sequence number for a match. Hence we can allows log - * forces to run racily and not issue pushes for the same sequence twice. 
If we - * get a race between multiple pushes for the same sequence they will block on - * the first one and then abort, hence avoiding needless pushes. + * xc_push_seq is checked unlocked against the sequence number for a match. + * Hence we can allow log forces to run racily and not issue pushes for the + * same sequence twice. If we get a race between multiple pushes for the same + * sequence they will block on the first one and then abort, hence avoiding + * needless pushes. */ -STATIC int -xlog_cil_push( - struct xlog *log) +static void +xlog_cil_push_work( + struct work_struct *work) { - struct xfs_cil *cil = log->l_cilp; + struct xfs_cil *cil = + container_of(work, struct xfs_cil, xc_push_work); + struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; @@ -657,9 +658,6 @@ xlog_cil_push( xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; - if (!cil) - return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -671,6 +669,11 @@ xlog_cil_push( ASSERT(push_seq <= ctx->sequence); /* + * Wake up any background push waiters now this context is being pushed. + */ + wake_up_all(&ctx->push_wait); + + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. @@ -746,6 +749,7 @@ xlog_cil_push( */ INIT_LIST_HEAD(&new_ctx->committing); INIT_LIST_HEAD(&new_ctx->busy_extents); + init_waitqueue_head(&new_ctx->push_wait); new_ctx->sequence = ctx->sequence + 1; new_ctx->cil = cil; cil->xc_ctx = new_ctx; @@ -803,7 +807,7 @@ xlog_cil_push( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; @@ -841,10 +845,11 @@ restart: } spin_unlock(&cil->xc_push_lock); - /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); - if (commit_lsn == -1) - goto out_abort; + error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + if (error) + goto out_abort_free_ticket; + + xfs_log_ticket_ungrant(log, tic); spin_lock(&commit_iclog->ic_callback_lock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { @@ -867,28 +872,20 @@ restart: spin_unlock(&cil->xc_push_lock); /* release the hounds! */ - return xfs_log_release_iclog(log->l_mp, commit_iclog); + xfs_log_release_iclog(commit_iclog); + return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); - return 0; + return; out_abort_free_ticket: - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); out_abort: - xlog_cil_committed(ctx, true); - return -EIO; -} - -static void -xlog_cil_push_work( - struct work_struct *work) -{ - struct xfs_cil *cil = container_of(work, struct xfs_cil, - xc_push_work); - xlog_cil_push(cil->xc_log); + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + xlog_cil_committed(ctx); } /* @@ -900,7 +897,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -914,14 +911,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. 
*/ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. + */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1017,7 +1036,10 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); @@ -1038,9 +1060,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1199,6 +1221,7 @@ xlog_cil_init( INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + init_waitqueue_head(&ctx->push_wait); ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b192c5a9f9fd..ec22c7a3867f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,13 +51,11 @@ enum xlog_iclog_state { }; /* - * Flags to log ticket + * Log ticket flags */ -#define XLOG_TIC_INITED 0x1 /* has been initialized */ -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ #define XLOG_TIC_FLAGS \ - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* @@ -242,6 +240,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ + wait_queue_head_t push_wait; /* background push throttle */ struct work_struct discard_endio_work; }; @@ -318,13 +317,53 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. 
If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. + * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows the CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. + * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yield the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. 
*/ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) + +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines @@ -402,7 +441,8 @@ struct xlog { #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) +#define XLOG_FORCED_SHUTDOWN(log) \ + (unlikely((log)->l_flags & XLOG_IO_ERROR)) /* common routines */ extern int @@ -438,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int -xlog_write( - struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags); +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, uint flags, + bool need_start_rec); +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, + struct xlog_in_core **iclog, xfs_lsn_t *lsn); +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -525,12 +565,6 @@ xlog_cil_force(struct xlog *log) } /* - * Unmount record type is used as a pseudo transaction type for the ticket. - * It's value must be outside the range of XFS_TRANS_* values. - */ -#define XLOG_UNMOUNT_REC_TYPE (-1U) - -/* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 25cfc85dbaa7..11c3502b07b1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2869,8 +2869,8 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ + ASSERT(dip->di_version >= 3); xfs_inode_from_disk(ip, dip); - ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); if (error) @@ -2997,7 +2997,7 @@ xlog_recover_inode_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_hascrc(&mp->m_sb) && + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less @@ -3068,7 +3068,7 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - isize = xfs_log_dinode_size(ldip->di_version); + isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip, @@ -4947,7 +4947,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_abort; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); @@ -5083,7 +5083,7 @@ xlog_recover_process_iunlinks( * buffer reference though, so that it stays pinned in memory * while we need the buffer. 
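The new XLOG_CIL_SPACE_LIMIT() above becomes the smaller of 1/8 of the log size and the fixed cap the comment describes as 16x the iclog buffer window (32MB), with the blocking limit at twice that. A small userspace sketch of the arithmetic, taking the 32MB figure straight from the comment (illustrative only), shows how the two limits behave across log sizes.

#include <stdio.h>

static long min_l(long a, long b) { return a < b ? a : b; }

int main(void)
{
	/* example log sizes in MB; 32MB is the cap quoted in the comment */
	long log_mb[] = { 64, 512, 2048 };
	long cap_mb = 32;

	for (int i = 0; i < 3; i++) {
		long push_mb = min_l(log_mb[i] / 8, cap_mb);	/* 12.5% vs 32MB cap */
		long block_mb = push_mb * 2;			/* blocking throttle */

		printf("log %4ldMB: background push at %2ldMB, throttle at %2ldMB\n",
		       log_mb[i], push_mb, block_mb);
	}
	return 0;
}

For a 64MB log the push threshold is 8MB (12.5% wins); for 512MB and larger logs the 32MB cap wins and the throttle kicks in at 64MB.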
*/ - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { @@ -5636,7 +5636,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ @@ -5809,7 +5809,6 @@ xlog_recover_check_summary( struct xlog *log) { xfs_mount_t *mp; - xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; @@ -5829,7 +5828,8 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agf read failed agno %d error %d", __func__, agno, error); } else { - agfp = XFS_BUF_TO_AGF(agfbp); + struct xfs_agf *agfp = agfbp->b_addr; + freeblks += be32_to_cpu(agfp->agf_freeblks) + be32_to_cpu(agfp->agf_flcount); xfs_buf_relse(agfbp); @@ -5840,7 +5840,7 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agi read failed agno %d error %d", __func__, agno, error); } else { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; itotal += be32_to_cpu(agi->agi_count); ifree += be32_to_cpu(agi->agi_freecount); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 56efe140c923..c5513e5a226a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -310,7 +310,7 @@ reread: /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 88ab09ed29e7..50c43422fa17 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -167,6 +167,7 @@ typedef struct xfs_mount { struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + struct ratelimit_state m_flush_inodes_ratelimit; struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0b0909657bad..c225691fad15 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -121,12 +121,11 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; dqp->dq_flags |= XFS_DQ_FREEING; @@ -139,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. 
We need @@ -149,6 +147,8 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + goto out_unlock; } xfs_dqflock(dqp); } @@ -174,6 +174,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* @@ -326,16 +330,16 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), + XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), + XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); @@ -871,12 +875,20 @@ xfs_qm_reset_dqcounts( ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; + + /* + * dquot id 0 stores the default grace period and the maximum + * warning limit that were set by the administrator, so we + * should not reset them. + */ + if (ddq->d_id != 0) { + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + } if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], @@ -1613,8 +1625,8 @@ xfs_qm_dqfree_one( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - xfs_dqid_t uid, - xfs_dqid_t gid, + kuid_t uid, + kgid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1622,6 +1634,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + struct user_namespace *user_ns = inode->i_sb->s_user_ns; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; @@ -1635,7 +1649,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) - gid = ip->i_d.di_gid; + gid = inode->i_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation @@ -1650,7 +1664,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { + if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode @@ -1661,7 +1675,8 @@ xfs_qm_vop_dqalloc( * holding ilock. 
*/ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); + error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), + XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1682,9 +1697,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { + if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); + error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), + XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1810,7 +1826,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1823,7 +1839,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1920,14 +1936,15 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1ea82764bf89..5d5ac65aa1cc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -29,8 +29,6 @@ xfs_qm_log_quotaoff( int error; struct xfs_qoff_logitem *qoffi; - *qoffstartp = NULL; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); if (error) goto out; @@ -62,7 +60,7 @@ out: STATIC int xfs_qm_log_quotaoff_end( struct xfs_mount *mp, - struct xfs_qoff_logitem *startqoff, + struct xfs_qoff_logitem **startqoff, uint flags) { struct xfs_trans *tp; @@ -73,9 +71,10 @@ xfs_qm_log_quotaoff_end( if (error) return error; - qoffi = xfs_trans_get_qoff_item(tp, startqoff, + qoffi = xfs_trans_get_qoff_item(tp, *startqoff, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); + *startqoff = NULL; /* * We have to make sure that the transaction is secure on disk before we @@ -103,7 +102,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart; + struct xfs_qoff_logitem *qoffstart = NULL; /* * No file system can have quotas enabled on disk but not in core. @@ -228,7 +227,7 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); if (error) { /* We're screwed now. Shutdown is the only option. 
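The quota hunks above stop passing raw xfs_dqid_t values around and instead carry kuid_t/kgid_t, comparing against the VFS inode with uid_eq()/gid_eq() and converting to an on-disk id through the superblock's user namespace only at the dqget call. Below is a minimal sketch of that idiom, not the XFS code; lookup_user_quota() is a hypothetical stand-in for the real dquot lookup, and group ids follow the same pattern with gid_eq()/from_kgid().

#include <linux/fs.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* hypothetical stand-in for the real dquot lookup by on-disk id */
int lookup_user_quota(struct super_block *sb, uid_t id);

static int demo_attach_user_quota(struct inode *inode, kuid_t uid)
{
	struct user_namespace *user_ns = inode->i_sb->s_user_ns;

	/* compare in kernel-internal kuid_t form */
	if (uid_eq(inode->i_uid, uid))
		return 0;	/* inode already owned by this user */

	/* convert to a namespace-relative id only for the lookup */
	return lookup_user_quota(inode->i_sb, from_kuid(user_ns, uid));
}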
*/ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -261,6 +260,8 @@ xfs_qm_scall_quotaoff( } out_unlock: + if (error && qoffstart) + xfs_qm_qoff_logitem_relse(qoffstart); mutex_unlock(&q->qi_quotaofflock); return error; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index efe42ae7a2f3..aa8fc1f55fbd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, @@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, +xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, prid_t prid, uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 113883c4f202..f70f1255220b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -57,13 +57,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) /* Loop over all stats groups */ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - len += snprintf(buf + len, PATH_MAX - len, "%s", + len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - len += snprintf(buf + len, PATH_MAX - len, " %u", + len += scnprintf(buf + len, PATH_MAX - len, " %u", counter_val(stats, j)); - len += snprintf(buf + len, PATH_MAX - len, "\n"); + len += scnprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { @@ -72,9 +72,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } - len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2094386af8ac..abf06bf9c3f3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -528,6 +528,9 @@ xfs_flush_inodes( { struct super_block *sb = mp->m_super; + if (!__ratelimit(&mp->m_flush_inodes_ratelimit)) + return; + if (down_read_trylock(&sb->s_umount)) { sync_inodes_sb(sb); up_read(&sb->s_umount); @@ -1366,6 +1369,17 @@ xfs_fc_fill_super( if (error) goto out_free_names; + /* + * Cap the number of invocations of xfs_flush_inodes to 16 for every + * quarter of a second. The magic numbers here were determined by + * observation neither to cause stalls in writeback when there are a + * lot of IO threads and the fs is near ENOSPC, nor cause any fstest + * regressions. YMMV. 
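The comment above, together with the ratelimit_state_init() call that immediately follows in the hunk, caps xfs_flush_inodes() at 16 invocations per quarter second using the kernel's ratelimit helpers. A minimal sketch of the same pattern follows; demo_fs, demo_flush() and expensive_flush() are illustrative stand-ins, not XFS symbols.

#include <linux/jiffies.h>
#include <linux/ratelimit.h>

struct demo_fs {
	struct ratelimit_state	flush_ratelimit;
};

static void demo_fs_init(struct demo_fs *fs)
{
	/* at most 16 calls per quarter second, mirroring the hunk above */
	ratelimit_state_init(&fs->flush_ratelimit, HZ / 4, 16);
	/* report suppressed calls only when the state is released */
	ratelimit_set_flags(&fs->flush_ratelimit, RATELIMIT_MSG_ON_RELEASE);
}

void expensive_flush(struct demo_fs *fs);	/* hypothetical expensive path */

static void demo_flush(struct demo_fs *fs)
{
	/* __ratelimit() returns nonzero while we are under the burst limit */
	if (!__ratelimit(&fs->flush_ratelimit))
		return;
	expensive_flush(fs);
}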
+ */ + ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16); + ratelimit_set_flags(&mp->m_flush_inodes_ratelimit, + RATELIMIT_MSG_ON_RELEASE); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; @@ -1861,7 +1875,8 @@ xfs_init_zones(void) xfs_ili_zone = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_MEM_SPREAD, NULL); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index d762d42ed0ff..13fb4b919648 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -176,15 +176,12 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, - xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -194,7 +191,7 @@ xfs_symlink( * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. */ - if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + if (pathlen <= XFS_LITINO(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index bc85b89f88ca..120398a37c2a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -27,6 +28,7 @@ #include "xfs_log_recover.h" #include "xfs_filestream.h" #include "xfs_fsmap.h" +#include "xfs_btree_staging.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e242988f57fb..a4323a63438d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -35,6 +35,12 @@ struct xfs_icreate_log; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; +union xfs_btree_ptr; + +#define XFS_ATTR_FILTER_FLAGS \ + { XFS_ATTR_ROOT, "ROOT" }, \ + { XFS_ATTR_SECURE, "SECURE" }, \ + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -45,39 +51,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s", + "buffer %p size %u count %u firstu %u filter %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, 
__entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS) ) ) @@ -169,31 +175,31 @@ TRACE_EVENT(xfs_attr_list_node_descend, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) __field(u32, bt_hashval) __field(u32, bt_before) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; __entry->bt_hashval = be32_to_cpu(btree->hashval); __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s " + "buffer %p size %u count %u firstu %u filter %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -201,12 +207,12 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), __entry->bt_hashval, __entry->bt_before) ); @@ -995,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) -DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); -DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); @@ -1005,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1701,7 +1706,8 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) - __field(int, flags) + __field(unsigned int, attr_filter) + __field(unsigned int, attr_flags) __field(int, 
op_flags) ), TP_fast_assign( @@ -1712,11 +1718,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; - __entry->flags = args->flags; + __entry->attr_filter = args->attr_filter; + __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x flags %s op_flags %s", + "hashval 0x%x filter %s flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1724,7 +1731,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), + __print_flags(__entry->attr_flags, "|", + { XATTR_CREATE, "CREATE" }, + { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -3594,6 +3605,151 @@ TRACE_EVENT(xfs_check_new_dalign, __entry->calc_rootino) ) +TRACE_EVENT(xfs_btree_commit_afakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, levels) + __field(unsigned int, blocks) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = cur->bc_ag.agno; + __entry->agbno = cur->bc_ag.afake->af_root; + __entry->levels = cur->bc_ag.afake->af_levels; + __entry->blocks = cur->bc_ag.afake->af_blocks; + ), + TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->levels, + __entry->blocks, + __entry->agbno) +) + +TRACE_EVENT(xfs_btree_commit_ifakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, levels) + __field(unsigned int, blocks) + __field(int, whichfork) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->levels = cur->bc_ino.ifake->if_levels; + __entry->blocks = cur->bc_ino.ifake->if_blocks; + __entry->whichfork = cur->bc_ino.whichfork; + ), + TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->agino, + __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __entry->levels, + __entry->blocks) +) + +TRACE_EVENT(xfs_btree_bload_level_geometry, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t nr_this_level, unsigned int nr_per_block, + unsigned int desired_npb, uint64_t blocks, + uint64_t blocks_with_extra), + TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks, + blocks_with_extra), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned int, nlevels) + __field(uint64_t, nr_this_level) + __field(unsigned int, nr_per_block) + __field(unsigned int, desired_npb) + __field(unsigned long long, blocks) + __field(unsigned long long, blocks_with_extra) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->nr_this_level = nr_this_level; + __entry->nr_per_block = nr_per_block; + __entry->desired_npb = desired_npb; + __entry->blocks = blocks; + __entry->blocks_with_extra = blocks_with_extra; + ), + TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->nlevels, + __entry->nr_this_level, + __entry->nr_per_block, + __entry->desired_npb, + __entry->blocks, + __entry->blocks_with_extra) +) + +TRACE_EVENT(xfs_btree_bload_block, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t block_idx, uint64_t nr_blocks, + union xfs_btree_ptr *ptr, unsigned int nr_records), + TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned long long, block_idx) + __field(unsigned long long, nr_blocks) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, nr_records) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->block_idx = block_idx; + __entry->nr_blocks = nr_blocks; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); + } else { + __entry->agno = cur->bc_ag.agno; + __entry->agbno = be32_to_cpu(ptr->s); + } + __entry->nr_records = nr_records; + ), + TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->block_idx, + __entry->nr_blocks, + __entry->agno, + __entry->agbno, + __entry->nr_records) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3b208f9a865c..28b983ff8b11 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,6 +9,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -150,8 +151,9 @@ xfs_trans_reserve( uint blocks, uint rtextents) { - int error = 0; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ 
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -162,7 +164,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; @@ -191,9 +193,9 @@ xfs_trans_reserve( if (tp->t_ticket != NULL) { ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, @@ -213,7 +215,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + error = xfs_mod_frextents(mp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -229,7 +231,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -237,7 +239,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); + xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -306,6 +308,11 @@ xfs_trans_alloc( * * Note the zero-length reservation; this transaction MUST be cancelled * without any dirty data. + * + * Callers should obtain freeze protection to avoid two conflicts with fs + * freezing: (1) having active transactions trip the m_active_trans ASSERTs; + * and (2) grabbing buffers at the same time that freeze is trying to drain + * the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -450,7 +457,7 @@ xfs_trans_apply_sb_deltas( int whole = 0; bp = xfs_trans_getsb(tp, tp->t_mountp); - sbp = XFS_BUF_TO_SBP(bp); + sbp = bp->b_addr; /* * Check that superblock mods match the mods made to AGF counters. @@ -999,9 +1006,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); - if (commit_lsn == -1 && !error) - error = -EIO; + if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + else + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1060,7 +1068,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_done(mp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 00cc5b8734be..564253550b75 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,6 +32,7 @@ STATIC void xfs_ail_check( struct xfs_ail *ailp, struct xfs_log_item *lip) + __must_hold(&ailp->ail_lock) { struct xfs_log_item *prev_lip; struct xfs_log_item *next_lip; @@ -108,17 +109,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. 
*/ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -529,8 +538,9 @@ xfsaild( { struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + unsigned int noreclaim_flag; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); set_freezable(); while (1) { @@ -601,6 +611,7 @@ xfsaild( tout = xfsaild_push(ailp); } + memalloc_noreclaim_restore(noreclaim_flag); return 0; } @@ -678,6 +689,28 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. * @@ -709,7 +742,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -724,9 +757,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -737,23 +771,23 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); - - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } + xfs_ail_update_finish(ailp, tail_lsn); } -bool +/* + * Delete one log item from the AIL. + * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). 
+ */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); @@ -761,7 +795,9 @@ xfs_ail_delete_one( clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } /** @@ -789,10 +825,10 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + xfs_lsn_t tail_lsn; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -805,17 +841,8 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..35655eac01a6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,9 +91,11 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) + __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); + int shutdown_type); static inline void xfs_trans_ail_remove( diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index b0fedb543f97..fc5d7276026e 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,53 +12,30 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); - int error, asize = size; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - value = NULL; - } + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .name = name, + .namelen = strlen(name), + .value = value, + .valuelen = size, + }; + int error; - error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, - &asize, xflags); + error = xfs_attr_get(&args); if (error) return error; - return asize; -} - -void -xfs_forget_acl( - struct inode *inode, - const char *name, - int xflags) -{ - /* - * Invalidate any cached ACLs if the user has bypassed the ACL - * interface. We don't validate the content whatsoever so it is caller - * responsibility to provide data in valid format and ensure i_mode is - * consistent. 
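The AIL changes above follow two related idioms: a double-underscore helper (__xfs_ail_min_lsn) that assumes the caller holds ail_lock, paired with a plain locked wrapper, and a delete path that returns the old tail LSN so xfs_ail_update_finish() only recomputes the log tail and wakes waiters when the minimum actually moved. Here is a self-contained sketch of both, assuming an LSN-ordered list; the demo_* names are illustrative and demo_update_log_tail() is a hypothetical stand-in for the real tail update.

#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_item {
	struct list_head	list;
	unsigned long long	lsn;
};

struct demo_ail {
	spinlock_t		lock;
	struct list_head	head;	/* sorted, oldest (tail) item first */
};

/* caller must hold ail->lock */
static unsigned long long __demo_min_lsn(struct demo_ail *ail)
{
	struct demo_item *item = list_first_entry_or_null(&ail->head,
						struct demo_item, list);

	return item ? item->lsn : 0;
}

/* plain wrapper for callers that do not already hold ail->lock */
static unsigned long long demo_min_lsn(struct demo_ail *ail)
{
	unsigned long long lsn;

	spin_lock(&ail->lock);
	lsn = __demo_min_lsn(ail);
	spin_unlock(&ail->lock);
	return lsn;
}

/*
 * Caller must hold ail->lock.  Returns the removed item's LSN if it was the
 * tail (oldest) item, else 0, so the caller can tell whether the tail moved.
 */
static unsigned long long demo_delete_one(struct demo_ail *ail,
					  struct demo_item *item)
{
	struct demo_item *tail = list_first_entry_or_null(&ail->head,
						struct demo_item, list);
	unsigned long long lsn = item->lsn;

	list_del_init(&item->list);
	return (tail == item) ? lsn : 0;
}

void demo_update_log_tail(struct demo_ail *ail);	/* hypothetical */

/* called with ail->lock held; always drops it */
static void demo_update_finish(struct demo_ail *ail, unsigned long long old_tail)
{
	/* tail did not move: nothing to recompute, no one to wake */
	if (!old_tail || old_tail == __demo_min_lsn(ail)) {
		spin_unlock(&ail->lock);
		return;
	}
	demo_update_log_tail(ail);
	spin_unlock(&ail->lock);
}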
- */ - if (xflags & ATTR_ROOT) { -#ifdef CONFIG_XFS_POSIX_ACL - if (!strcmp(name, SGI_ACL_FILE)) - forget_cached_acl(inode, ACL_TYPE_ACCESS); - else if (!strcmp(name, SGI_ACL_DEFAULT)) - forget_cached_acl(inode, ACL_TYPE_DEFAULT); -#endif - } + return args.valuelen; } static int @@ -66,25 +43,20 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .attr_flags = flags, + .name = name, + .namelen = strlen(name), + .value = (void *)value, + .valuelen = size, + }; int error; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - - if (value) - error = xfs_attr_set(ip, name, namelen, (void *)value, size, - xflags); - else - error = xfs_attr_remove(ip, name, namelen, xflags); - if (!error) - xfs_forget_acl(inode, name, xflags); + error = xfs_attr_set(&args); + if (!error && (handler->flags & XFS_ATTR_ROOT)) + xfs_forget_acl(inode, name); return error; } @@ -97,14 +69,14 @@ static const struct xattr_handler xfs_xattr_user_handler = { static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .flags = ATTR_ROOT, + .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .flags = ATTR_SECURE, + .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; @@ -134,7 +106,7 @@ __xfs_xattr_put_listent( if (context->count < 0 || context->seen_enough) return; - if (!context->alist) + if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; @@ -143,7 +115,7 @@ __xfs_xattr_put_listent( context->seen_enough = 1; return; } - offset = (char *)context->alist + context->count; + offset = context->buffer + context->count; strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ @@ -218,7 +190,6 @@ xfs_vn_listxattr( size_t size) { struct xfs_attr_list_context context; - struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); int error; @@ -227,14 +198,13 @@ xfs_vn_listxattr( */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); - context.cursor = &cursor; context.resynch = 1; - context.alist = size ? data : NULL; + context.buffer = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - error = xfs_attr_list_int(&context); + error = xfs_attr_list(&context); if (error) return error; if (context.count < 0) |
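The listxattr path at the end of the hunk above fills the user buffer with one "<prefix><name>\0" entry per attribute and, when no buffer is supplied, only accumulates the size needed. A simplified, self-contained sketch of that buffer format follows; demo_put_listent() and its context are illustrative, not the XFS callback, and the overflow check is reduced to a plain bufsize comparison.

#include <stddef.h>
#include <string.h>

struct demo_listent_ctx {
	char	*buffer;	/* NULL when only sizing the result */
	size_t	bufsize;
	size_t	count;		/* bytes used (or needed) so far */
	int	seen_enough;
};

static void demo_put_listent(struct demo_listent_ctx *ctx,
			     const char *prefix, const char *name)
{
	size_t prefix_len = strlen(prefix);
	size_t namelen = strlen(name);
	size_t needed = prefix_len + namelen + 1;	/* +1 for the NUL */

	if (!ctx->buffer) {
		ctx->count += needed;	/* size-only pass */
		return;
	}
	if (ctx->count + needed > ctx->bufsize) {
		ctx->seen_enough = 1;	/* caller would typically return -ERANGE */
		return;
	}
	memcpy(ctx->buffer + ctx->count, prefix, prefix_len);
	memcpy(ctx->buffer + ctx->count + prefix_len, name, namelen);
	ctx->buffer[ctx->count + prefix_len + namelen] = '\0';
	ctx->count += needed;
}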