diff options
Diffstat (limited to 'fs')
480 files changed, 17619 insertions, 11310 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8d97f0b45e9c..795706520b5e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data) umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) return 0; /* compare qid details */ @@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) goto out; /* diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1dc7af046615..e1c0240b51c0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) return 0; if (inode->i_generation != st->st_gen) @@ -663,14 +663,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) v9fs_i_size_write(inode, stat->st_size); @@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) goto out; /* diff --git a/fs/Kconfig b/fs/Kconfig index a55bda4233bb..97e7b77c9309 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig" menu "Caches" +source "fs/netfs/Kconfig" source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c6f1c8c1934e..06fb7a93a1bd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK config BINFMT_FLAT_OLD_ALWAYS_RAM bool +config BINFMT_FLAT_NO_DATA_START_OFFSET + bool + config BINFMT_FLAT_OLD bool "Enable support for very old legacy flat binaries" depends on BINFMT_FLAT diff --git a/fs/Makefile b/fs/Makefile index 3215fe205256..9c708e1fbe8f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -67,6 +67,7 @@ obj-y += devpts/ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line +obj-$(CONFIG_NETFS_SUPPORT) += netfs/ obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 1ad211d72b3b..fc8ba9142f2f 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -4,6 +4,7 @@ config AFS_FS depends on INET select AF_RXRPC select DNS_RESOLVER + select NETFS_SUPPORT help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 714fcca9af99..9fbe5a5ec9bd 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -70,7 +70,6 @@ const struct inode_operations afs_dir_inode_operations = { .permission = afs_permission, .getattr = afs_getattr, .setattr = afs_setattr, - .listxattr = afs_listxattr, }; const struct address_space_operations afs_dir_aops = { @@ -104,6 +103,35 @@ struct afs_lookup_cookie { }; /* + * Drop the refs that we're holding on the pages we were reading into. We've + * got refs on the first nr_pages pages. + */ +static void afs_dir_read_cleanup(struct afs_read *req) +{ + struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + BUG_ON(xa_is_value(page)); + BUG_ON(PageCompound(page)); + ASSERTCMP(page->mapping, ==, mapping); + + put_page(page); + } + + rcu_read_unlock(); +} + +/* * check that a directory page is valid */ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, @@ -128,7 +156,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = kmap(page); + dbuf = kmap_atomic(page); for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", @@ -147,7 +175,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } - kunmap(page); + kunmap_atomic(dbuf); checked: afs_stat_v(dvnode, n_read_dir); @@ -158,35 +186,74 @@ error: } /* - * Check the contents of a directory that we've just read. + * Dump the contents of a directory. */ -static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req) +static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { struct afs_xdr_dir_page *dbuf; - unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + pgoff_t last = req->nr_pages - 1; - for (i = 0; i < req->nr_pages; i++) - if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len)) - goto bad; - return true; + XA_STATE(xas, &mapping->i_pages, 0); -bad: - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n", + pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len, req->remain); - pr_warn("DIR %llx %x %x %x\n", - req->pos, req->index, req->nr_pages, req->offset); + req->file_size, req->len, req->actual_len); + pr_warn("DIR %llx %x %zx %zx\n", + req->pos, req->nr_pages, + req->iter->iov_offset, iov_iter_count(req->iter)); - for (i = 0; i < req->nr_pages; i++) { - dbuf = kmap(req->pages[i]); - for (j = 0; j < qty; j++) { - union afs_xdr_dir_block *block = &dbuf->blocks[j]; + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + dbuf = kmap_atomic(page); + for (i = 0; i < qty; i++) { + union afs_xdr_dir_block *block = &dbuf->blocks[i]; - pr_warn("[%02x] %32phN\n", i * qty + j, block); + pr_warn("[%02lx] %32phN\n", page->index * qty + i, block); } - kunmap(req->pages[i]); + kunmap_atomic(dbuf); } - return false; +} + +/* + * Check all the pages in a directory. All the pages are held pinned. + */ +static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +{ + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + int ret = 0; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return 0; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + if (!afs_dir_check_page(dvnode, page, req->file_size)) { + afs_dir_dump(dvnode, req); + ret = -EIO; + break; + } + } + + rcu_read_unlock(); + return ret; } /* @@ -215,57 +282,57 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) { struct afs_read *req; loff_t i_size; - int nr_pages, nr_inline, i, n; - int ret = -ENOMEM; + int nr_pages, i, n; + int ret; + + _enter(""); -retry: + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->vnode = dvnode; + req->key = key_get(key); + req->cleanup = afs_dir_read_cleanup; + +expand: i_size = i_size_read(&dvnode->vfs_inode); - if (i_size < 2048) - return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small)); + if (i_size < 2048) { + ret = afs_bad(dvnode, afs_file_error_dir_small); + goto error; + } if (i_size > 2048 * 1024) { trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - return ERR_PTR(-EFBIG); + ret = -EFBIG; + goto error; } _enter("%llu", i_size); - /* Get a request record to hold the page list. We want to hold it - * inline if we can, but we don't want to make an order 1 allocation. - */ nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; - nr_inline = nr_pages; - if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) - nr_inline = 0; - req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - refcount_set(&req->usage, 1); - req->nr_pages = nr_pages; req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - if (nr_inline > 0) { - req->pages = req->array; - } else { - req->pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!req->pages) - goto error; - } + iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + 0, i_size); + req->iter = &req->def_iter; - /* Get a list of all the pages that hold or will hold the directory - * content. We need to fill in any gaps that we might find where the - * memory reclaimer has been at work. If there are any gaps, we will + /* Fill in any gaps that we might find where the memory reclaimer has + * been at work and pin all the pages. If there are any gaps, we will * need to reread the entire directory contents. */ - i = 0; - do { + i = req->nr_pages; + while (i < nr_pages) { + struct page *pages[8], *page; + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, - req->nr_pages - i, - req->pages + i); - _debug("find %u at %u/%u", n, i, req->nr_pages); + min_t(unsigned int, nr_pages - i, + ARRAY_SIZE(pages)), + pages); + _debug("find %u at %u/%u", n, i, nr_pages); + if (n == 0) { gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; @@ -273,22 +340,24 @@ retry: afs_stat_v(dvnode, n_inval); ret = -ENOMEM; - req->pages[i] = __page_cache_alloc(gfp); - if (!req->pages[i]) + page = __page_cache_alloc(gfp); + if (!page) goto error; - ret = add_to_page_cache_lru(req->pages[i], + ret = add_to_page_cache_lru(page, dvnode->vfs_inode.i_mapping, i, gfp); if (ret < 0) goto error; - attach_page_private(req->pages[i], (void *)1); - unlock_page(req->pages[i]); + attach_page_private(page, (void *)1); + unlock_page(page); + req->nr_pages++; i++; } else { + req->nr_pages += n; i += n; } - } while (i < req->nr_pages); + } /* If we're going to reload, we need to lock all the pages to prevent * races. @@ -306,18 +375,23 @@ retry: if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, key, req); + ret = afs_fetch_data(dvnode, req); if (ret < 0) goto error_unlock; task_io_account_read(PAGE_SIZE * req->nr_pages); - if (req->len < req->file_size) - goto content_has_grown; + if (req->len < req->file_size) { + /* The content has grown, so we need to expand the + * buffer. + */ + up_write(&dvnode->validate_lock); + goto expand; + } /* Validate the data we just read. */ - ret = -EIO; - if (!afs_dir_check_pages(dvnode, req)) + ret = afs_dir_check(dvnode, req); + if (ret < 0) goto error_unlock; // TODO: Trim excess pages @@ -335,11 +409,6 @@ error: afs_put_read(req); _leave(" = %d", ret); return ERR_PTR(ret); - -content_has_grown: - up_write(&dvnode->validate_lock); - afs_put_read(req); - goto retry; } /* @@ -449,6 +518,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct afs_read *req; struct page *page; unsigned blkoff, limit; + void __rcu **slot; int ret; _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); @@ -473,9 +543,15 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); /* Fetch the appropriate page from the directory and re-add it - * to the LRU. + * to the LRU. We have all the pages pinned with an extra ref. */ - page = req->pages[blkoff / PAGE_SIZE]; + rcu_read_lock(); + page = NULL; + slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages, + blkoff / PAGE_SIZE); + if (slot) + page = radix_tree_deref_slot(slot); + rcu_read_unlock(); if (!page) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; @@ -1343,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; @@ -1424,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1560,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. */ @@ -1642,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1716,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -1911,6 +1992,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -2007,6 +2090,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) + if (offset == 0 && length == thp_size(page)) detach_page_private(page); } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 04f75a44f243..dae9a57d7ec0 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].op_unlinked = true; op->file[1].update_ctime = true; diff --git a/fs/afs/file.c b/fs/afs/file.c index 85f5adf21aa0..db035ae2a134 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,6 +14,7 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/netfs.h> #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -22,8 +23,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +static void afs_readahead(struct readahead_control *ractl); const struct file_operations afs_file_operations = { .open = afs_open, @@ -43,12 +43,11 @@ const struct inode_operations afs_file_inode_operations = { .getattr = afs_getattr, .setattr = afs_setattr, .permission = afs_permission, - .listxattr = afs_listxattr, }; const struct address_space_operations afs_fs_aops = { .readpage = afs_readpage, - .readpages = afs_readpages, + .readahead = afs_readahead, .set_page_dirty = afs_set_page_dirty, .launder_page = afs_launder_page, .releasepage = afs_releasepage, @@ -185,41 +184,50 @@ int afs_release(struct inode *inode, struct file *file) } /* + * Allocate a new read record. + */ +struct afs_read *afs_alloc_read(gfp_t gfp) +{ + struct afs_read *req; + + req = kzalloc(sizeof(struct afs_read), gfp); + if (req) + refcount_set(&req->usage, 1); + + return req; +} + +/* * Dispose of a ref to a read record. */ void afs_put_read(struct afs_read *req) { - int i; - if (refcount_dec_and_test(&req->usage)) { - if (req->pages) { - for (i = 0; i < req->nr_pages; i++) - if (req->pages[i]) - put_page(req->pages[i]); - if (req->pages != req->array) - kfree(req->pages); - } + if (req->cleanup) + req->cleanup(req); + key_put(req->key); kfree(req); } } -#ifdef CONFIG_AFS_FSCACHE -/* - * deal with notification that a page was read from the cache - */ -static void afs_file_readpage_read_complete(struct page *page, - void *data, - int error) +static void afs_fetch_data_notify(struct afs_operation *op) { - _enter("%p,%p,%d", page, data, error); - - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) - SetPageUptodate(page); - unlock_page(page); + struct afs_read *req = op->fetch.req; + struct netfs_read_subrequest *subreq = req->subreq; + int error = op->error; + + if (error == -ECONNABORTED) + error = afs_abort_to_error(op->ac.abort_code); + req->error = error; + + if (subreq) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + req->subreq = NULL; + } else if (req->done) { + req->done(req); + } } -#endif static void afs_fetch_data_success(struct afs_operation *op) { @@ -229,10 +237,12 @@ static void afs_fetch_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + afs_fetch_data_notify(op); } static void afs_fetch_data_put(struct afs_operation *op) { + op->fetch.req->error = op->error; afs_put_read(op->fetch.req); } @@ -241,13 +251,14 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, .aborted = afs_check_for_remote_deletion, + .failed = afs_fetch_data_notify, .put = afs_fetch_data_put, }; /* * Fetch file data from the volume. */ -int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) +int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) { struct afs_operation *op; @@ -256,11 +267,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(key)); + key_serial(req->key)); - op = afs_alloc_operation(key, vnode->volume); - if (IS_ERR(op)) + op = afs_alloc_operation(req->key, vnode->volume); + if (IS_ERR(op)) { + if (req->subreq) + netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); return PTR_ERR(op); + } afs_op_set_vnode(op, 0, vnode); @@ -269,336 +283,103 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re return afs_do_sync_operation(op); } -/* - * read page from file, directory or symlink, given a key to use - */ -int afs_page_filler(void *data, struct page *page) +static void afs_req_issue_op(struct netfs_read_subrequest *subreq) { - struct inode *inode = page->mapping->host; - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_read *req; - struct key *key = data; - int ret; - - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct afs_read *fsreq; - BUG_ON(!PageLocked(page)); + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return netfs_subreq_terminated(subreq, -ENOMEM, false); - ret = -ESTALE; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto error; + fsreq->subreq = subreq; + fsreq->pos = subreq->start + subreq->transferred; + fsreq->len = subreq->len - subreq->transferred; + fsreq->key = subreq->rreq->netfs_priv; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; - /* is it cached? */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_page(vnode->cache, - page, - afs_file_readpage_read_complete, - NULL, - GFP_KERNEL); -#else - ret = -ENOBUFS; -#endif - switch (ret) { - /* read BIO submitted (page in cache) */ - case 0: - break; - - /* page not yet cached */ - case -ENODATA: - _debug("cache said ENODATA"); - goto go_on; - - /* page will not be cached */ - case -ENOBUFS: - _debug("cache said ENOBUFS"); - - fallthrough; - default: - go_on: - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - goto enomem; - - /* We request a full page. If the page is a partial one at the - * end of the file, the server will return a short read and the - * unmarshalling code will clear the unfilled space. - */ - refcount_set(&req->usage, 1); - req->pos = (loff_t)page->index << PAGE_SHIFT; - req->len = PAGE_SIZE; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - /* read the contents of the file from the server into the - * page */ - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } + iov_iter_xarray(&fsreq->def_iter, READ, + &fsreq->vnode->vfs_inode.i_mapping->i_pages, + fsreq->pos, fsreq->len); -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - BUG_ON(PageFsCache(page)); - - if (ret == -EINTR || - ret == -ENOMEM || - ret == -ERESTARTSYS || - ret == -EAGAIN) - goto error; - goto io_error; - } + afs_fetch_data(fsreq->vnode, fsreq); +} - SetPageUptodate(page); +static int afs_symlink_readpage(struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_read *fsreq; + int ret; - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - } + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return -ENOMEM; - _leave(" = 0"); - return 0; + fsreq->pos = page->index * PAGE_SIZE; + fsreq->len = PAGE_SIZE; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; + iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages, + fsreq->pos, fsreq->len); -io_error: - SetPageError(page); - goto error; -enomem: - ret = -ENOMEM; -error: - unlock_page(page); - _leave(" = %d", ret); + ret = afs_fetch_data(fsreq->vnode, fsreq); + page_endio(page, false, ret); return ret; } -/* - * read page from file, directory or symlink, given a file to nominate the key - * to be used - */ -static int afs_readpage(struct file *file, struct page *page) +static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) { - struct key *key; - int ret; - - if (file) { - key = afs_file_key(file); - ASSERT(key != NULL); - ret = afs_page_filler(key, page); - } else { - struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - } else { - ret = afs_page_filler(key, page); - key_put(key); - } - } - return ret; + rreq->netfs_priv = key_get(afs_file_key(file)); } -/* - * Make pages available as they're filled. - */ -static void afs_readpages_page_done(struct afs_read *req) +static bool afs_is_cache_enabled(struct inode *inode) { -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = req->vnode; -#endif - struct page *page = req->pages[req->index]; + struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - req->pages[req->index] = NULL; - SetPageUptodate(page); - - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - put_page(page); + return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); } -/* - * Read a contiguous set of pages. - */ -static int afs_readpages_one(struct file *file, struct address_space *mapping, - struct list_head *pages) +static int afs_begin_cache_operation(struct netfs_read_request *rreq) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct afs_read *req; - struct list_head *p; - struct page *first, *page; - struct key *key = afs_file_key(file); - pgoff_t index; - int ret, n, i; - - /* Count the number of contiguous pages at the front of the list. Note - * that the list goes prev-wards rather than next-wards. - */ - first = lru_to_page(pages); - index = first->index + 1; - n = 1; - for (p = first->lru.prev; p != pages; p = p->prev) { - page = list_entry(p, struct page, lru); - if (page->index != index) - break; - index++; - n++; - } - - req = kzalloc(struct_size(req, array, n), GFP_NOFS); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->vnode = vnode; - req->page_done = afs_readpages_page_done; - req->pos = first->index; - req->pos <<= PAGE_SHIFT; - req->pages = req->array; - - /* Transfer the pages to the request. We add them in until one fails - * to add to the LRU and then we stop (as that'll make a hole in the - * contiguous run. - * - * Note that it's possible for the file size to change whilst we're - * doing this, but we rely on the server returning less than we asked - * for if the file shrank. We also rely on this to deal with a partial - * page at the end of the file. - */ - do { - page = lru_to_page(pages); - list_del(&page->lru); - index = page->index; - if (add_to_page_cache_lru(page, mapping, index, - readahead_gfp_mask(mapping))) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - put_page(page); - break; - } - - req->pages[req->nr_pages++] = page; - req->len += PAGE_SIZE; - } while (req->nr_pages < n); + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - if (req->nr_pages == 0) { - kfree(req); - return 0; - } - - ret = afs_fetch_data(vnode, key, req); - if (ret < 0) - goto error; - - task_io_account_read(PAGE_SIZE * req->nr_pages); - afs_put_read(req); - return 0; - -error: - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - - for (i = 0; i < req->nr_pages; i++) { - page = req->pages[i]; - if (page) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - SetPageError(page); - unlock_page(page); - } - } - - afs_put_read(req); - return ret; + return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); } -/* - * read a set of pages - */ -static int afs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, + struct page *page, void **_fsdata) { - struct key *key = afs_file_key(file); - struct afs_vnode *vnode; - int ret = 0; - - _enter("{%d},{%lu},,%d", - key_serial(key), mapping->host->i_ino, nr_pages); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - ASSERT(key != NULL); + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; +} - vnode = AFS_FS_I(mapping->host); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; - } +static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +{ + key_put(netfs_priv); +} - /* attempt to read as many of the pages as possible */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_pages(vnode->cache, - mapping, - pages, - &nr_pages, - afs_file_readpage_read_complete, - NULL, - mapping_gfp_mask(mapping)); -#else - ret = -ENOBUFS; -#endif +const struct netfs_read_request_ops afs_req_ops = { + .init_rreq = afs_init_rreq, + .is_cache_enabled = afs_is_cache_enabled, + .begin_cache_operation = afs_begin_cache_operation, + .check_write_begin = afs_check_write_begin, + .issue_op = afs_req_issue_op, + .cleanup = afs_priv_cleanup, +}; - switch (ret) { - /* all pages are being read from the cache */ - case 0: - BUG_ON(!list_empty(pages)); - BUG_ON(nr_pages != 0); - _leave(" = 0 [reading all]"); - return 0; - - /* there were pages that couldn't be read from the cache */ - case -ENODATA: - case -ENOBUFS: - break; - - /* other error */ - default: - _leave(" = %d", ret); - return ret; - } +static int afs_readpage(struct file *file, struct page *page) +{ + if (!file) + return afs_symlink_readpage(page); - while (!list_empty(pages)) { - ret = afs_readpages_one(file, mapping, pages); - if (ret < 0) - break; - } + return netfs_readpage(file, page, &afs_req_ops, NULL); +} - _leave(" = %d [netting]", ret); - return ret; +static void afs_readahead(struct readahead_control *ractl) +{ + netfs_readahead(ractl, &afs_req_ops, NULL); } /* @@ -626,8 +407,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, return; /* We may need to shorten the dirty region */ - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (t <= offset || f >= end) return; /* Doesn't overlap */ @@ -645,17 +426,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, if (f == t) goto undirty; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page); return; undirty: - trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page); clear_page_dirty_for_io(page); full_invalidate: - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page); + detach_page_private(page); } /* @@ -670,20 +451,10 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, BUG_ON(!PageLocked(page)); -#ifdef CONFIG_AFS_FSCACHE - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - if (PageFsCache(page)) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); - } - } -#endif - if (PagePrivate(page)) afs_invalidate_dirty(page, offset, length); + wait_on_page_fscache(page); _leave(""); } @@ -694,7 +465,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, static int afs_releasepage(struct page *page, gfp_t gfp_flags) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, @@ -703,16 +473,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { - _leave(" = F [cache busy]"); - return 0; + if (PageFsCache(page)) { + if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + return false; + wait_on_page_fscache(page); } #endif if (PagePrivate(page)) { - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("rel"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("rel"), page); + detach_page_private(page); } /* indicate that the page can be released */ diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 97cab12b0a6c..d222dfbe976b 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param * vp->cb_break_before = afs_calc_vnode_cb_break(vnode); if (vnode->lock_state != AFS_VNODE_LOCK_NONE) op->flags |= AFS_OPERATION_CUR_ONLY; + if (vp->modification) + set_bit(AFS_VNODE_MODIFYING, &vnode->flags); } if (vp->fid.vnode) @@ -181,10 +183,13 @@ void afs_wait_for_operation(struct afs_operation *op) if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); - else + else if (op->ops->issue_afs_rpc) op->ops->issue_afs_rpc(op); + else + op->ac.error = -ENOTSUPP; - op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + if (op->call) + op->error = afs_wait_for_call_to_complete(op->call, &op->ac); } switch (op->error) { @@ -195,8 +200,10 @@ void afs_wait_for_operation(struct afs_operation *op) case -ECONNABORTED: if (op->ops->aborted) op->ops->aborted(op); - break; + fallthrough; default: + if (op->ops->failed) + op->ops->failed(op); break; } @@ -220,6 +227,10 @@ int afs_put_operation(struct afs_operation *op) if (op->ops && op->ops->put) op->ops->put(op); + if (op->file[0].modification) + clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags); + if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) + clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) iput(&op->file[0].vnode->vfs_inode); if (op->file[1].put_vnode) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d95ed9dd86e..2f695a260442 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -10,6 +10,7 @@ #include <linux/sched.h> #include <linux/circ_buf.h> #include <linux/iversion.h> +#include <linux/netfs.h> #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" @@ -302,17 +303,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu,%zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -322,7 +321,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into + * ->actual_len. This may indicate more or less data than was + * requested will be returned. + */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -331,44 +333,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -410,17 +393,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -494,6 +466,8 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = call->debug_id; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); @@ -1079,8 +1053,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = { /* * store a set of pages to a very large file */ -static void afs_fs_store_data64(struct afs_operation *op, - loff_t pos, loff_t size, loff_t i_size) +static void afs_fs_store_data64(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; @@ -1095,7 +1068,7 @@ static void afs_fs_store_data64(struct afs_operation *op, if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1111,47 +1084,38 @@ static void afs_fs_store_data64(struct afs_operation *op, *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(upper_32_bits(pos)); - *bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(upper_32_bits(size)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(upper_32_bits(i_size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = htonl(upper_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(upper_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(upper_32_bits(op->store.i_size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } /* - * store a set of pages + * Write data to a file on the server. */ void afs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long) size, (unsigned long long) pos, - (unsigned long long) i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); - if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) || - upper_32_bits(pos + size)) - return afs_fs_store_data64(op, pos, size, i_size); + if (upper_32_bits(op->store.pos) || + upper_32_bits(op->store.size) || + upper_32_bits(op->store.i_size)) + return afs_fs_store_data64(op); call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, @@ -1159,7 +1123,7 @@ void afs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1175,9 +1139,9 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1156b2df28d3..80b6c8d967d5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -27,7 +27,6 @@ static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, - .listxattr = afs_listxattr, }; static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) @@ -103,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, switch (status->type) { case AFS_FTYPE_FILE: - inode->i_mode = S_IFREG | status->mode; + inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: - inode->i_mode = S_IFDIR | status->mode; + inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; @@ -199,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op, if (status->mode != vnode->status.mode) { mode = inode->i_mode; mode &= ~S_IALLUGO; - mode |= status->mode; + mode |= status->mode & S_IALLUGO; WRITE_ONCE(inode->i_mode, mode); } @@ -215,11 +214,12 @@ static void afs_apply_status(struct afs_operation *op, if (vp->dv_before + vp->dv_delta != status->data_version) { if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) - pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n", + pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, (unsigned long long)status->data_version, - op->type ? op->type->name : "???"); + op->type ? op->type->name : "???", + op->debug_id); vnode->invalid_before = status->data_version; if (vnode->status.type == AFS_FTYPE_DIR) { @@ -294,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } } else if (vp->scb.have_status) { - if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version && - vp->speculative) + if (vp->speculative && + (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) || + vp->dv_before != vnode->status.data_version)) /* Ignore the result of a speculative bulk status fetch * if it splits around a modification op, thereby * appearing to regress the data version. @@ -428,7 +429,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type == AFS_FTYPE_DIR) { + if (vnode->status.type != AFS_FTYPE_FILE) { vnode->cache = NULL; return; } @@ -911,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } op->ctime = attr->ia_ctime; op->file[0].update_ctime = 1; + op->file[0].modification = true; op->ops = &afs_setattr_operation; ret = afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b626e38e9ab5..5ed416f4ff33 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,6 +14,7 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -31,6 +32,7 @@ struct pagevec; struct afs_call; +struct afs_vnode; /* * Partial file-locking emulation mode. (The problem being that AFS3 only @@ -104,7 +106,9 @@ struct afs_call { struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ + size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ + struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; @@ -131,7 +135,6 @@ struct afs_call { unsigned char unmarshall; /* unmarshalling phase */ unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ - bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ @@ -202,17 +205,19 @@ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ - loff_t remain; /* Amount remaining */ loff_t file_size; /* File size returned by server */ + struct key *key; /* The key to use to reissue the read */ + struct afs_vnode *vnode; /* The file being read into. */ + struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; - unsigned int index; /* Which page we're reading into */ + unsigned int call_debug_id; unsigned int nr_pages; - unsigned int offset; /* offset into current page */ - struct afs_vnode *vnode; - void (*page_done)(struct afs_read *); - struct page **pages; - struct page *array[]; + int error; + void (*done)(struct afs_read *); + void (*cleanup)(struct afs_read *); + struct iov_iter *iter; /* Iterator representing the buffer */ + struct iov_iter def_iter; /* Default iterator */ }; /* @@ -640,6 +645,7 @@ struct afs_vnode { #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ +#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -739,6 +745,7 @@ struct afs_operation_ops { void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); + void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; @@ -756,6 +763,7 @@ struct afs_vnode_param { bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ + bool modification:1; /* Set if the content gets modified */ }; /* @@ -808,12 +816,11 @@ struct afs_operation { afs_lock_type_t type; } lock; struct { - struct address_space *mapping; /* Pages being written from */ - pgoff_t first; /* first page in mapping to deal with */ - pgoff_t last; /* last page in mapping to deal with */ - unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ - bool laundering; /* Laundering page, PG_writeback not set */ + struct iov_iter *write_iter; + loff_t pos; + loff_t size; + loff_t i_size; + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -875,31 +882,31 @@ struct afs_vnode_cache_aux { #define __AFS_PAGE_PRIV_MMAPPED 0x8000UL #endif -static inline unsigned int afs_page_dirty_resolution(void) +static inline unsigned int afs_page_dirty_resolution(struct page *page) { - int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); return (shift > 0) ? shift : 0; } -static inline size_t afs_page_dirty_from(unsigned long priv) +static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv) { unsigned long x = priv & __AFS_PAGE_PRIV_MASK; /* The lower bound is inclusive */ - return x << afs_page_dirty_resolution(); + return x << afs_page_dirty_resolution(page); } -static inline size_t afs_page_dirty_to(unsigned long priv) +static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv) { unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_page_dirty_resolution(); + return (x + 1) << afs_page_dirty_resolution(page); } -static inline unsigned long afs_page_dirty(size_t from, size_t to) +static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to) { - unsigned int res = afs_page_dirty_resolution(); + unsigned int res = afs_page_dirty_resolution(page); from >>= res; to = (to - 1) >> res; return (to << __AFS_PAGE_PRIV_SHIFT) | from; @@ -1040,13 +1047,14 @@ extern void afs_dynroot_depopulate(struct super_block *); extern const struct address_space_operations afs_fs_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct netfs_read_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); -extern int afs_page_filler(void *, struct page *); +extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); +extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); static inline struct afs_read *afs_get_read(struct afs_read *req) @@ -1270,6 +1278,7 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { + call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); @@ -1277,21 +1286,25 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si static inline void afs_extract_to_tmp(struct afs_call *call) { + call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { + call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { + call->iov_len = size; iov_iter_discard(&call->def_iter, READ, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { + call->iov_len = size; afs_extract_begin(call, call->buffer, size); } @@ -1509,7 +1522,6 @@ extern int afs_launder_page(struct page *); * xattr.c */ extern const struct xattr_handler *afs_xattr_handlers[]; -extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 052dab2f5c03..bbb2c210d139 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -32,7 +32,6 @@ const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, .readlink = page_readlink, .getattr = afs_getattr, - .listxattr = afs_listxattr, }; const struct inode_operations afs_autocell_inode_operations = { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8be709cb8542..23a1a92d64bb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -271,40 +271,6 @@ void afs_flat_call_destructor(struct afs_call *call) call->buffer = NULL; } -#define AFS_BVEC_MAX 8 - -/* - * Load the given bvec with the next few pages. - */ -static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, - struct bio_vec *bv, pgoff_t first, pgoff_t last, - unsigned offset) -{ - struct afs_operation *op = call->op; - struct page *pages[AFS_BVEC_MAX]; - unsigned int nr, n, i, to, bytes = 0; - - nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); - n = find_get_pages_contig(op->store.mapping, first, nr, pages); - ASSERTCMP(n, ==, nr); - - msg->msg_flags |= MSG_MORE; - for (i = 0; i < nr; i++) { - to = PAGE_SIZE; - if (first + i >= last) { - to = op->store.last_to; - msg->msg_flags &= ~MSG_MORE; - } - bv[i].bv_page = pages[i]; - bv[i].bv_len = to - offset; - bv[i].bv_offset = offset; - bytes += to - offset; - offset = 0; - } - - iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes); -} - /* * Advance the AFS call state when the RxRPC call ends the transmit phase. */ @@ -318,42 +284,6 @@ static void afs_notify_end_request_tx(struct sock *sock, } /* - * attach the data from a bunch of pages on an inode to a call - */ -static int afs_send_pages(struct afs_call *call, struct msghdr *msg) -{ - struct afs_operation *op = call->op; - struct bio_vec bv[AFS_BVEC_MAX]; - unsigned int bytes, nr, loop, offset; - pgoff_t first = op->store.first, last = op->store.last; - int ret; - - offset = op->store.first_offset; - op->store.first_offset = 0; - - do { - afs_load_bvec(call, msg, bv, first, last, offset); - trace_afs_send_pages(call, msg, first, last, offset); - - offset = 0; - bytes = msg->msg_iter.count; - nr = msg->msg_iter.nr_segs; - - ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg, - bytes, afs_notify_end_request_tx); - for (loop = 0; loop < nr; loop++) - put_page(bv[loop].bv_page); - if (ret < 0) - break; - - first += nr; - } while (first <= last); - - trace_afs_sent_pages(call, op->store.first, last, first, ret); - return ret; -} - -/* * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ @@ -363,6 +293,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t len; s64 tx_total_len; int ret; @@ -383,21 +314,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) * after the initial fixed part. */ tx_total_len = call->request_size; - if (call->send_pages) { - struct afs_operation *op = call->op; - - if (op->store.last == op->store.first) { - tx_total_len += op->store.last_to - op->store.first_offset; - } else { - /* It looks mathematically like you should be able to - * combine the following lines with the ones above, but - * unsigned arithmetic is fun when it wraps... - */ - tx_total_len += PAGE_SIZE - op->store.first_offset; - tx_total_len += op->store.last_to; - tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE; - } - } + if (call->write_iter) + tx_total_len += iov_iter_count(call->write_iter); /* If the call is going to be asynchronous, we need an extra ref for * the call to hold itself so the caller need not hang on to its ref. @@ -439,7 +357,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); + msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, @@ -447,8 +365,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) if (ret < 0) goto error_do_abort; - if (call->send_pages) { - ret = afs_send_pages(call, &msg); + if (call->write_iter) { + msg.msg_iter = *call->write_iter; + msg.msg_flags &= ~MSG_MORE; + trace_afs_send_data(call, &msg); + + ret = rxrpc_kernel_send_data(call->net->socket, + call->rxcall, &msg, + iov_iter_count(&msg.msg_iter), + afs_notify_end_request_tx); + *call->write_iter = msg.msg_iter; + + trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } @@ -466,9 +394,10 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { + len = 0; iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, - &msg.msg_iter, false, + &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; @@ -499,11 +428,45 @@ error_kill_call: } /* + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. + */ +static void afs_log_error(struct afs_call *call, s32 remote_abort) +{ + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; + } + + m = max; + if (m < 3) { + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, + &call->alist->addrs[call->addr_ix].transport); + } +} + +/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; + size_t len; u32 abort_code, remote_abort = 0; int ret; @@ -516,10 +479,11 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { + len = 0; iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, - false, &remote_abort, + &len, false, &remote_abort, &call->service_id); trace_afs_receive_data(call, &call->def_iter, false, ret); @@ -559,6 +523,7 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: abort_code = RXGEN_OPCODE; @@ -929,10 +894,11 @@ int afs_extract_data(struct afs_call *call, bool want_more) u32 remote_abort = 0; int ret; - _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more); + _enter("{%s,%zu,%zu},%d", + call->type->name, call->iov_len, iov_iter_count(iter), want_more); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, - want_more, &remote_abort, + &call->iov_len, want_more, &remote_abort, &call->service_id); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/fs/afs/write.c b/fs/afs/write.c index c9195fc67fd8..3edb6204b937 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -11,6 +11,8 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/netfs.h> +#include <linux/fscache.h> #include "internal.h" /* @@ -23,55 +25,6 @@ int afs_set_page_dirty(struct page *page) } /* - * partly or wholly fill a page that's under preparation for writing - */ -static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned int len, struct page *page) -{ - struct afs_read *req; - size_t p; - void *data; - int ret; - - _enter(",,%llu", (unsigned long long)pos); - - if (pos >= vnode->vfs_inode.i_size) { - p = pos & ~PAGE_MASK; - ASSERTCMP(p + len, <=, PAGE_SIZE); - data = kmap(page); - memset(data + p, 0, len); - kunmap(page); - return 0; - } - - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->pos = pos; - req->len = len; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page */ int afs_write_begin(struct file *file, struct address_space *mapping, @@ -80,47 +33,40 @@ int afs_write_begin(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; - struct key *key = afs_file_key(file); unsigned long priv; - unsigned f, from = pos & (PAGE_SIZE - 1); - unsigned t, to = from + len; - pgoff_t index = pos >> PAGE_SHIFT; + unsigned f, from; + unsigned t, to; + pgoff_t index; int ret; - _enter("{%llx:%llu},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, index, from, to); + _enter("{%llx:%llu},%llx,%x", + vnode->fid.vid, vnode->fid.vnode, pos, len); - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + /* Prefetch area to be written into the cache if we're caching this + * file. We need to do this before we get a lock on the page in case + * there's more than one writer competing for the same cache block. + */ + ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata, + &afs_req_ops, NULL); + if (ret < 0) + return ret; - if (!PageUptodate(page) && len != PAGE_SIZE) { - ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - _leave(" = %d [prep]", ret); - return ret; - } - SetPageUptodate(page); - } + index = page->index; + from = pos - index * PAGE_SIZE; + to = from + len; try_again: /* See if this page is already partially written in a way that we can * merge the new write with. */ - t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); ASSERTCMP(f, <=, t); - } - if (f != t) { if (PageWriteback(page)) { - trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page); goto flush_conflicting_write; } /* If the file is being filled locally, allow inter-write @@ -164,12 +110,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct key *key = afs_file_key(file); unsigned long priv; - unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int f, from = pos & (thp_size(page) - 1); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; - int ret = 0; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -188,88 +132,75 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - if (!PageUptodate(page)) { - if (copied < len) { - /* Try and load any missing data from the server. The - * unmarshalling routine will take care of clearing any - * bits that are beyond the EOF. - */ - ret = afs_fill_page(vnode, key, pos + copied, - len - copied, page); - if (ret < 0) - goto out; - } - SetPageUptodate(page); - } + ASSERT(PageUptodate(page)); if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (from < f) f = from; if (to > t) t = to; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page); } else { - priv = afs_page_dirty(from, to); + priv = afs_page_dirty(page, from, to); attach_page_private(page, (void *)priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page); } - set_page_dirty(page); - if (PageDirty(page)) - _debug("dirtied"); - ret = copied; + if (set_page_dirty(page)) + _debug("dirtied %lx", page->index); out: unlock_page(page); put_page(page); - return ret; + return copied; } /* * kill all the pages in the given range */ static void afs_kill_pages(struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("kill %lx-%lx", first, last); + _debug("kill %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for (loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; ClearPageUptodate(page); - SetPageError(page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); unlock_page(page); } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -279,37 +210,40 @@ static void afs_kill_pages(struct address_space *mapping, */ static void afs_redirty_pages(struct writeback_control *wbc, struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("redirty %lx-%lx", first, last); + _debug("redirty %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for (loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; redirty_page_for_writepage(wbc, page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -317,37 +251,32 @@ static void afs_redirty_pages(struct writeback_control *wbc, /* * completion of write to server */ -static void afs_pages_written_back(struct afs_vnode *vnode, - pgoff_t first, pgoff_t last) +static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct pagevec pv; - unsigned long priv; - unsigned count, loop; + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t end; - _enter("{%llx:%llu},{%lx-%lx}", - vnode->fid.vid, vnode->fid.vnode, first, last); + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - pagevec_init(&pv); + _enter("{%llx:%llu},{%x @%llx}", + vnode->fid.vid, vnode->fid.vnode, len, start); - do { - _debug("done %lx-%lx", first, last); - - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); - - for (loop = 0; loop < count; loop++) { - priv = (unsigned long)detach_page_private(pv.pages[loop]); - trace_afs_page_dirty(vnode, tracepoint_string("clear"), - pv.pages[loop]->index, priv); - end_page_writeback(pv.pages[loop]); + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, page, end) { + if (!PageWriteback(page)) { + kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end); + ASSERT(PageWriteback(page)); } - first += count; - __pagevec_release(&pv); - } while (first <= last); + + trace_afs_page_dirty(vnode, tracepoint_string("clear"), page); + detach_page_private(page); + page_endio(page, true, 0); + } + + rcu_read_unlock(); afs_prune_wb_keys(vnode); _leave(""); @@ -402,11 +331,9 @@ static void afs_store_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { if (!op->store.laundering) - afs_pages_written_back(vnode, op->store.first, op->store.last); + afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); - atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - - (op->store.first * PAGE_SIZE + op->store.first_offset), - &afs_v2net(vnode)->n_store_bytes); + atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes); } } @@ -419,21 +346,20 @@ static const struct afs_operation_ops afs_store_data_operation = { /* * write to a file */ -static int afs_store_data(struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, bool laundering) +static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, + bool laundering) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; struct afs_wb_key *wbk = NULL; - int ret; + loff_t size = iov_iter_count(iter), i_size; + int ret = -ENOKEY; - _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", + _enter("%s{%llx:%llu.%u},%llx,%llx", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - first, last, offset, to); + size, pos); ret = afs_get_writeback_key(vnode, &wbk); if (ret) { @@ -447,13 +373,15 @@ static int afs_store_data(struct address_space *mapping, return -ENOMEM; } + i_size = i_size_read(&vnode->vfs_inode); + afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; - op->store.mapping = mapping; - op->store.first = first; - op->store.last = last; - op->store.first_offset = offset; - op->store.last_to = to; + op->file[0].modification = true; + op->store.write_iter = iter; + op->store.pos = pos; + op->store.size = size; + op->store.i_size = max(pos + size, i_size); op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; @@ -487,73 +415,58 @@ try_next_key: } /* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. */ -static int afs_write_back_from_locked_page(struct address_space *mapping, - struct writeback_control *wbc, - struct page *primary_page, - pgoff_t final_page) +static void afs_extend_writeback(struct address_space *mapping, + struct afs_vnode *vnode, + long *_count, + loff_t start, + loff_t max_len, + bool new_content, + unsigned int *_len) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct page *pages[8], *page; - unsigned long count, priv; - unsigned n, offset, to, f, t; - pgoff_t start, first, last; - loff_t i_size, end; - int loop, ret; - - _enter(",%lx", primary_page->index); + struct pagevec pvec; + struct page *page; + unsigned long priv; + unsigned int psize, filler = 0; + unsigned int f, t; + loff_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; - count = 1; - if (test_set_page_writeback(primary_page)) - BUG(); + XA_STATE(xas, &mapping->i_pages, index); + pagevec_init(&pvec); - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - start = primary_page->index; - priv = page_private(primary_page); - offset = afs_page_dirty_from(priv); - to = afs_page_dirty_to(priv); - trace_afs_page_dirty(vnode, tracepoint_string("store"), - primary_page->index, priv); - - WARN_ON(offset == to); - if (offset == to) - trace_afs_page_dirty(vnode, tracepoint_string("WARN"), - primary_page->index, priv); - - if (start >= final_page || - (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) - goto no_more; - - start++; do { - _debug("more %lx [%lx]", start, count); - n = final_page - start + 1; - if (n > ARRAY_SIZE(pages)) - n = ARRAY_SIZE(pages); - n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); - _debug("fgpc %u", n); - if (n == 0) - goto no_more; - if (pages[0]->index != start) { - do { - put_page(pages[--n]); - } while (n > 0); - goto no_more; - } + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); - for (loop = 0; loop < n; loop++) { - page = pages[loop]; - if (to != PAGE_SIZE && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + xas_for_each(&xas, page, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) break; - if (page->index > final_page) + if (page->index != index) break; + + if (!page_cache_get_speculative(page)) { + xas_reset(&xas); + continue; + } + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(&xas))) + break; + if (!trylock_page(page)) break; if (!PageDirty(page) || PageWriteback(page)) { @@ -561,57 +474,134 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, break; } + psize = thp_size(page); priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); - if (f != 0 && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); + if (f != 0 && !new_content) { unlock_page(page); break; } - to = t; - trace_afs_page_dirty(vnode, tracepoint_string("store+"), - page->index, priv); + len += filler + t; + filler = psize - t; + if (len >= max_len || *_count <= 0) + stop = true; + else if (t == psize || new_content) + stop = false; + + index += thp_nr_pages(page); + if (!pagevec_add(&pvec, page)) + break; + if (stop) + break; + } + + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. + */ + if (!pagevec_count(&pvec)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + page = pvec.pages[i]; + trace_afs_page_dirty(vnode, tracepoint_string("store+"), page); if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) BUG(); + + *_count -= thp_nr_pages(page); unlock_page(page); - put_page(page); - } - count += loop; - if (loop < n) { - for (; loop < n; loop++) - put_page(pages[loop]); - goto no_more; } - start += loop; - } while (start <= final_page && count < 65536); + pagevec_release(&pvec); + cond_resched(); + } while (!stop); + + *_len = len; +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. + */ +static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, + struct writeback_control *wbc, + struct page *page, + loff_t start, loff_t end) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + unsigned long priv; + unsigned int offset, to, len, max_len; + loff_t i_size = i_size_read(&vnode->vfs_inode); + bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx", page->index, start, end); + + if (test_set_page_writeback(page)) + BUG(); + + count -= thp_nr_pages(page); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + priv = page_private(page); + offset = afs_page_dirty_from(page, priv); + to = afs_page_dirty_to(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("store"), page); + + len = to - offset; + start += offset; + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len && + (to == thp_size(page) || new_content)) + afs_extend_writeback(mapping, vnode, &count, + start, max_len, new_content, &len); + len = min_t(loff_t, len, max_len); + } -no_more: /* We now have a contiguous set of dirty pages, each with writeback * set; the first page is still locked at this point, but all the rest * have been unlocked. */ - unlock_page(primary_page); + unlock_page(page); - first = primary_page->index; - last = first + count - 1; + if (start < i_size) { + _debug("write back %x @%llx [%llx]", len, start, i_size); - end = (loff_t)last * PAGE_SIZE + to; - i_size = i_size_read(&vnode->vfs_inode); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + ret = afs_store_data(vnode, &iter, start, false); + } else { + _debug("write discard %x @%llx [%llx]", len, start, i_size); - _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); - if (end > i_size) - to = i_size & ~PAGE_MASK; + /* The dirty region was entirely beyond the EOF. */ + afs_pages_written_back(vnode, start, len); + ret = 0; + } - ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: - ret = count; + wbc->nr_to_write = count; + ret = len; break; default: @@ -623,13 +613,13 @@ no_more: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, ret); break; case -EDQUOT: case -ENOSPC: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, -ENOSPC); break; @@ -641,7 +631,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, first, last); + afs_kill_pages(mapping, start, len); mapping_set_error(mapping, ret); break; } @@ -656,19 +646,19 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - int ret; + ssize_t ret; + loff_t start; _enter("{%lx},", page->index); + start = page->index * PAGE_SIZE; ret = afs_write_back_from_locked_page(page->mapping, wbc, page, - wbc->range_end >> PAGE_SHIFT); + start, LLONG_MAX - start); if (ret < 0) { - _leave(" = %d", ret); - return 0; + _leave(" = %zd", ret); + return ret; } - wbc->nr_to_write -= ret; - _leave(" = 0"); return 0; } @@ -678,35 +668,46 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) */ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, - pgoff_t index, pgoff_t end, pgoff_t *_next) + loff_t start, loff_t end, loff_t *_next) { struct page *page; - int ret, n; + ssize_t ret; + int n; - _enter(",,%lx,%lx,", index, end); + _enter("%llx,%llx,", start, end); do { - n = find_get_pages_range_tag(mapping, &index, end, - PAGECACHE_TAG_DIRTY, 1, &page); + pgoff_t index = start / PAGE_SIZE; + + n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; + start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */ + _debug("wback %lx", page->index); - /* - * at this point we hold neither the i_pages lock nor the + /* At this point we hold neither the i_pages lock nor the * page lock: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ - ret = lock_page_killable(page); - if (ret < 0) { - put_page(page); - _leave(" = %d", ret); - return ret; + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = lock_page_killable(page); + if (ret < 0) { + put_page(page); + return ret; + } + } else { + if (!trylock_page(page)) { + put_page(page); + return 0; + } } if (page->mapping != mapping || !PageDirty(page)) { + start += thp_size(page); unlock_page(page); put_page(page); continue; @@ -722,20 +723,20 @@ static int afs_writepages_region(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) BUG(); - ret = afs_write_back_from_locked_page(mapping, wbc, page, end); + ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end); put_page(page); if (ret < 0) { - _leave(" = %d", ret); + _leave(" = %zd", ret); return ret; } - wbc->nr_to_write -= ret; + start += ret * PAGE_SIZE; cond_resched(); - } while (index < end && wbc->nr_to_write > 0); + } while (wbc->nr_to_write > 0); - *_next = index; - _leave(" = 0 [%lx]", *_next); + *_next = start; + _leave(" = 0 [%llx]", *_next); return 0; } @@ -746,7 +747,7 @@ int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - pgoff_t start, end, next; + loff_t start, next; int ret; _enter(""); @@ -761,22 +762,19 @@ int afs_writepages(struct address_space *mapping, return 0; if (wbc->range_cyclic) { - start = mapping->writeback_index; - end = -1; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + start = mapping->writeback_index * PAGE_SIZE; + ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); - mapping->writeback_index = next; + mapping->writeback_index = next / PAGE_SIZE; } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT); - ret = afs_writepages_region(mapping, wbc, 0, end, &next); + ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); if (wbc->nr_to_write > 0) mapping->writeback_index = next; } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + ret = afs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); } up_read(&vnode->validate_lock); @@ -834,13 +832,13 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { + struct page *page = thp_head(vmf->page); struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; - _enter("{{%llx:%llu}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, vmf->page->index); + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); sb_start_pagefault(inode->i_sb); @@ -848,30 +846,35 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire page will need writing back. */ #ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, vmf->page); + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; #endif - if (PageWriteback(vmf->page) && - wait_on_page_bit_killable(vmf->page, PG_writeback) < 0) + if (wait_on_page_writeback_killable(page)) return VM_FAULT_RETRY; - if (lock_page_killable(vmf->page) < 0) + if (lock_page_killable(page) < 0) return VM_FAULT_RETRY; /* We mustn't change page->private until writeback is complete as that * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - wait_on_page_writeback(vmf->page); + if (wait_on_page_writeback_killable(page) < 0) { + unlock_page(page); + return VM_FAULT_RETRY; + } - priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty(page, 0, thp_size(page)); priv = afs_page_dirty_mmapped(priv); - trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), - vmf->page->index, priv); - if (PagePrivate(vmf->page)) - set_page_private(vmf->page, priv); - else - attach_page_private(vmf->page, (void *)priv); + if (PagePrivate(page)) { + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page); + } else { + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page); + } file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -913,6 +916,8 @@ int afs_launder_page(struct page *page) { struct address_space *mapping = page->mapping; struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + struct bio_vec bv[1]; unsigned long priv; unsigned int f, t; int ret = 0; @@ -922,26 +927,24 @@ int afs_launder_page(struct page *page) priv = page_private(page); if (clear_page_dirty_for_io(page)) { f = 0; - t = PAGE_SIZE; + t = thp_size(page); if (PagePrivate(page)) { - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); } - trace_afs_page_dirty(vnode, tracepoint_string("launder"), - page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f, true); - } - - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("laundered"), - page->index, priv); + bv[0].bv_page = page; + bv[0].bv_offset = f; + bv[0].bv_len = t - f; + iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); + trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); + ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, + true); } -#endif + + trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); + detach_page_private(page); + wait_on_page_fscache(page); return ret; } diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index c629caae5002..7751b0b3f81d 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -11,29 +11,6 @@ #include <linux/xattr.h> #include "internal.h" -static const char afs_xattr_list[] = - "afs.acl\0" - "afs.cell\0" - "afs.fid\0" - "afs.volume\0" - "afs.yfs.acl\0" - "afs.yfs.acl_inherited\0" - "afs.yfs.acl_num_cleaned\0" - "afs.yfs.vol_acl"; - -/* - * Retrieve a list of the supported xattrs. - */ -ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - if (size == 0) - return sizeof(afs_xattr_list); - if (size < sizeof(afs_xattr_list)) - return -ERANGE; - memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list)); - return sizeof(afs_xattr_list); -} - /* * Deal with the result of a successful fetch ACL operation. */ @@ -231,6 +208,8 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, else ret = -ERANGE; } + } else if (ret == -ENOTSUPP) { + ret = -ENODATA; } error_yacl: @@ -256,6 +235,7 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); + int ret; if (flags == XATTR_CREATE || strcmp(name, "acl") != 0) @@ -270,7 +250,10 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, return afs_put_operation(op); op->ops = &yfs_store_opaque_acl2_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -ENODATA; + return ret; } static const struct xattr_handler afs_xattr_yfs_handler = { diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index bd787e71a657..2b35cba8ad62 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -360,22 +360,23 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu, %zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into ->actual_len. This + * may indicate more or less data than was requested will be + * returned. + */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -384,44 +385,25 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -467,17 +449,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -516,6 +487,8 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = call->debug_id; + /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); @@ -1102,25 +1075,15 @@ void yfs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long)size, (unsigned long long)pos, - (unsigned long long)i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64, sizeof(__be32) + @@ -1133,8 +1096,7 @@ void yfs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->key = op->key; - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1142,9 +1104,9 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime); - bp = xdr_encode_u64(bp, pos); - bp = xdr_encode_u64(bp, size); - bp = xdr_encode_u64(bp, i_size); + bp = xdr_encode_u64(bp, op->store.pos); + bp = xdr_encode_u64(bp, op->store.size); + bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); @@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 054f97b07754..918826eaceea 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -87,6 +87,7 @@ struct autofs_wait_queue { autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ struct qstr name; + u32 offset; u32 dev; u64 ino; kuid_t uid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index a1c7701007e7..b3fefd6237c3 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + if (d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); /* Forced expire, user space handles busy mounts */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 5ced859dac53..16b5fca0626e 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); @@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi, fput(pipe); } -static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char *name) -{ - struct dentry *root = sbi->sb->s_root; - struct dentry *tmp; - char *buf; - char *p; - int len; - unsigned seq; - -rename_retry: - buf = name; - len = 0; - - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - spin_lock(&sbi->fs_lock); - for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) - len += tmp->d_name.len + 1; - - if (!len || --len > NAME_MAX) { - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - return 0; - } - - *(buf + len) = '\0'; - p = buf + len - dentry->d_name.len; - strncpy(p, dentry->d_name.name, dentry->d_name.len); - - for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { - *(--p) = '/'; - p -= tmp->d_name.len; - strncpy(p, tmp->d_name.name, tmp->d_name.len); - } - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - - return len; -} - static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { @@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi, struct qstr qstr; char *name; int status, ret, type; + unsigned int offset = 0; pid_t pid; pid_t tgid; @@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { + qstr.name = name; qstr.len = sprintf(name, "%p", dentry); - else { - qstr.len = autofs_getpath(sbi, dentry, name); - if (!qstr.len) { + } else { + char *p = dentry_path_raw(dentry, name, NAME_MAX); + if (IS_ERR(p)) { kfree(name); return -ENOENT; } + qstr.name = ++p; // skip the leading slash + qstr.len = strlen(p); + offset = p - name; } - qstr.name = name; qstr.hash = full_name_hash(dentry, name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); + kfree(name); return -EINTR; } @@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi, if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); return ret; } @@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { - kfree(qstr.name); + kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi, sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); @@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); } /* @@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi, } *wql = wq->next; /* Unlink from chain */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); diff --git a/fs/befs/TODO b/fs/befs/TODO deleted file mode 100644 index 3250921aa2e6..000000000000 --- a/fs/befs/TODO +++ /dev/null @@ -1,14 +0,0 @@ -TODO -========== - -* Convert comments to the Kernel-Doc format. - -* Befs_fs.h has gotten big and messy. No reason not to break it up into - smaller peices. - -* See if Alexander Viro's option parser made it into the kernel tree. - Use that if we can. (include/linux/parser.h) - -* See if we really need separate types for on-disk and in-memory - representations of the superblock and inode. - diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b12ba98ae9f5..187b3f2b9202 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); for (i = 0; i < vma_count; i++) { struct core_vma_metadata *meta = vma_meta + i; @@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } - dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3cfd6cd46f26..2c99b102c860 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b9c658e0548e..a1072c6a2341 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -74,6 +74,12 @@ #define MAX_SHARED_LIBS (1) #endif +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET +#define DATA_START_OFFSET_WORDS (0) +#else +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#endif + struct lib_info { struct { unsigned long start_code; /* Start of text segment */ @@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(unsigned long), + DATA_START_OFFSET_WORDS * sizeof(unsigned long), FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", @@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); + len = text_len + data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm, realdatastart = textpos + ntohl(hdr->data_start); datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(u32), + DATA_START_OFFSET_WORDS * sizeof(u32), FLAT_DATA_ALIGN); reloc = (__be32 __user *) @@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm, ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(u32)); + DATA_START_OFFSET_WORDS * sizeof(u32)); goto err; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 92ed7d5df677..a5244e08b6c8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); @@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; @@ -1236,16 +1240,18 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; lockdep_assert_held(&bdev->bd_mutex); - clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - rescan: - ret = blk_drop_partitions(bdev); - if (ret) - return ret; + if (bdev->bd_part_count) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); + + clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that @@ -1259,9 +1265,6 @@ rescan: if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); - } else { - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); } if (get_capacity(disk)) { @@ -1433,10 +1436,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - /* - * If we lost a race with 'disk' being deleted, try again. See md.c. - */ -retry: bdev = blkdev_get_no_open(dev); if (!bdev) return ERR_PTR(-ENXIO); @@ -1483,8 +1482,6 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); - if (ret == -ERESTARTSYS) - goto retry; return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index b634c42115ea..cec88a66bd6c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute subdir-ccflags-y += -Wmissing-prototypes subdir-ccflags-y += -Wold-style-definition subdir-ccflags-y += -Wmissing-include-dirs -subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) -subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) -subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) -subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +condflags := \ + $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wpacked-not-aligned) \ + $(call cc-option, -Wstringop-truncation) +subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare @@ -28,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o + subpage.o tree-mod-log.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f47c1528eb9a..117d423fdb93 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -14,6 +14,7 @@ #include "delayed-ref.h" #include "locking.h" #include "misc.h" +#include "tree-mod-log.h" /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == SEQ_LAST) + else if (time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else ret = btrfs_search_old_slot(root, &search_key, path, time_seq); @@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave - * much like trans == NULL case, the difference only lies in it will not + * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and + * behave much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). * @@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; /* @@ -1217,9 +1218,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != SEQ_LAST) { + time_seq != BTRFS_SEQ_LAST) { #else - if (trans && time_seq != SEQ_LAST) { + if (trans && time_seq != BTRFS_SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, struct btrfs_trans_handle *trans; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, @@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; @@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, } if (trans) - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_get_tree_mod_seq(fs_info, &seq_elem); else down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - tree_mod_seq_elem.seq, &refs, + seq_elem.seq, &refs, &extent_item_pos, ignore_offset); if (ret) goto out; @@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots, + seq_elem.seq, &roots, ignore_offset); if (ret) break; @@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: if (trans) { - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_put_tree_mod_seq(fs_info, &seq_elem); btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 744b99ddc28c..aa57bdc8fc89 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. */ - if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) return; spin_lock(&fs_info->unused_bgs_lock); @@ -1462,12 +1462,12 @@ next: spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); } @@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + + /* Get out fast, in case we're unmounting the filesystem */ + if (btrfs_fs_closing(fs_info)) { + up_write(&space_info->groups_sem); + goto next; + } + + ret = inc_block_group_ro(bg, 0); + up_write(&space_info->groups_sem); + if (ret < 0) + goto next; + + btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used", + bg->start, div_u64(bg->used * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); + if (ret) + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + +next: + btrfs_put_block_group(bg); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_exclop_finish(fs_info); +} + +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&fs_info->reclaim_bgs)) + queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_reclaim_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { @@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; + bool dirty_bg_running; -again: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + do { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - /* - * we're not allowed to set block groups readonly after the dirty - * block groups cache has started writing. If it already started, - * back off and let this transaction commit - */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; + dirty_bg_running = false; - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); + /* + * We're not allowed to set block groups readonly after the dirty + * block group cache has started writing. If it already started, + * back off and let this transaction commit. + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; - ret = btrfs_wait_for_commit(fs_info, transid); - if (ret) - return ret; - goto again; - } + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + dirty_bg_running = true; + } + } while (dirty_bg_running); if (do_chunk_alloc) { /* @@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) */ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; @@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) lockdep_assert_held(&fs_info->chunk_mutex); info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +again: spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh) { u64 flags = btrfs_system_alloc_profile(fs_info); + u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); + + /* + * If there's not available space for the chunk tree (system + * space) and there are other tasks that reserved space for + * creating a new system block group, wait for them to complete + * the creation of their system block group and release excess + * reserved space. We do this because: + * + * *) We can end up allocating more system chunks than necessary + * when there are multiple tasks that are concurrently + * allocating block groups, which can lead to exhaustion of + * the system array in the superblock; + * + * *) If we allocate extra and unnecessary system block groups, + * despite being empty for a long time, and possibly forever, + * they end not being added to the list of unused block groups + * because that typically happens only when deallocating the + * last extent from a block group - which never happens since + * we never allocate from them in the first place. The few + * exceptions are when mounting a filesystem or running scrub, + * which add unused block groups to the list of unused block + * groups, to be deleted by the cleaner kthread. + * And even when they are added to the list of unused block + * groups, it can take a long time until they get deleted, + * since the cleaner kthread might be sleeping or busy with + * other work (deleting subvolumes, running delayed iputs, + * defrag scheduling, etc); + * + * This is rare in practice, but can happen when too many tasks + * are allocating blocks groups in parallel (via fallocate()) + * and before the one that reserved space for a new system block + * group finishes the block group creation and releases the space + * reserved in excess (at btrfs_create_pending_block_groups()), + * other tasks end up here and see free system space temporarily + * not enough for updating the chunk tree. + * + * We unlock the chunk mutex before waiting for such tasks and + * lock it again after the wait, otherwise we would deadlock. + * It is safe to do so because allocating a system chunk is the + * first thing done while allocating a new block group. + */ + if (reserved > trans->chunk_bytes_reserved) { + const u64 min_needed = reserved - thresh; + + mutex_unlock(&fs_info->chunk_mutex); + wait_event(cur_trans->chunk_reserve_wait, + atomic64_read(&cur_trans->chunk_bytes_reserved) <= + min_needed); + mutex_lock(&fs_info->chunk_mutex); + goto again; + } /* * Ignore failure to create system chunk. We might end up not @@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) + if (!ret) { + atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); trans->chunk_bytes_reserved += thresh; + } } } @@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ecc3372a5ce..7b927425dc71 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +void btrfs_reclaim_bgs_work(struct work_struct *work); +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 28e202e89660..c652e19ad74e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -220,6 +220,7 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; + struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; @@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } -static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) +/* + * Called every time after doing a buffered, direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + +static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { - int ret = 0; + bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) { - /* - * After a ranged fsync we might have left some extent maps - * (that fall outside the fsync's range). So return false - * here if the list isn't empty, to make sure btrfs_log_inode() - * will be called and process those extent maps. - */ - smp_mb(); - if (list_empty(&inode->extent_tree.modified_extents)) - ret = 1; - } + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; spin_unlock(&inode->lock); return ret; } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 113cb85c1fd4..169508609324 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) BUG_ON(!block_ctx->pagev); num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; + /* Pages must be unmapped in reverse order */ while (num_pages > 0) { num_pages--; if (block_ctx->datav[num_pages]) { - kunmap(block_ctx->pagev[num_pages]); + kunmap_local(block_ctx->datav[num_pages]); block_ctx->datav[num_pages] = NULL; } if (block_ctx->pagev[num_pages]) { @@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, i = j; } for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]); return block_ctx->len; } @@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i = 0; + int i = 0; u64 dev_bytenr; u64 cur_bytenr; struct bio_vec bvec; @@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec.bv_page); + mapped_datav[i] = kmap_local_page(bvec.bv_page); i++; if (dev_state->state->print_mask & @@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio) mapped_datav, segs, bio, &bio_is_patched, bio->bi_opf); - bio_for_each_segment(bvec, bio, iter) - kunmap(bvec.bv_page); + /* Unmap in reverse order */ + for (--i; i >= 0; i--) + kunmap_local(mapped_datav[i]); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3f4c832abfed..17f93fd28f7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws, case BTRFS_COMPRESS_NONE: default: /* - * This can't happen, the type is validated several times - * before we get here. As a sane fallback, return what the - * callers will understand as 'no compression happened'. + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. */ + *out_pages = 0; return -E2BIG; } } @@ -1611,7 +1616,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, curr_sample_pos = 0; while (index < index_end) { page = find_get_page(inode->i_mapping, index); - in_data = kmap(page); + in_data = kmap_local_page(page); /* Handle case where the start is not aligned to PAGE_SIZE */ i = start % PAGE_SIZE; while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { @@ -1624,7 +1629,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, start += SAMPLING_INTERVAL; curr_sample_pos += SAMPLING_READ_SIZE; } - kunmap(page); + kunmap_local(in_data); put_page(page); index++; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d56730a67885..a484fb72a01f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -14,6 +14,7 @@ #include "locking.h" #include "volumes.h" #include "qgroup.h" +#include "tree-mod-log.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -233,597 +234,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -enum mod_log_op { - MOD_LOG_KEY_REPLACE, - MOD_LOG_KEY_ADD, - MOD_LOG_KEY_REMOVE, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, - MOD_LOG_MOVE_KEYS, - MOD_LOG_ROOT_REPLACE, -}; - -struct tree_mod_root { - u64 logical; - u8 level; -}; - -struct tree_mod_elem { - struct rb_node node; - u64 logical; - u64 seq; - enum mod_log_op op; - - /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ - int slot; - - /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ - u64 generation; - - /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ - struct btrfs_disk_key key; - u64 blockptr; - - /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct { - int dst_slot; - int nr_items; - } move; - - /* this is used for op == MOD_LOG_ROOT_REPLACE */ - struct tree_mod_root old_root; -}; - -/* - * Pull a new tree mod seq number for our operation. - */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) -{ - return atomic64_inc_return(&fs_info->tree_mod_seq); -} - -/* - * This adds a new blocker to the tree mod log's blocker list if the @elem - * passed does not already have a sequence number set. So when a caller expects - * to record tree modifications, it should ensure to set elem->seq to zero - * before calling btrfs_get_tree_mod_seq. - * Returns a fresh, unused tree log modification sequence number, even if no new - * blocker was added. - */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - write_lock(&fs_info->tree_mod_log_lock); - if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq(fs_info); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); - } - write_unlock(&fs_info->tree_mod_log_lock); - - return elem->seq; -} - -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct rb_node *next; - struct tree_mod_elem *tm; - u64 min_seq = (u64)-1; - u64 seq_putting = elem->seq; - - if (!seq_putting) - return; - - write_lock(&fs_info->tree_mod_log_lock); - list_del(&elem->list); - elem->seq = 0; - - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *first; - - first = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq_putting > first->seq) { - /* - * Blocker with lower sequence number exists, we - * cannot remove anything from the log. - */ - write_unlock(&fs_info->tree_mod_log_lock); - return; - } - min_seq = first->seq; - } - - /* - * anything that's lower than the lowest existing (read: blocked) - * sequence number can be removed from the tree. - */ - tm_root = &fs_info->tree_mod_log; - for (node = rb_first(tm_root); node; node = next) { - next = rb_next(node); - tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq >= min_seq) - continue; - rb_erase(node, tm_root); - kfree(tm); - } - write_unlock(&fs_info->tree_mod_log_lock); -} - -/* - * key order of the log: - * node/leaf start address -> sequence - * - * The 'start address' is the logical address of the *new* root node - * for root replace operations, or the logical address of the affected - * block for all other operations. - */ -static noinline int -__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) -{ - struct rb_root *tm_root; - struct rb_node **new; - struct rb_node *parent = NULL; - struct tree_mod_elem *cur; - - lockdep_assert_held_write(&fs_info->tree_mod_log_lock); - - tm->seq = btrfs_inc_tree_mod_seq(fs_info); - - tm_root = &fs_info->tree_mod_log; - new = &tm_root->rb_node; - while (*new) { - cur = rb_entry(*new, struct tree_mod_elem, node); - parent = *new; - if (cur->logical < tm->logical) - new = &((*new)->rb_left); - else if (cur->logical > tm->logical) - new = &((*new)->rb_right); - else if (cur->seq < tm->seq) - new = &((*new)->rb_left); - else if (cur->seq > tm->seq) - new = &((*new)->rb_right); - else - return -EEXIST; - } - - rb_link_node(&tm->node, parent, new); - rb_insert_color(&tm->node, tm_root); - return 0; -} - -/* - * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it - * returns zero with the tree_mod_log_lock acquired. The caller must hold - * this until all tree mod log insertions are recorded in the rb tree and then - * write unlock fs_info::tree_mod_log_lock. - */ -static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) { - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 1; - if (eb && btrfs_header_level(eb) == 0) - return 1; - - write_lock(&fs_info->tree_mod_log_lock); - if (list_empty(&(fs_info)->tree_mod_seq_list)) { - write_unlock(&fs_info->tree_mod_log_lock); - return 1; - } - - return 0; -} - -/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ -static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) -{ - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 0; - if (eb && btrfs_header_level(eb) == 0) - return 0; - - return 1; -} - -static struct tree_mod_elem * -alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return NULL; - - tm->logical = eb->start; - if (op != MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } - tm->op = op; - tm->slot = slot; - tm->generation = btrfs_node_ptr_generation(eb, slot); - RB_CLEAR_NODE(&tm->node); - - return tm; -} - -static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - int ret; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm = alloc_tree_mod_elem(eb, slot, op, flags); - if (!tm) - return -ENOMEM; - - if (tree_mod_dont_log(eb->fs_info, eb)) { - kfree(tm); - return 0; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - kfree(tm); - - return ret; -} - -static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, - int dst_slot, int src_slot, int nr_items) -{ - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int ret = 0; - int i; - int locked = 0; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; - - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - locked = 1; - - /* - * When we override something during the move, we log these removals. - * This can only happen when we move towards the beginning of the - * buffer, i.e. dst_slot < src_slot. - */ - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]); - if (ret) - goto free_tms; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - if (ret) - goto free_tms; - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; -free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - kfree(tm); - - return ret; -} - -static inline int -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) -{ - int i, j; - int ret; - - for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert(fs_info, tm_list[i]); - if (ret) { - for (j = nritems - 1; j > i; j--) - rb_erase(&tm_list[j]->node, - &fs_info->tree_mod_log); - return ret; - } - } - - return 0; -} - -static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, - struct extent_buffer *new_root, int log_removal) -{ - struct btrfs_fs_info *fs_info = old_root->fs_info; - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int ret = 0; - int i; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (log_removal && btrfs_header_level(old_root) > 0) { - nritems = btrfs_header_nritems(old_root); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) { - ret = -ENOMEM; - goto free_tms; - } - for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - } - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = new_root->start; - tm->old_root.logical = old_root->start; - tm->old_root.level = btrfs_header_level(old_root); - tm->generation = btrfs_header_generation(old_root); - tm->op = MOD_LOG_ROOT_REPLACE; - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - - if (tm_list) - ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - if (!ret) - ret = __tree_mod_log_insert(fs_info, tm); - - write_unlock(&fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return ret; - -free_tms: - if (tm_list) { - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - } - kfree(tm); - - return ret; -} - -static struct tree_mod_elem * -__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, - int smallest) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct tree_mod_elem *cur = NULL; - struct tree_mod_elem *found = NULL; - - read_lock(&fs_info->tree_mod_log_lock); - tm_root = &fs_info->tree_mod_log; - node = tm_root->rb_node; - while (node) { - cur = rb_entry(node, struct tree_mod_elem, node); - if (cur->logical < start) { - node = node->rb_left; - } else if (cur->logical > start) { - node = node->rb_right; - } else if (cur->seq < min_seq) { - node = node->rb_left; - } else if (!smallest) { - /* we want the node with the highest seq */ - if (found) - BUG_ON(found->seq > cur->seq); - found = cur; - node = node->rb_left; - } else if (cur->seq > min_seq) { - /* we want the node with the smallest seq */ - if (found) - BUG_ON(found->seq < cur->seq); - found = cur; - node = node->rb_right; - } else { - found = cur; - break; - } - } - read_unlock(&fs_info->tree_mod_log_lock); - - return found; -} - -/* - * this returns the element from the log with the smallest time sequence - * value that's in the log (the oldest log item). any element with a time - * sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, - u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 1); -} - -/* - * this returns the element from the log with the largest time sequence - * value that's in the log (the most recent log item). any element with - * a time sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 0); -} - -static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) -{ - struct btrfs_fs_info *fs_info = dst->fs_info; - int ret = 0; - struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; - int i; - int locked = 0; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return 0; - - tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm_list_add = tm_list; - tm_list_rem = tm_list + nr_items; - for (i = 0; i < nr_items; i++) { - tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - if (!tm_list_rem[i]) { - ret = -ENOMEM; - goto free_tms; - } - - tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - MOD_LOG_KEY_ADD, GFP_NOFS); - if (!tm_list_add[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - locked = 1; - - for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); - if (ret) - goto free_tms; - ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); - if (ret) - goto free_tms; - } - - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return ret; -} - -static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) -{ - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int i; - int ret = 0; - - if (btrfs_header_level(eb) == 0) - return 0; - - if (!tree_mod_need_log(eb->fs_info, NULL)) - return 0; - - nritems = btrfs_header_nritems(eb); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - - ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - - return ret; -} - /* * check if the tree block can be shared by multiple trees */ @@ -1090,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; atomic_inc(&cow->refs); - ret = tree_mod_log_insert_root(root->node, cow, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); @@ -1100,15 +510,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(parent, parent_slot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(buf); + ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); @@ -1127,296 +537,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -/* - * returns the logical address of the oldest predecessor of the given root. - * entries older than time_seq are ignored. - */ -static struct tree_mod_elem *__tree_mod_log_oldest_root( - struct extent_buffer *eb_root, u64 time_seq) -{ - struct tree_mod_elem *tm; - struct tree_mod_elem *found = NULL; - u64 root_logical = eb_root->start; - int looped = 0; - - if (!time_seq) - return NULL; - - /* - * the very last operation that's logged for a root is the - * replacement operation (if it is replaced at all). this has - * the logical address of the *new* root, making it the very - * first operation that's logged for this root. - */ - while (1) { - tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, - time_seq); - if (!looped && !tm) - return NULL; - /* - * if there are no tree operation for the oldest root, we simply - * return it. this should only happen if that (old) root is at - * level 0. - */ - if (!tm) - break; - - /* - * if there's an operation that's not a root replacement, we - * found the oldest version of our root. normally, we'll find a - * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. - */ - if (tm->op != MOD_LOG_ROOT_REPLACE) - break; - - found = tm; - root_logical = tm->old_root.logical; - looped = 1; - } - - /* if there's no old root to return, return what we found instead */ - if (!found) - found = tm; - - return found; -} - -/* - * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewound (until we reach something older than - * time_seq). - */ -static void -__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq, struct tree_mod_elem *first_tm) -{ - u32 n; - struct rb_node *next; - struct tree_mod_elem *tm = first_tm; - unsigned long o_dst; - unsigned long o_src; - unsigned long p_size = sizeof(struct btrfs_key_ptr); - - n = btrfs_header_nritems(eb); - read_lock(&fs_info->tree_mod_log_lock); - while (tm && tm->seq >= time_seq) { - /* - * all the operations are recorded with the operator used for - * the modification. as we're going backwards, we do the - * opposite of each operation here. - */ - switch (tm->op) { - case MOD_LOG_KEY_REMOVE_WHILE_FREEING: - BUG_ON(tm->slot < n); - fallthrough; - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: - case MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - n++; - break; - case MOD_LOG_KEY_REPLACE: - BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - break; - case MOD_LOG_KEY_ADD: - /* if a move operation is needed it's in the log */ - n--; - break; - case MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); - memmove_extent_buffer(eb, o_dst, o_src, - tm->move.nr_items * p_size); - break; - case MOD_LOG_ROOT_REPLACE: - /* - * this operation is special. for roots, this must be - * handled explicitly before rewinding. - * for non-roots, this operation may exist if the node - * was a root: root A -> child B; then A gets empty and - * B is promoted to the new root. in the mod log, we'll - * have a root-replace operation for B, a tree block - * that is no root. we simply ignore that operation. - */ - break; - } - next = rb_next(&tm->node); - if (!next) - break; - tm = rb_entry(next, struct tree_mod_elem, node); - if (tm->logical != first_tm->logical) - break; - } - read_unlock(&fs_info->tree_mod_log_lock); - btrfs_set_header_nritems(eb, n); -} - -/* - * Called with eb read locked. If the buffer cannot be rewound, the same buffer - * is returned. If rewind operations happen, a fresh buffer is returned. The - * returned buffer is always read-locked. If the returned buffer is not the - * input buffer, the lock on the input buffer is released and the input buffer - * is freed (its refcount is decremented). - */ -static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct extent_buffer *eb, u64 time_seq) -{ - struct extent_buffer *eb_rewin; - struct tree_mod_elem *tm; - - if (!time_seq) - return eb; - - if (btrfs_header_level(eb) == 0) - return eb; - - tm = tree_mod_log_search(fs_info, eb->start, time_seq); - if (!tm) - return eb; - - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - btrfs_set_header_bytenr(eb_rewin, eb->start); - btrfs_set_header_backref_rev(eb_rewin, - btrfs_header_backref_rev(eb)); - btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); - btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); - } else { - eb_rewin = btrfs_clone_extent_buffer(eb); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - } - - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), - eb_rewin, btrfs_header_level(eb_rewin)); - btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); - WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb_rewin; -} - -/* - * get_old_root() rewinds the state of @root's root node to the given @time_seq - * value. If there are no changes, the current root->root_node is returned. If - * anything changed in between, there's a fresh buffer allocated on which the - * rewind operations are done. In any case, the returned buffer is read locked. - * Returns NULL on error (with no locks held). - */ -static inline struct extent_buffer * -get_old_root(struct btrfs_root *root, u64 time_seq) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct tree_mod_elem *tm; - struct extent_buffer *eb = NULL; - struct extent_buffer *eb_root; - u64 eb_root_owner = 0; - struct extent_buffer *old; - struct tree_mod_root *old_root = NULL; - u64 old_generation = 0; - u64 logical; - int level; - - eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (!tm) - return eb_root; - - if (tm->op == MOD_LOG_ROOT_REPLACE) { - old_root = &tm->old_root; - old_generation = tm->generation; - logical = old_root->logical; - level = old_root->level; - } else { - logical = eb_root->start; - level = btrfs_header_level(eb_root); - } - - tm = tree_mod_log_search(fs_info, logical, time_seq); - if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); - if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { - if (!IS_ERR(old)) - free_extent_buffer(old); - btrfs_warn(fs_info, - "failed to read tree block %llu from get_old_root", - logical); - } else { - eb = btrfs_clone_extent_buffer(old); - free_extent_buffer(old); - } - } else if (old_root) { - eb_root_owner = btrfs_header_owner(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(fs_info, logical); - } else { - eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - } - - if (!eb) - return NULL; - if (old_root) { - btrfs_set_header_bytenr(eb, eb->start); - btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, eb_root_owner); - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); - } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, - btrfs_header_level(eb)); - btrfs_tree_read_lock(eb); - if (tm) - __tree_mod_log_rewind(fs_info, eb, time_seq, tm); - else - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb; -} - -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) -{ - struct tree_mod_elem *tm; - int level; - struct extent_buffer *eb_root = btrfs_root_node(root); - - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { - level = tm->old_root.level; - } else { - level = btrfs_header_level(eb_root); - } - free_extent_buffer(eb_root); - - return level; -} - static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -1838,7 +958,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - ret = tree_mod_log_insert_root(root->node, child, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, child); @@ -1918,8 +1038,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1964,8 +1084,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2066,8 +1186,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2120,8 +1240,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2159,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, u64 search; u64 target; u64 nread = 0; + u64 nread_max; struct extent_buffer *eb; u32 nr; u32 blocksize; u32 nscan = 0; - if (level != 1) + if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return; if (!path->nodes[level]) @@ -2172,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, node = path->nodes[level]; + /* + * Since the time between visiting leaves is much shorter than the time + * between visiting nodes, limit read ahead of nodes to 1, to avoid too + * much IO at once (possibly random). + */ + if (path->reada == READA_FORWARD_ALWAYS) { + if (level > 1) + nread_max = node->fs_info->nodesize; + else + nread_max = SZ_128K; + } else { + nread_max = SZ_64K; + } + search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; eb = find_extent_buffer(fs_info, search); @@ -2190,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, if (nr == 0) break; nr--; - } else if (path->reada == READA_FORWARD) { + } else if (path->reada == READA_FORWARD || + path->reada == READA_FORWARD_ALWAYS) { nr++; if (nr >= nritems) break; @@ -2201,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, break; } search = btrfs_node_blockptr(node, nr); - if ((search <= target && target - search <= 65536) || + if (path->reada == READA_FORWARD_ALWAYS || + (search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; - if ((nread > 65536 || nscan > 32)) + if (nread > nread_max || nscan > 32) break; } } @@ -2316,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { + if (p->reada == READA_FORWARD_ALWAYS) + reada_for_search(fs_info, p, level, slot, key->objectid); + /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* @@ -2859,7 +1999,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, } again: - b = get_old_root(root, time_seq); + b = btrfs_get_old_root(root, time_seq); if (!b) { ret = -EIO; goto done; @@ -2914,7 +2054,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3028,8 +2168,8 @@ static void fixup_low_keys(struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE, - GFP_ATOMIC); + ret = btrfs_tree_mod_log_insert_key(t, tslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -3192,7 +2332,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3204,8 +2344,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call tree_mod_log_insert_move here, key removal was - * already fully logged by tree_mod_log_eb_copy above. + * Don't call btrfs_tree_mod_log_insert_move() here, key removal + * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), @@ -3266,15 +2406,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); + ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, - push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3340,7 +2480,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - ret = tree_mod_log_insert_root(root->node, c, 0); + ret = btrfs_tree_mod_log_insert_root(root->node, c, false); BUG_ON(ret < 0); rcu_assign_pointer(root->node, c); @@ -3379,8 +2519,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { - ret = tree_mod_log_insert_move(lower, slot + 1, slot, - nritems - slot); + ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, + slot, nritems - slot); BUG_ON(ret < 0); } memmove_extent_buffer(lower, @@ -3389,8 +2529,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(lower, slot, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3431,9 +2571,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the - * elements below with tree_mod_log_eb_copy. We're holding a - * tree lock on the buffer, which is why we cannot race with - * other tree_mod_log users. + * elements below with btrfs_tree_mod_log_eb_copy(). We're + * holding a tree lock on the buffer, which is why we cannot + * race with other tree_mod_log users. */ ret = insert_new_root(trans, root, path, level + 1); if (ret) @@ -3460,7 +2600,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); ASSERT(btrfs_header_level(c) == level); - ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); + ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -4842,8 +3982,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) { - ret = tree_mod_log_insert_move(parent, slot, slot + 1, - nritems - slot - 1); + ret = btrfs_tree_mod_log_insert_move(parent, slot, + slot + 1, nritems - slot - 1); BUG_ON(ret < 0); } memmove_extent_buffer(parent, @@ -4852,8 +3992,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, slot, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ae776ab3967..f83fd3cbf243 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -342,6 +342,27 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +/* Read ahead values for struct btrfs_path.reada */ +enum { + READA_NONE, + READA_BACK, + READA_FORWARD, + /* + * Similar to READA_FORWARD but unlike it: + * + * 1) It will trigger readahead even for leaves that are not close to + * each other on disk; + * 2) It also triggers readahead for nodes; + * 3) During a search, even when a node or leaf is already in memory, it + * will still trigger readahead for other nodes and leaves that follow + * it. + * + * This is meant to be used only when we know we are iterating over the + * entire tree or a very large part of it. + */ + READA_FORWARD_ALWAYS, +}; + /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point @@ -350,7 +371,6 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -482,16 +502,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -/* delayed seq elem */ -struct seq_list { - struct list_head list; - u64 seq; -}; - -#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } - -#define SEQ_LAST ((u64)-1) - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -572,6 +582,15 @@ enum { /* Indicate that we can't trust the free space tree for caching yet */ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif }; /* @@ -941,10 +960,16 @@ struct btrfs_fs_info { struct work_struct async_data_reclaim_work; struct work_struct preempt_reclaim_work; + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; - struct mutex delete_unused_bgs_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; /* Cached block sizes */ u32 nodesize; @@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); /* @@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } -/* tree mod log functions from ctree.c */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); - /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, @@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror); + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) #define BTRFS_ILOCK_TRY (1U << 1) +#define BTRFS_ILOCK_MMAP (1U << 2) int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); @@ -3189,6 +3207,9 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); @@ -3217,8 +3238,9 @@ extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, - const u64 start, const u64 end, +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, @@ -3405,6 +3427,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + /* * Get the correct offset inside the page of extent buffer. * @@ -3732,8 +3767,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf25401c9768..1a88f6214ebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_delayed_node *node) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); - /* - * Since we're under a transaction reserve_metadata_bytes could - * try to commit the transaction which will make it return - * EAGAIN to make us stop the transaction we have, so return - * ENOSPC instead so that btrfs_dirty_inode knows what to do. - */ - if (ret == -EAGAIN) { - ret = -ENOSPC; - btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - if (!ret) { - node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } else { + /* NO_FLUSH could only fail with -ENOSPC */ + ASSERT(ret == 0 || ret == -ENOSPC); + if (ret) btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - return ret; + } else { + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); + node->inode_id, num_bytes, 1); node->bytes_reserved = num_bytes; } @@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - inode_unlock_shared(inode); - inode_lock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, - delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 63be7d01a9a3..c92d9d4f5f46 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "qgroup.h" #include "space-info.h" +#include "tree-mod-log.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - read_lock(&fs_info->tree_mod_log_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - read_unlock(&fs_info->tree_mod_log_lock); - + seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { @@ -517,23 +509,16 @@ again: int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { - struct seq_list *elem; int ret = 0; + u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); - read_lock(&fs_info->tree_mod_log_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq >= elem->seq) { - btrfs_debug(fs_info, - "holding back delayed_ref %#x.%x, lowest is %#x.%x", - (u32)(seq >> 32), (u32)seq, - (u32)(elem->seq >> 32), (u32)elem->seq); - ret = 1; - } + if (min_seq != 0 && seq >= min_seq) { + btrfs_debug(fs_info, + "holding back delayed_ref %llu, lowest is %llu", + seq, min_seq); + ret = 1; } - read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 3a9c1e046ebe..d05f73530af7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace_item *ptr; u64 src_devid; + if (!dev_root) + return 0; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41b718cfea40..c9a3036c23bf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "discard.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, return ret; } +static int csum_one_extent_buffer(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u8 result[BTRFS_CSUM_SIZE]; + int ret; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); + csum_tree_block(eb, result); + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf_full(eb); + + if (ret < 0) { + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, + "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; + } + write_extent_buffer(eb, result, 0, fs_info->csum_size); + + return 0; +} + +/* Checksum all dirty extent buffers in one bio_vec */ +static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, + struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 cur; + int ret = 0; + + for (cur = bvec_start; cur < bvec_start + bvec->bv_len; + cur += fs_info->nodesize) { + struct extent_buffer *eb; + bool uptodate; + + eb = find_extent_buffer(fs_info, cur); + uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, + fs_info->nodesize); + + /* A dirty eb shouldn't disappear from buffer_radix */ + if (WARN_ON(!eb)) + return -EUCLEAN; + + if (WARN_ON(cur != btrfs_header_bytenr(eb))) { + free_extent_buffer(eb); + return -EUCLEAN; + } + if (WARN_ON(!uptodate)) { + free_extent_buffer(eb); + return -EUCLEAN; + } + + ret = csum_one_extent_buffer(eb); + free_extent_buffer(eb); + if (ret < 0) + return ret; + } + return ret; +} + /* * Checksum a dirty tree block before IO. This has extra checks to make sure * we only fill in the checksum field in the first page of a multi-page block. @@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; - u8 result[BTRFS_CSUM_SIZE]; struct extent_buffer *eb; - int ret; + + if (fs_info->sectorsize < PAGE_SIZE) + return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) @@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - offsetof(struct btrfs_header, fsid), - BTRFS_FSID_SIZE) == 0); - - csum_tree_block(eb, result); - - if (btrfs_header_level(eb)) - ret = btrfs_check_node(eb); - else - ret = btrfs_check_leaf_full(eb); - - if (ret < 0) { - btrfs_print_tree(eb, 0); - btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; - } - write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return csum_one_extent_buffer(eb); } static int check_tree_block_fsid(struct extent_buffer *eb) @@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, static int btree_set_page_dirty(struct page *page) { #ifdef DEBUG + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; struct extent_buffer *eb; + int cur_bit = 0; + u64 page_start = page_offset(page); + + if (fs_info->sectorsize == PAGE_SIZE) { + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); + } + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + ASSERT(subpage->dirty_bitmap); + while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + unsigned long flags; + u64 cur; + u16 tmp = (1 << cur_bit); + + spin_lock_irqsave(&subpage->lock, flags); + if (!(tmp & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + cur_bit++; + continue; + } + spin_unlock_irqrestore(&subpage->lock, flags); + cur = page_start + cur_bit * fs_info->sectorsize; - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + eb = find_extent_buffer(fs_info, cur); + ASSERT(eb); + ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + free_extent_buffer(eb); + + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + } #endif return __set_page_dirty_nobuffers(page); } @@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(fs_info); /* - * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) - * after acquiring fs_info->delete_unused_bgs_mutex. So we + * after acquiring fs_info->reclaim_bgs_lock. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); + + /* + * Reclaim block groups in the reclaim_bgs list after we deleted + * all unused block_groups. This possibly gives us some more free + * space. + */ + btrfs_reclaim_bgs(fs_info); sleep: clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) @@ -2387,8 +2477,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); } + /* Initialize fs_info for all devices in any case */ + btrfs_init_devices_late(fs_info); /* If IGNOREDATACSUMS is set don't bother reading the csum root. */ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { @@ -2792,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); - mutex_init(&fs_info->delete_unused_bgs_mutex); + mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); @@ -2802,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); + INIT_LIST_HEAD(&fs_info->reclaim_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -2890,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; + + fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; + INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) @@ -3009,6 +3104,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } + /* + * btrfs_find_orphan_roots() is responsible for finding all the dead + * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load + * them into the fs_info->fs_roots_radix tree. This must be done before + * calling btrfs_orphan_cleanup() on the tree root. If we don't do it + * first, then btrfs_orphan_cleanup() will delete a dead root's orphan + * item before the root's tree is deleted - this means that if we unmount + * or crash before the deletion completes, on the next mount we will not + * delete what remains of the tree because the orphan item does not + * exists anymore, which is what tells us we have a pending deletion. + */ + ret = btrfs_find_orphan_roots(fs_info); + if (ret) + goto out; + ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto out; @@ -3068,7 +3178,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } - ret = btrfs_find_orphan_roots(fs_info); out: return ret; } @@ -4234,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->reclaim_bgs_work); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 78ad31a59e59..7a28314189b4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2490,19 +2490,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group *block_group; - int readonly = 0; - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - if (!block_group || block_group->ro) - readonly = 1; - if (block_group) - btrfs_put_block_group(block_group); - return readonly; -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3323,6 +3310,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group *cache; + bool must_pin = false; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); @@ -3340,7 +3328,25 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (btrfs_is_zoned(fs_info)) { + /* + * If this is a leaf and there are tree mod log users, we may + * have recorded mod log operations that point to this leaf. + * So we must make sure no one reuses this leaf's extent before + * mod log operations are applied to a node, otherwise after + * rewinding a node using the mod log operations we get an + * inconsistent btree, as the leaf's extent may now be used as + * a node or leaf for another different btree. + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins, it will not be able to + * find a node pointing to this leaf and record operations that + * point to this leaf. + */ + if (btrfs_header_level(buf) == 0 && + test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + must_pin = true; + + if (must_pin || btrfs_is_zoned(fs_info)) { btrfs_redirty_list_add(trans->transaction, buf); pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 191e358f1322..f2d1bb234377 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -2886,6 +2887,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) } /* + * Find extent buffer for a givne bytenr. + * + * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking + * in endio context. + */ +static struct extent_buffer *find_extent_buffer_readpage( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *eb; + + /* + * For regular sectorsize, we can use page->private to grab extent + * buffer + */ + if (fs_info->sectorsize == PAGE_SIZE) { + ASSERT(PagePrivate(page) && page->private); + return (struct extent_buffer *)page->private; + } + + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); + ASSERT(eb); + return eb; +} + +/* * after a readpage IO is done, we need to: * clear the uptodate bits on error * set the uptodate bits if things worked @@ -2954,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) { if (is_data_inode(inode)) ret = btrfs_verify_data_csum(io_bio, - bio_offset, page, start, end, - mirror); + bio_offset, page, start, end); else ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); @@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio) } else { struct extent_buffer *eb; - eb = (struct extent_buffer *)page->private; + eb = find_extent_buffer_readpage(fs_info, page, start); set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = mirror; atomic_dec(&eb->io_pages); @@ -3020,7 +3049,7 @@ readpage_ok: */ if (page->index == end_index && i_size <= end) { u32 zero_start = max(offset_in_page(i_size), - offset_in_page(end)); + offset_in_page(start)); zero_user_segment(page, zero_start, offset_in_page(end) + 1); @@ -3938,7 +3967,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb btrfs_tree_unlock(eb); - if (!ret) + /* + * Either we don't need to submit any tree block, or we're submitting + * subpage eb. + * Subpage metadata doesn't use page locking at all, so we can skip + * the page locking. + */ + if (!ret || fs_info->sectorsize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -3983,12 +4018,11 @@ err_unlock: return ret; } -static void set_btree_ioerr(struct page *page) +static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) { - struct extent_buffer *eb = (struct extent_buffer *)page->private; - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - SetPageError(page); + btrfs_page_set_error(fs_info, page, eb->start, eb->len); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; @@ -3996,7 +4030,6 @@ static void set_btree_ioerr(struct page *page) * If we error out, we should add back the dirty_metadata_bytes * to make it consistent. */ - fs_info = eb->fs_info; percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, fs_info->dirty_metadata_batch); @@ -4040,26 +4073,111 @@ static void set_btree_ioerr(struct page *page) */ switch (eb->log_index) { case -1: - set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0: - set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1: - set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default: BUG(); /* unexpected, logic error */ } } +/* + * The endio specific version which won't touch any unsafe spinlock in endio + * context. + */ +static struct extent_buffer *find_extent_buffer_nolock( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + return eb; + } + rcu_read_unlock(); + return NULL; +} + +/* + * The endio function for subpage extent buffer write. + * + * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() + * after all extent buffers in the page has finished their writeback. + */ +static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 bvec_end = bvec_start + bvec->bv_len - 1; + u64 cur_bytenr = bvec_start; + + ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); + + /* Iterate through all extent buffers in the range */ + while (cur_bytenr <= bvec_end) { + struct extent_buffer *eb; + int done; + + /* + * Here we can't use find_extent_buffer(), as it may + * try to lock eb->refs_lock, which is not safe in endio + * context. + */ + eb = find_extent_buffer_nolock(fs_info, cur_bytenr); + ASSERT(eb); + + cur_bytenr = eb->start + eb->len; + + ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); + done = atomic_dec_and_test(&eb->io_pages); + ASSERT(done); + + if (bio->bi_status || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + ClearPageUptodate(page); + set_btree_ioerr(page, eb); + } + + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + end_extent_buffer_writeback(eb); + /* + * free_extent_buffer() will grab spinlock which is not + * safe in endio context. Thus here we manually dec + * the ref. + */ + atomic_dec(&eb->refs); + } + } + bio_put(bio); +} + static void end_bio_extent_buffer_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return end_bio_subpage_eb_writepage(fs_info, bio); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4071,7 +4189,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - set_btree_ioerr(page); + set_btree_ioerr(page, eb); } end_page_writeback(page); @@ -4085,6 +4203,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +/* + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utilized at minimum to keep the VMM code happy. + * + * Caller should still call write_one_eb() other than this function directly. + * As write_one_eb() has extra preparation before submitting the extent buffer. + */ +static int write_one_subpage_eb(struct extent_buffer *eb, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + bool no_dirty_ebs = false; + int ret; + + /* clear_page_dirty_for_io() in subpage helper needs page locked */ + lock_page(page); + btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); + + /* Check if this is the last dirty bit to update nr_written */ + no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, + eb->start, eb->len); + if (no_dirty_ebs) + clear_page_dirty_for_io(page); + + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, + eb->start, eb->len, eb->start - page_offset(page), + &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, + false); + if (ret) { + btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); + set_btree_ioerr(page, eb); + unlock_page(page); + + if (atomic_dec_and_test(&eb->io_pages)) + end_extent_buffer_writeback(eb); + return -EIO; + } + unlock_page(page); + /* + * Submission finished without problem, if no range of the page is + * dirty anymore, we have submitted a page. Update nr_written in wbc. + */ + if (no_dirty_ebs) + update_nr_written(wbc, 1); + return ret; +} + static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) @@ -4116,6 +4284,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, memzero_extent_buffer(eb, start, end - start); } + if (eb->fs_info->sectorsize < PAGE_SIZE) + return write_one_subpage_eb(eb, wbc, epd); + for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4127,7 +4298,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, end_bio_extent_buffer_writepage, 0, 0, 0, false); if (ret) { - set_btree_ioerr(p); + set_btree_ioerr(p, eb); if (PageWriteback(p)) end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) @@ -4152,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, } /* + * Submit one subpage btree page. + * + * The main difference to submit_eb_page() is: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush bio if we may be unable to fit current extent buffers into + * current bio. + * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + int ret; + + /* Lock and write each dirty extent buffers in the range */ + while (bit_start < nbits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* + * Take private lock to ensure the subpage won't be detached + * in the meantime. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); + if (!((1 << bit_start) & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + bit_start++; + continue; + } + + start = page_start + bit_start * fs_info->sectorsize; + bit_start += sectors_per_node; + + /* + * Here we just want to grab the eb without touching extra + * spin locks, so call find_extent_buffer_nolock(). + */ + eb = find_extent_buffer_nolock(fs_info, start); + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + + /* + * The eb has already reached 0 refs thus find_extent_buffer() + * doesn't return it. We don't need to write back such eb + * anyway. + */ + if (!eb) + continue; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret == 0) { + free_extent_buffer(eb); + continue; + } + if (ret < 0) { + free_extent_buffer(eb); + goto cleanup; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + goto cleanup; + submitted++; + } + return submitted; + +cleanup: + /* We hit error, end bio for the submitted extent buffers */ + end_write_bio(epd, ret); + return ret; +} + +/* * Submit all page(s) of one extent buffer. * * @page: the page of one extent buffer @@ -4183,6 +4446,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return submit_eb_subpage(page, wbc, epd); + spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -4623,10 +4889,8 @@ void extent_readahead(struct readahead_control *rac) int nr; while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = page_offset(pagepool[0]); - u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + u64 contig_start = readahead_pos(rac); + u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); @@ -5440,36 +5704,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - /* - * Lock our eb's refs_lock to avoid races with - * free_extent_buffer. When we get our eb it might be flagged - * with EXTENT_BUFFER_STALE and another task running - * free_extent_buffer might have seen that flag set, - * eb->refs == 2, that the buffer isn't under IO (dirty and - * writeback flags not set) and it's still in the tree (flag - * EXTENT_BUFFER_TREE_REF set), therefore being in the process - * of decrementing the extent buffer's reference count twice. - * So here we could race and increment the eb's reference count, - * clear its stale flag, mark it as dirty and drop our reference - * before the other task finishes executing free_extent_buffer, - * which would later result in an attempt to free an extent - * buffer that is dirty. - */ - if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { - spin_lock(&eb->refs_lock); - spin_unlock(&eb->refs_lock); - } - mark_extent_buffer_accessed(eb, NULL); - return eb; + eb = find_extent_buffer_nolock(fs_info, start); + if (!eb) + return NULL; + /* + * Lock our eb's refs_lock to avoid races with free_extent_buffer(). + * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and + * another task running free_extent_buffer() might have seen that flag + * set, eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process of + * decrementing the extent buffer's reference count twice. So here we + * could race and increment the eb's reference count, clear its stale + * flag, mark it as dirty and drop our reference before the other task + * finishes executing free_extent_buffer, which would later result in + * an attempt to free an extent buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); } - rcu_read_unlock(); - - return NULL; + mark_extent_buffer_accessed(eb, NULL); + return eb; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5565,6 +5821,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } +#if BITS_PER_LONG == 32 + if (start >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, + "extent buffer %llu is beyond 32bit page cache limit", start); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } + if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) + btrfs_warn_32bit_limit(fs_info); +#endif + if (fs_info->sectorsize < PAGE_SIZE && offset_in_page(start) + len > PAGE_SIZE) { btrfs_err(fs_info, @@ -5636,7 +5903,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); - WARN_ON(PageDirty(p)); + WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -5785,28 +6052,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } +static void btree_clear_page_dirty(struct page *page) +{ + ASSERT(PageDirty(page)); + ASSERT(PageLocked(page)); + clear_page_dirty_for_io(page); + xa_lock_irq(&page->mapping->i_pages); + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&page->mapping->i_pages); +} + +static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + bool last; + + /* btree_clear_page_dirty() needs page locked */ + lock_page(page); + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, + eb->len); + if (last) + btree_clear_page_dirty(page); + unlock_page(page); + WARN_ON(atomic_read(&eb->refs) == 0); +} + void clear_extent_buffer_dirty(const struct extent_buffer *eb) { int i; int num_pages; struct page *page; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (!PageDirty(page)) continue; - lock_page(page); - WARN_ON(!PagePrivate(page)); - - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + btree_clear_page_dirty(page); ClearPageError(page); unlock_page(page); } @@ -5827,10 +6117,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - if (!was_dirty) - for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + if (!was_dirty) { + bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + /* + * For subpage case, we can have other extent buffers in the + * same page, and in clear_subpage_extent_buffer_dirty() we + * have to clear page dirty without subpage lock held. + * This can cause race where our page gets dirty cleared after + * we just set it. + * + * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * its page for other reasons, we can use page lock to prevent + * the above race. + */ + if (subpage) + lock_page(eb->pages[0]); + for (i = 0; i < num_pages; i++) + btrfs_page_set_dirty(eb->fs_info, eb->pages[i], + eb->start, eb->len); + if (subpage) + unlock_page(eb->pages[0]); + } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); @@ -6188,12 +6496,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, return ret; } +/* + * Check that the extent buffer is uptodate. + * + * For regular sector size == PAGE_SIZE case, check if @page is uptodate. + * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. + */ +static void assert_eb_page_uptodate(const struct extent_buffer *eb, + struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (fs_info->sectorsize < PAGE_SIZE) { + bool uptodate; + + uptodate = btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len); + WARN_ON(!uptodate); + } else { + WARN_ON(!PageUptodate(page)); + } +} + void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); @@ -6203,7 +6533,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); @@ -6228,7 +6558,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6257,7 +6587,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6315,7 +6645,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, while (len > 0) { page = dst->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(dst, page); cur = min(len, (unsigned long)(PAGE_SIZE - offset)); @@ -6377,7 +6707,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, eb_bitmap_offset(eb, start, nr, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } @@ -6402,7 +6732,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_set) { @@ -6413,7 +6743,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } @@ -6445,7 +6775,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_clear) { @@ -6456,7 +6786,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 824640cb0ace..227215a5722c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,7 @@ enum { struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; +struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; @@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); -struct btrfs_fs_info; -struct btrfs_inode; - int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 47cd3a6dc635..294602f139ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include <linux/highmem.h> #include <linux/sched/mm.h> #include <crypto/hash.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e155f013839..864c08d08a35 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, else num_written = btrfs_buffered_write(iocb, from); - /* - * We also have to set last_sub_trans to the current log transid, - * otherwise subsequent syncs to a file that's been synced in this - * transaction will appear to have already occurred. - */ - spin_lock(&inode->lock); - inode->last_sub_trans = inode->root->log_transid; - spin_unlock(&inode->lock); + btrfs_set_inode_last_sub_trans(inode); + if (num_written > 0) num_written = generic_write_sync(iocb, num_written); @@ -2122,7 +2116,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2135,11 +2129,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) &BTRFS_I(inode)->runtime_flags); /* - * Before we acquired the inode's lock, someone may have dirtied more - * pages in the target range. We need to make sure that writeback for - * any such pages does not start while we are logging the inode, because - * if it does, any of the following might happen when we are not doing a - * full inode sync: + * Before we acquired the inode's lock and the mmap lock, someone may + * have dirtied more pages in the target range. We need to make sure + * that writeback for any such pages does not start while we are logging + * the inode, because if it does, any of the following might happen when + * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors @@ -2154,7 +2148,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2255,7 +2249,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { @@ -2285,7 +2279,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2605,16 +2599,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ -int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, - const u64 start, const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out) +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); - u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); - struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_block_rsv *rsv; unsigned int rsv_count; @@ -2662,10 +2657,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, drop_args.drop_cache = true; while (cur_offset < end) { drop_args.start = cur_offset; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); /* If we are punching a hole decrement the inode's byte count */ if (!extent_info) - btrfs_update_inode_bytes(BTRFS_I(inode), 0, + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); if (ret != -ENOSPC) { /* @@ -2685,8 +2680,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2704,7 +2699,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * know to not set disk_i_size in this area until a new * file extent is inserted here. */ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (ret) { @@ -2723,8 +2718,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, u64 replace_len = drop_args.drop_end - extent_info->file_offset; - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), - path, extent_info, replace_len, + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len, drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2735,9 +2730,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_args.drop_end; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2756,9 +2749,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!extent_info) { - ret = find_first_non_hole(BTRFS_I(inode), &cur_offset, - &len); + cur_offset = drop_args.drop_end; + len = end - cur_offset; + if (!extent_info && len) { + ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; if (ret && !len) { @@ -2771,14 +2765,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, /* * If we were cloning, force the next fsync to be a full one since we * we replaced (or just dropped in the case of cloning holes when - * NO_HOLES is enabled) extents and extent maps. - * This is for the sake of simplicity, and cloning into files larger - * than 16Mb would force the full fsync any way (when - * try_release_extent_mapping() is invoked during page cache truncation. + * NO_HOLES is enabled) file extent items and did not setup new extent + * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); if (ret) goto out_trans; @@ -2804,8 +2795,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, */ if (!extent_info && cur_offset < ino_size && cur_offset < drop_args.drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); @@ -2813,8 +2804,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. */ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_args.drop_end - cur_offset); + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, + drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2822,7 +2813,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); if (ret) { @@ -2868,7 +2859,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -2908,7 +2899,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } } @@ -2967,8 +2958,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, + lockend, NULL, &trans); btrfs_free_path(path); if (ret) goto out; @@ -3009,7 +3000,7 @@ out_only_mutex: ret = ret2; } } - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3335,7 +3326,7 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3377,7 +3368,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3487,7 +3478,7 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, @@ -3496,13 +3487,13 @@ out: return ret; } -static loff_t find_desired_extent(struct inode *inode, loff_t offset, +static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, int whence) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->i_size; + loff_t i_size = inode->vfs_inode.i_size; u64 lockstart; u64 lockend; u64 start; @@ -3525,11 +3516,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); while (start < i_size) { - em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); + em = btrfs_get_extent_fiemap(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; @@ -3551,7 +3541,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, cond_resched(); } free_extent_map(em); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_extent_cached(&inode->io_tree, lockstart, lockend, &cached_state); if (ret) { offset = ret; @@ -3575,7 +3565,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - offset = find_desired_extent(inode, offset, whence); + offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9988decd5717..e54466fc101f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -2539,6 +2540,7 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; @@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, } /* All the region is now unusable. Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length) + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); + } else if (block_group->zone_unusable >= + div_factor_fine(block_group->length, + fs_info->bg_reclaim_threshold)) { + btrfs_mark_bg_to_reclaim(block_group); + } return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 35bfa0533f23..b21d491b3adc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN + * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) { @@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) } inode_lock(inode); } + if (ilock_flags & BTRFS_ILOCK_MMAP) + down_write(&BTRFS_I(inode)->i_mmap_lock); return 0; } @@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) */ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) { + if (ilock_flags & BTRFS_ILOCK_MMAP) + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(inode); else @@ -1516,7 +1521,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, - int *page_started, int force, + int *page_started, unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1530,6 +1535,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; + const bool force = inode->flags & BTRFS_INODE_NODATACOW; path = btrfs_alloc_path(); if (!path) { @@ -1863,23 +1869,16 @@ error: return ret; } -static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) +static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { - - if (!(inode->flags & BTRFS_INODE_NODATACOW) && - !(inode->flags & BTRFS_INODE_PREALLOC)) - return 0; - - /* - * @defrag_bytes is a hint value, no spinlock held here, - * if is not zero, it means the file is defragging. - * Force cow if given extent needs to be defragged. - */ - if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) - return 1; - - return 0; + if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, + 0, NULL)) + return false; + return true; + } + return false; } /* @@ -1891,17 +1890,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page struct writeback_control *wbc) { int ret; - int force_cow = need_force_cow(inode, start, end); const bool zoned = btrfs_is_zoned(inode->root->fs_info); - if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { - ASSERT(!zoned); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 1, nr_written); - } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + if (should_nocow(inode, start, end)) { ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 0, nr_written); + page_started, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) @@ -3099,11 +3093,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page + * @start: logical offset in the file * * The length of such check is always one sector size. */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - u32 bio_offset, struct page *page, u32 pgoff) + u32 bio_offset, struct page *page, u32 pgoff, + u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -3130,8 +3126,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff, - csum, csum_expected, io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, + io_bio->mirror_num); if (io_bio->device) btrfs_dev_stat_inc_and_print(io_bio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3149,10 +3145,9 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) - * @mirror: mirror number */ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror) + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -3184,7 +3179,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, pg_off += sectorsize, bio_offset += sectorsize) { int ret; - ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off); + ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + page_offset(page) + pg_off); if (ret < 0) return -EIO; } @@ -3390,15 +3386,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) int is_dead_root = 0; /* - * this is an orphan in the tree root. Currently these + * This is an orphan in the tree root. Currently these * could come from 2 sources: - * a) a snapshot deletion in progress + * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode - * We need to distinguish those two, as the snapshot - * orphan must not get deleted. - * find_dead_roots already ran before us, so if this - * is a snapshot deletion, we should find the root - * in the fs_roots radix tree. + * We need to distinguish those two, as the orphan item + * for a root must not get deleted before the deletion + * of the snapshot/subvolume's tree completes. + * + * btrfs_find_orphan_roots() ran before us, which has + * found all deleted roots and loaded them into + * fs_info->fs_roots_radix. So here we can find if an + * orphan item corresponds to a deleted root by looking + * up the root from that radix tree. */ spin_lock(&fs_info->fs_roots_radix_lock); @@ -4329,7 +4329,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) goto out_end_trans; } - btrfs_record_root_in_trans(trans, dest); + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); @@ -7023,7 +7027,7 @@ next: if (ret) goto out; } else { - map = kmap(page); + map = kmap_local_page(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); if (pg_offset + copy_size < PAGE_SIZE) { @@ -7031,7 +7035,7 @@ next: PAGE_SIZE - pg_offset - copy_size); } - kunmap(page); + kunmap_local(map); } flush_dcache_page(page); } @@ -7259,6 +7263,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, return em; } +static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + bool readonly = false; + + block_group = btrfs_lookup_block_group(fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = true; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + /* * Check if we can do nocow write into the range [@offset, @offset + @len) * @@ -7910,7 +7927,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, io_bio, - bio_offset, bvec.bv_page, pgoff))) { + bio_offset, bvec.bv_page, + pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), @@ -8169,10 +8187,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; - WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && - fs_info->max_zone_append_size && - bio_op(bio) != REQ_OP_ZONE_APPEND); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); @@ -8403,17 +8417,11 @@ again: * for the finish_ordered_io */ if (TestClearPagePrivate2(page)) { - struct btrfs_ordered_inode_tree *tree; - u64 new_len; - - tree = &inode->ordered_tree; - - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - new_len = start - ordered->file_offset; - if (new_len < ordered->truncated_len) - ordered->truncated_len = new_len; - spin_unlock_irq(&tree->lock); + ordered->truncated_len = min(ordered->truncated_len, + start - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, start, @@ -8539,6 +8547,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: + down_read(&BTRFS_I(inode)->i_mmap_lock); lock_page(page); size = i_size_read(inode); @@ -8567,6 +8576,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -8619,11 +8629,10 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); @@ -8632,6 +8641,7 @@ again: out_unlock: unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, @@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->i_mmap_lock); return inode; } @@ -9008,7 +9019,7 @@ int __init btrfs_init_cachep(void) btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_RED_ZONE, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_bitmap_cachep) goto fail; @@ -9101,8 +9112,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } /* * We need to find a free sequence number both in the source and @@ -9406,8 +9420,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) @@ -9877,6 +9894,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; + int qgroup_released; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9889,16 +9907,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(inode, file_offset, len); - if (ret < 0) - return ERR_PTR(ret); + qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); + if (qgroup_released < 0) + return ERR_PTR(qgroup_released); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, - true, ret); + true, qgroup_released); if (ret) - return ERR_PTR(ret); + goto free_qgroup; return trans; } @@ -9909,21 +9927,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; - extent_info.qgroup_reserved = ret; + extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); + if (!path) { + ret = -ENOMEM; + goto free_qgroup; + } - ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, + ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); if (ret) - return ERR_PTR(ret); - + goto free_qgroup; return trans; + +free_qgroup: + /* + * We have released qgroup data range at the beginning of the function, + * and normally qgroup_released bytes will be freed when committing + * transaction. + * But if we error out early, we have to free what we have released + * or we leak qgroup data reservation. + */ + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, qgroup_released, + BTRFS_QGROUP_RSV_DATA); + return ERR_PTR(ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -10588,6 +10620,8 @@ static const struct inode_operations btrfs_dir_inode_operations = { .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { @@ -10641,6 +10675,8 @@ static const struct inode_operations btrfs_file_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8d53fea4c61..ee1dbabb5d3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -26,6 +26,7 @@ #include <linux/btrfs.h> #include <linux/uaccess.h> #include <linux/iversion.h> +#include <linux/fileattr.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -153,16 +154,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl); } -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - /* * Check if @flags are a supported and valid set of FS_*_FL flags and that * the old and new flags are not conflicting @@ -201,9 +192,22 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +/* + * Set flags/xflags from the internal inode flags. The remaining items of + * fsxattr are zeroed. + */ +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + return 0; +} + +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(file); + struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; @@ -213,34 +217,21 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) const char *comp = NULL; u32 binode_flags; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (btrfs_root_readonly(root)) return -EROFS; - if (copy_from_user(&fsflags, arg, sizeof(fsflags))) - return -EFAULT; - - ret = mnt_want_write_file(file); - if (ret) - return ret; + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - inode_lock(inode); - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); - - ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); - if (ret) - goto out_unlock; - ret = check_fsflags(old_fsflags, fsflags); if (ret) - goto out_unlock; + return ret; ret = check_fsflags_compatible(fs_info, fsflags); if (ret) - goto out_unlock; + return ret; binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) @@ -263,6 +254,14 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOATIME; else binode_flags &= ~BTRFS_INODE_NOATIME; + + /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ + if (!fa->flags_valid) { + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + goto update_flags; + } + if (fsflags & FS_DIRSYNC_FL) binode_flags |= BTRFS_INODE_DIRSYNC; else @@ -303,10 +302,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_unlock; - } + if (IS_SWAPFILE(inode)) + return -ETXTBSY; binode_flags |= BTRFS_INODE_COMPRESS; binode_flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -323,10 +320,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * 2 for properties */ trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, @@ -344,6 +339,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } +update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); @@ -352,44 +348,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) out_end_trans: btrfs_end_transaction(trans); - out_unlock: - inode_unlock(inode); - mnt_drop_write_file(file); return ret; } -/* - * Translate btrfs internal inode flags to xflags as expected by the - * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are - * silently dropped. - */ -static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) -{ - unsigned int xflags = 0; - - if (flags & BTRFS_INODE_APPEND) - xflags |= FS_XFLAG_APPEND; - if (flags & BTRFS_INODE_IMMUTABLE) - xflags |= FS_XFLAG_IMMUTABLE; - if (flags & BTRFS_INODE_NOATIME) - xflags |= FS_XFLAG_NOATIME; - if (flags & BTRFS_INODE_NODUMP) - xflags |= FS_XFLAG_NODUMP; - if (flags & BTRFS_INODE_SYNC) - xflags |= FS_XFLAG_SYNC; - - return xflags; -} - -/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ -static int check_xflags(unsigned int flags) -{ - if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | - FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) - return -EOPNOTSUPP; - return 0; -} - bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { @@ -402,111 +363,6 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } -/* - * Set the xflags from the internal inode flags. The remaining items of fsxattr - * are zeroed. - */ -static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - struct fsxattr fa; - - simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags)); - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; - - return 0; -} - -static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) -{ - struct inode *inode = file_inode(file); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; - struct btrfs_trans_handle *trans; - struct fsxattr fa, old_fa; - unsigned old_flags; - unsigned old_i_flags; - int ret = 0; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - - if (btrfs_root_readonly(root)) - return -EROFS; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - ret = check_xflags(fa.fsx_xflags); - if (ret) - return ret; - - if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(inode); - - old_flags = binode->flags; - old_i_flags = inode->i_flags; - - simple_fill_fsxattr(&old_fa, - btrfs_inode_flags_to_xflags(binode->flags)); - ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (ret) - goto out_unlock; - - if (fa.fsx_xflags & FS_XFLAG_SYNC) - binode->flags |= BTRFS_INODE_SYNC; - else - binode->flags &= ~BTRFS_INODE_SYNC; - if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) - binode->flags |= BTRFS_INODE_IMMUTABLE; - else - binode->flags &= ~BTRFS_INODE_IMMUTABLE; - if (fa.fsx_xflags & FS_XFLAG_APPEND) - binode->flags |= BTRFS_INODE_APPEND; - else - binode->flags &= ~BTRFS_INODE_APPEND; - if (fa.fsx_xflags & FS_XFLAG_NODUMP) - binode->flags |= BTRFS_INODE_NODUMP; - else - binode->flags &= ~BTRFS_INODE_NODUMP; - if (fa.fsx_xflags & FS_XFLAG_NOATIME) - binode->flags |= BTRFS_INODE_NOATIME; - else - binode->flags &= ~BTRFS_INODE_NOATIME; - - /* 1 item for the inode */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - - btrfs_end_transaction(trans); - -out_unlock: - if (ret) { - binode->flags = old_flags; - inode->i_flags = old_i_flags; - } - - inode_unlock(inode); - mnt_drop_write_file(file); - - return ret; -} - static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -697,8 +553,6 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); @@ -707,8 +561,22 @@ static noinline int create_subvol(struct inode *dir, key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); - if (ret) + if (ret) { + /* + * Since we don't abort the transaction in this case, free the + * tree block so that we don't leak space and leave the + * filesystem in an inconsistent state (an extent item in the + * extent tree without backreferences). Also no need to have + * the tree block locked since it is not in any tree at this + * point, so no other task can find it and use it. + */ + btrfs_free_tree_block(trans, root, leaf, 0, 1); + free_extent_buffer(leaf); goto fail; + } + + free_extent_buffer(leaf); + leaf = NULL; key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); @@ -721,7 +589,12 @@ static noinline int create_subvol(struct inode *dir, /* Freeing will be done in btrfs_put_root() of new_root */ anon_dev = 0; - btrfs_record_root_in_trans(trans, new_root); + ret = btrfs_record_root_in_trans(trans, new_root); + if (ret) { + btrfs_put_root(new_root); + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); @@ -1014,7 +887,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); return error; } @@ -1612,7 +1485,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; } else { @@ -1621,13 +1494,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ret = cluster_pages_for_defrag(inode, pages, i, cluster); } if (ret < 0) { - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (newer_than) { if (newer_off == (u64)-1) @@ -1675,9 +1548,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (do_compress) { - inode_lock(inode); + btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); } if (!file) kfree(ra); @@ -3112,9 +2985,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (!err) { fsnotify_rmdir(dir, dentry); d_delete(dentry); @@ -3123,7 +2996,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, out_dput: dput(dentry); out_unlock_dir: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -4915,10 +4788,6 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case FS_IOC_GETFSLABEL: @@ -5044,10 +4913,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); - case FS_IOC_FSGETXATTR: - return btrfs_ioctl_fsgetxattr(file, argp); - case FS_IOC_FSSETXATTR: - return btrfs_ioctl_fssetxattr(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: return btrfs_ioctl_get_subvol_info(file, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: @@ -5067,12 +4932,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * handling is necessary. */ switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 9084a950dc09..cd042c7567a4 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; char *data_in; - char *cpage_out; + char *cpage_out, *sizes_ptr; int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; @@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - cpage_out = kmap(pages[0]); - write_compress_length(cpage_out, tot_out); - - kunmap(pages[0]); + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, tot_out); + kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 985a21558437..07b0b4218791 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } -/* - * helper to check if a given offset is inside a given entry - */ -static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) -{ - if (file_offset < entry->file_offset || - entry->file_offset + entry->num_bytes <= file_offset) - return 0; - return 1; -} - static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { @@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, if (tree->last) { entry = rb_entry(tree->last, struct btrfs_ordered_extent, rb_node); - if (offset_in_entry(entry, file_offset)) + if (in_range(file_offset, entry->file_offset, entry->num_bytes)) return tree->last; } ret = __tree_search(root, file_offset, &prev); @@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) + if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) goto out; dec_start = max(*file_offset, entry->file_offset); @@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) goto out; if (io_size > entry->bytes_left) @@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; if (entry) refcount_inc(&entry->refs); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 99e0853e4d3b..e60c07f36427 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -39,8 +39,8 @@ struct btrfs_ordered_sum { */ enum { /* - * Different types for direct io, one and only one of the 4 type can - * be set when creating ordered extent. + * Different types for ordered extents, one and only one of the 4 types + * need to be set when creating ordered extent. * * REGULAR: For regular non-compressed COW write * NOCOW: For NOCOW write into existing non-hole extent diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 14ff388fd3bd..2319c923c9e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "block-group.h" #include "sysfs.h" +#include "tree-mod-log.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -226,7 +227,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, { struct btrfs_qgroup_list *list; - btrfs_sysfs_del_one_qgroup(fs_info, qgroup); list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -243,7 +243,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, list_del(&list->next_member); kfree(list); } - kfree(qgroup); } /* must be called with qgroup_lock held */ @@ -569,6 +568,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(fs_info, qgroup); + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting @@ -1578,6 +1579,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); + + /* + * Remove the qgroup from sysfs now without holding the qgroup_lock + * spinlock, since the sysfs_remove_group() function needs to take + * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). + */ + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -2631,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record->data_rsv, BTRFS_QGROUP_RSV_DATA); /* - * Use SEQ_LAST as time_seq to do special search, which - * doesn't lock tree or delayed_refs and search current - * root. It's safe inside commit_transaction(). + * Use BTRFS_SEQ_LAST as time_seq to do special search, + * which doesn't lock tree or delayed_refs and search + * current root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots, false); + record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -3535,37 +3544,19 @@ static int try_flush_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle *trans; int ret; - bool can_commit = true; - /* - * If current process holds a transaction, we shouldn't flush, as we - * assume all space reservation happens before a transaction handle is - * held. - * - * But there are cases like btrfs_delayed_item_reserve_metadata() where - * we try to reserve space with one transction handle already held. - * In that case we can't commit transaction, but at least try to end it - * and hope the started data writes can free some space. - */ - if (current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB) - can_commit = false; + /* Can't hold an open transaction or we run the risk of deadlocking */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_SEND_TRANS_STUB); + if (WARN_ON(current->journal_info && + current->journal_info != BTRFS_SEND_TRANS_STUB)) + return 0; /* * We don't want to run flush again and again, so if there is a running * one, we won't try to start a new flush, but exit directly. */ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { - /* - * We are already holding a transaction, thus we can block other - * threads from flushing. So exit right now. This increases - * the chance of EDQUOT for heavy load and near limit cases. - * But we can argue that if we're already near limit, EDQUOT is - * unavoidable anyway. - */ - if (!can_commit) - return 0; - wait_event(root->qgroup_flush_wait, !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); return 0; @@ -3582,10 +3573,7 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; } - if (can_commit) - ret = btrfs_commit_transaction(trans); - else - ret = btrfs_end_transaction(trans); + ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); @@ -3638,8 +3626,7 @@ cleanup: qgroup_unreserve_range(inode, reserved, start, len); out: if (new_reserved) { - extent_changeset_release(reserved); - kfree(reserved); + extent_changeset_free(reserved); *reserved_ret = NULL; } return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8c31357f08ed..244d499ebc72 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } /* then add the parity stripe */ p = rbio_pstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); if (has_qstripe) { @@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) */ p = rbio_qstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); @@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) copy_page(pointers[nr_data], pointers[0]); run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - - - for (stripe = 0; stripe < rbio->real_stripes; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } /* @@ -1634,7 +1633,8 @@ struct btrfs_plug_cb { /* * rbios on the plug list are sorted for easier merging. */ -static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); @@ -1776,6 +1776,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { int pagenr, stripe; void **pointers; + void **unmap_array; int faila = -1, failb = -1; struct page *page; blk_status_t err; @@ -1787,6 +1788,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto cleanup_io; } + /* + * Store copy of pointers that does not get reordered during + * reconstruction so that kunmap_local works. + */ + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!unmap_array) { + err = BLK_STS_RESOURCE; + goto cleanup_pointers; + } + faila = rbio->faila; failb = rbio->failb; @@ -1808,8 +1819,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) !test_bit(pagenr, rbio->dbitmap)) continue; - /* setup our array of pointers with pages - * from each stripe + /* + * Setup our array of pointers with pages from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* @@ -1823,7 +1837,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } else { page = rbio_stripe_page(rbio, stripe, pagenr); } - pointers[stripe] = kmap(page); + pointers[stripe] = kmap_local_page(page); + unmap_array[stripe] = pointers[stripe]; } /* all raid6 handling here */ @@ -1916,24 +1931,14 @@ pstripe: } } } - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * if we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - kunmap(page); - } + for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) + kunmap_local(unmap_array[stripe]); } err = BLK_STS_OK; cleanup: + kfree(unmap_array); +cleanup_pointers: kfree(pointers); cleanup_io: @@ -2358,13 +2363,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; } SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap(q_page); + pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap(p_page); + pointers[nr_data] = kmap_local_page(p_page); for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { struct page *p; @@ -2372,7 +2377,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } if (has_qstripe) { @@ -2387,22 +2392,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap(p); + parity = kmap_local_page(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) copy_page(parity, pointers[rbio->scrubp]); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); - kunmap(p); + kunmap_local(parity); - for (stripe = 0; stripe < nr_data; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } - kunmap(p_page); + kunmap_local(pointers[nr_data]); __free_page(p_page); if (q_page) { - kunmap(q_page); + kunmap_local(pointers[rbio->real_stripes - 1]); __free_page(q_page); } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 20fd4aa48a8c..06713a8fe26b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err) /* find extent */ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - eb->start >> PAGE_SHIFT); + eb->start >> fs_info->sectorsize_bits); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_SHIFT, 1); + logical >> fs_info->sectorsize_bits, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); spin_unlock(&fs_info->reada_lock); @@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> PAGE_SHIFT), - zone); + (unsigned long)(zone->end >> fs_info->sectorsize_bits), + zone); if (ret == -EEXIST) { kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_SHIFT, 1); + logical >> fs_info->sectorsize_bits, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); else @@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, u64 length; int real_stripes; int nzones = 0; - unsigned long index = logical >> PAGE_SHIFT; + unsigned long index = logical >> fs_info->sectorsize_bits; int dev_replace_is_ongoing; int have_zone = 0; @@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { int i; - unsigned long index = re->logical >> PAGE_SHIFT; + unsigned long index = re->logical >> fs_info->sectorsize_bits; spin_lock(&fs_info->reada_lock); if (--re->refcnt) { @@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, static void reada_zone_release(struct kref *kref) { struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + struct btrfs_fs_info *fs_info = zone->device->fs_info; - lockdep_assert_held(&zone->device->fs_info->reada_lock); + lockdep_assert_held(&fs_info->reada_lock); radix_tree_delete(&zone->device->reada_zones, - zone->end >> PAGE_SHIFT); + zone->end >> fs_info->sectorsize_bits); kfree(zone); } @@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) { int i; - unsigned long index = zone->end >> PAGE_SHIFT; + unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; for (i = 0; i < zone->ndevs; ++i) { struct reada_zone *peer; @@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev) (void **)&zone, index, 1); if (ret == 0) break; - index = (zone->end >> PAGE_SHIFT) + 1; + index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; if (zone->locked) { if (zone->elems > top_locked_elems) { top_locked_elems = zone->elems; @@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev) * plugging to speed things up */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_SHIFT, 1); + dev->reada_next >> fs_info->sectorsize_bits, 1); if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { @@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev) } re = NULL; ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_SHIFT, 1); + dev->reada_next >> fs_info->sectorsize_bits, 1); } if (ret == 0) { spin_unlock(&fs_info->reada_lock); @@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) pr_cont(" curr off %llu", device->reada_next - zone->start); pr_cont("\n"); - index = (zone->end >> PAGE_SHIFT) + 1; + index = (zone->end >> fs_info->sectorsize_bits) + 1; } cnt = 0; index = 0; @@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } pr_cont("\n"); - index = (re->logical >> PAGE_SHIFT) + 1; + index = (re->logical >> fs_info->sectorsize_bits) + 1; if (++cnt > 15) break; } @@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; if (!re->scheduled) { - index = (re->logical >> PAGE_SHIFT) + 1; + index = (re->logical >> fs_info->sectorsize_bits) + 1; continue; } pr_debug("re: logical %llu size %u list empty %d scheduled %d", @@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } pr_cont("\n"); - index = (re->logical >> PAGE_SHIFT) + 1; + index = (re->logical >> fs_info->sectorsize_bits) + 1; } spin_unlock(&fs_info->reada_lock); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 762881b777b3..f4ec06b53aa0 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -478,9 +478,9 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; - ret = btrfs_replace_file_extents(inode, path, drop_start, - new_key.offset + datal - 1, &clone_info, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + drop_start, new_key.offset + datal - 1, + &clone_info, &trans); if (ret) goto out; } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -567,8 +567,8 @@ process_slot: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_replace_file_extents(inode, path, last_dest_end, - destoff + len - 1, NULL, &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; @@ -604,6 +604,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } +static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + down_write(&BTRFS_I(inode1)->i_mmap_lock); + down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); +} + +static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +{ + up_write(&BTRFS_I(inode1)->i_mmap_lock); + up_write(&BTRFS_I(inode2)->i_mmap_lock); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { @@ -820,6 +834,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, len, remap_flags); } +static bool file_sync_write(const struct file *file) +{ + if (file->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(file))) + return true; + + return false; +} + loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) @@ -832,10 +856,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (same_inode) - inode_lock(src_inode); - else + if (same_inode) { + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + } else { lock_two_nondirectories(src_inode, dst_inode); + btrfs_double_mmap_lock(src_inode, dst_inode); + } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); @@ -848,10 +874,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: - if (same_inode) - inode_unlock(src_inode); - else + if (same_inode) { + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + } else { + btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); + } + + /* + * If either the source or the destination file was opened with O_SYNC, + * O_DSYNC or has the S_SYNC attribute, fsync both the destination and + * source files/ranges, so that after a successful return (0) followed + * by a power failure results in the reflinked data to be readable from + * both files/ranges. + */ + if (ret == 0 && len > 0 && + (file_sync_write(src_file) || file_sync_write(dst_file))) { + ret = btrfs_sync_file(src_file, off, off + len - 1, 0); + if (ret == 0) + ret = btrfs_sync_file(dst_file, destoff, + destoff + len - 1, 0); + } return ret < 0 ? ret : len; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 232d5da7b7be..b70be2ac2e9e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - btrfs_panic(fs_info, -EEXIST, + btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); + return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); @@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; - int ret; + int ret = 0; + bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; + /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves @@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; } + /* + * We have changed references at this point, we must abort the + * transaction if anything fails. + */ + must_abort = true; + memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); @@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); - BUG_ON(ret); + if (ret) + goto fail; + kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); - BUG_ON(IS_ERR(reloc_root)); + if (IS_ERR(reloc_root)) { + ret = PTR_ERR(reloc_root); + goto abort; + } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; +fail: + kfree(root_item); +abort: + if (must_abort) + btrfs_abort_transaction(trans, ret); + return ERR_PTR(ret); } /* @@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) trans->block_rsv = rsv; + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!have_reloc_root(root)) - goto out; + return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); - BUG_ON(ret); btrfs_put_root(reloc_root); -out: - return 0; + return ret; } /* @@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1205,7 +1233,11 @@ again: if (cow) { ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return ret; + } } if (next_key) { @@ -1217,7 +1249,7 @@ again: parent = eb; while (1) { level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); + ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, &key, &slot); if (ret < 0) @@ -1265,7 +1297,11 @@ again: ret = btrfs_cow_block(trans, dest, eb, parent, slot, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + break; + } } btrfs_tree_unlock(parent); @@ -1289,7 +1325,11 @@ again: path->lowest_level = level; ret = btrfs_search_slot(trans, src, &key, path, 0, 1); path->lowest_level = 0; - BUG_ON(ret); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } /* * Info qgroup to trace both subtrees. @@ -1329,27 +1369,39 @@ again: ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_unlock_up_safe(path, 0); @@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level, /* * Insert current subvolume into reloc_control::dirty_subvol_roots */ -static void insert_dirty_subvol(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root) +static int insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; + int ret; /* @root must be a subvolume tree root with a valid reloc tree */ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); @@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, sizeof(reloc_root_item->drop_progress)); btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; if (list_empty(&root->reloc_dirty_list)) { btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } + + return 0; } static int clean_dirty_subvols(struct reloc_control *rc) @@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (ret == 0) - insert_dirty_subvol(trans, rc, root); + if (ret == 0) { + ret = insert_dirty_subvol(trans, rc, root); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (trans) btrfs_end_transaction_throttle(trans); @@ -1825,8 +1885,18 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * Even if we have an error we need this reloc root + * back on our list so we can clean up properly. + */ + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_abort_transaction(trans, (int)PTR_ERR(root)); + if (!err) + err = PTR_ERR(root); + break; + } + ASSERT(root->reloc_root == reloc_root); /* * set reference count to 1, so btrfs_recover_relocation @@ -1834,16 +1904,27 @@ again: */ if (!err) btrfs_set_root_refs(&reloc_root->root_item, 1); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + /* + * Even if we have an error we need this reloc root back on our + * list so we can clean up properly. + */ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + break; + } } list_splice(&reloc_roots, &rc->reloc_roots); if (!err) - btrfs_commit_transaction(trans); + err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; @@ -1888,8 +1969,29 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * For recovery we read the fs roots on mount, + * and if we didn't find the root then we marked + * the reloc root as a garbage root. For normal + * relocation obviously the root should exist in + * memory. However there's no reason we can't + * handle the error properly here just in case. + */ + ASSERT(0); + ret = PTR_ERR(root); + goto out; + } + if (root->reloc_root != reloc_root) { + /* + * This is actually impossible without something + * going really wrong (like weird race condition + * or cosmic rays). + */ + ASSERT(0); + ret = -EINVAL; + goto out; + } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { @@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + + /* + * This should succeed, since we can't have a reloc root without having + * already looked up the actual root and created the reloc root for this + * root. + * + * However if there's some sort of corruption where we have a ref to a + * reloc root without a corresponding root this could return ENOENT. + */ + if (IS_ERR(root)) { + ASSERT(0); + return PTR_ERR(root); + } + if (root->reloc_root != reloc_root) { + ASSERT(0); + btrfs_err(fs_info, + "root %llu has two reloc roots associated with it", + reloc_root->root_key.offset); + btrfs_put_root(root); + return -EUCLEAN; + } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); @@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; + int ret; next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); - BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)); + + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a + * block that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve + * improperly. + * + * Both of these cases indicate file system corruption, or a bug + * in the backref walking code. + */ + if (!root) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); + ret = record_reloc_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); break; } - btrfs_record_root_in_trans(trans, root); + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); root = root->reloc_root; + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); + if (next->new_bytenr != root->node->start) { - BUG_ON(next->new_bytenr); - BUG_ON(!list_empty(&next->list)); + /* + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set and this shouldn't be in the changed + * list. If it is then we have multiple roots pointing + * at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. + */ + ASSERT(next->new_bytenr == 0); + ASSERT(list_empty(&next->list)); + if (next->new_bytenr || !list_empty(&next->list)) { + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); + } + next->new_bytenr = root->node->start; btrfs_put_root(next->root); next->root = btrfs_grab_root(root); @@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!next || next->level <= node->level) break; } - if (!root) - return NULL; + if (!root) { + /* + * This can happen if there's fs corruption or if there's a bug + * in the backref lookup code. + */ + ASSERT(0); + return ERR_PTR(-ENOENT); + } next = node; /* setup backref node path for btrfs_reloc_cow_block */ @@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); + + /* + * This can occur if we have incomplete extent refs leading all + * the way up a particular path, in this case return -EUCLEAN. + */ + if (!root) + return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) @@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, int slot; int ret = 0; - BUG_ON(lowest && node->eb); + /* + * If we are lowest then this is the first time we're processing this + * block, and thus shouldn't have an eb associated with it yet. + */ + ASSERT(!lowest || !node->eb); path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; @@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, upper = edge->node[UPPER]; root = select_reloc_root(trans, rc, upper, edges); - BUG_ON(!root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto next; + } if (upper->eb && !upper->locked) { if (!lowest) { @@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, free_extent_buffer(eb); if (ret < 0) goto next; - BUG_ON(node->eb != eb); + /* + * We've just COWed this block, it should have updated + * the correct backref node entry. + */ + ASSERT(node->eb == eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb)); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); + if (!ret) + ret = btrfs_drop_subtree(trans, root, eb, + upper->eb); + if (ret) + btrfs_abort_transaction(trans, ret); } next: if (!upper->pending) @@ -2302,7 +2498,12 @@ next: } path->lowest_level = 0; - BUG_ON(ret == -ENOSPC); + + /* + * We should have allocated all of our space in the block rsv and thus + * shouldn't ENOSPC. + */ + ASSERT(ret != -ENOSPC); return ret; } @@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, BUG_ON(node->processed); root = select_one_root(node); - if (root == ERR_PTR(-ENOENT)) { - update_processed_blocks(rc, node); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + + /* See explanation in select_one_root for the -EUCLEAN case. */ + ASSERT(ret == -ENOENT); + if (ret == -ENOENT) { + ret = 0; + update_processed_blocks(rc, node); + } goto out; } if (root) { if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - BUG_ON(node->new_bytenr); - BUG_ON(!list_empty(&node->list)); - btrfs_record_root_in_trans(trans, root); + /* + * This block was the root block of a root, and this is + * the first time we're processing the block and thus it + * should not have had the ->new_bytenr modified and + * should have not been included on the changed list. + * + * However in the case of corruption we could have + * multiple refs pointing to the same block improperly, + * and thus we would trip over these checks. ASSERT() + * for the developer case, because it could indicate a + * bug in the backref code, however error out for a + * normal user in the case of corruption. + */ + ASSERT(node->new_bytenr == 0); + ASSERT(list_empty(&node->list)); + if (node->new_bytenr || !list_empty(&node->list)) { + btrfs_err(root->fs_info, + "bytenr %llu has improper references to it", + node->bytenr); + ret = -EUCLEAN; + goto out; + } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + goto out; + /* + * Another thread could have failed, need to check if we + * have reloc_root actually set. + */ + if (!root->reloc_root) { + ret = -ENOENT; + goto out; + } root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); @@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( return btrfs_end_transaction(trans); } - inode_lock(&inode->vfs_inode); + btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - inode_unlock(&inode->vfs_inode); + btrfs_inode_unlock(&inode->vfs_inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, @@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc) mutex_unlock(&fs_info->reloc_mutex); } -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { @@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - btrfs_commit_transaction(trans); - return 0; + return btrfs_commit_transaction(trans); } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) @@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_path *path; struct btrfs_extent_item *ei; u64 flags; - u32 item_size; int ret; int err = 0; int progress = 0; @@ -3334,19 +3556,7 @@ restart: ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - err = -EINVAL; - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, err); - break; - } else { - BUG(); - } + flags = btrfs_extent_flags(path->nodes[0], ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); @@ -3445,7 +3655,9 @@ restart: err = PTR_ERR(trans); goto out_free; } - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); + if (ret && !err) + err = ret; out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) @@ -3488,6 +3700,35 @@ out: return ret; } +static void delete_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + ret = btrfs_del_item(trans, root, path); +out: + if (ret) + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); +} + /* * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 @@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, goto out; err = __insert_orphan_inode(trans, root, objectid); - BUG_ON(err); + if (err) + goto out; inode = btrfs_iget(fs_info->sb, objectid, root); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + delete_orphan_inode(trans, root, objectid); + err = PTR_ERR(inode); + inode = NULL; + goto out; + } BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); @@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) } err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); /* -ENOMEM or logic error */ + ASSERT(err != -EEXIST); + if (err) { + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(fs_root); + btrfs_end_transaction(trans); + goto out_unset; + } fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } @@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3d9088eab2fc..485cda3eb8d7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -206,9 +206,6 @@ struct full_stripe_lock { struct mutex mutex; }; -static void scrub_pending_bio_inc(struct scrub_ctx *sctx); -static void scrub_pending_bio_dec(struct scrub_ctx *sctx); -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, @@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); -static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) @@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_lock(&cache->lock); if (!cache->to_copy) { spin_unlock(&cache->lock); - ro_set = 0; - goto done; + btrfs_put_block_group(cache); + goto skip; } spin_unlock(&cache->lock); } @@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, cache, found_key.offset)) ro_set = 0; -done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f323859156b..55741adf9071 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx) path = alloc_path_for_send(); if (!path) return -ENOMEM; + path->reada = READA_FORWARD_ALWAYS; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -6688,15 +6689,35 @@ out: return ret; } -static int tree_move_down(struct btrfs_path *path, int *level) +static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; + struct extent_buffer *parent = path->nodes[*level]; + int slot = path->slots[*level]; + const int nritems = btrfs_header_nritems(parent); + u64 reada_max; + u64 reada_done = 0; BUG_ON(*level == 0); - eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); + /* + * Trigger readahead for the next leaves we will process, so that it is + * very likely that when we need them they are already in memory and we + * will not block on disk IO. For nodes we only do readahead for one, + * since the time window between processing nodes is typically larger. + */ + reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); + + for (slot++; slot < nritems && reada_done < reada_max; slot++) { + if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { + btrfs_readahead_node_child(parent, slot); + reada_done += eb->fs_info->nodesize; + } + } + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; @@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, static int tree_advance(struct btrfs_path *path, int *level, int root_level, int allow_down, - struct btrfs_key *key) + struct btrfs_key *key, + u64 reada_min_gen) { int ret; if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(path, level, root_level); } else { - ret = tree_move_down(path, level); + ret = tree_move_down(path, level, reada_min_gen); } if (ret >= 0) { if (*level == 0) @@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, u64 right_blockptr; u64 left_gen; u64 right_gen; + u64 reada_min_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } + /* + * Our right root is the parent root, while the left root is the "send" + * root. We know that all new nodes/leaves in the left root must have + * a generation greater than the right root's generation, so we trigger + * readahead for those nodes and leaves of the left root, as we know we + * will need to read them at some point. + */ + reada_min_gen = btrfs_header_generation(right_root->commit_root); up_read(&fs_info->commit_root_sem); if (left_level == 0) @@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, - &left_key); + &left_key, reada_min_gen); if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) @@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, - &right_key); + &right_key, reada_min_gen); if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2da6177f4b0b..2dc674b7c3b1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. */ - ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); - delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index c69049e7daa9..2d19089ab625 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -4,6 +4,64 @@ #include "ctree.h" #include "subpage.h" +/* + * Subpage (sectorsize < PAGE_SIZE) support overview: + * + * Limitations: + * + * - Only support 64K page size for now + * This is to make metadata handling easier, as 64K page would ensure + * all nodesize would fit inside one page, thus we don't need to handle + * cases where a tree block crosses several pages. + * + * - Only metadata read-write for now + * The data read-write part is in development. + * + * - Metadata can't cross 64K page boundary + * btrfs-progs and kernel have done that for a while, thus only ancient + * filesystems could have such problem. For such case, do a graceful + * rejection. + * + * Special behavior: + * + * - Metadata + * Metadata read is fully supported. + * Meaning when reading one tree block will only trigger the read for the + * needed range, other unrelated range in the same page will not be touched. + * + * Metadata write support is partial. + * The writeback is still for the full page, but we will only submit + * the dirty extent buffers in the page. + * + * This means, if we have a metadata page like this: + * + * Page offset + * 0 16K 32K 48K 64K + * |/////////| |///////////| + * \- Tree block A \- Tree block B + * + * Even if we just want to writeback tree block A, we will also writeback + * tree block B if it's also dirty. + * + * This may cause extra metadata writeback which results more COW. + * + * Implementation: + * + * - Common + * Both metadata and data will use a new structure, btrfs_subpage, to + * record the status of each sector inside a page. This provides the extra + * granularity needed. + * + * - Metadata + * Since we have multiple tree blocks inside one page, we can't rely on page + * locking anymore, or we will have greatly reduced concurrency or even + * deadlocks (hold one tree lock while trying to lock another tree lock in + * the same page). + * + * Thus for metadata locking, subpage support relies on io_tree locking only. + * This means a slightly higher tree locking latency. + */ + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { @@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap |= tmp; + spin_unlock_irqrestore(&subpage->lock, flags); + set_page_dirty(page); +} + +/* + * Extra clear_and_test function for subpage dirty bitmap. + * + * Return true if we're the last bits in the dirty_bitmap and clear the + * dirty_bitmap. + * Return false otherwise. + * + * NOTE: Callers should manually clear page dirty for true case, as we have + * extra handling for tree blocks. + */ +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + bool last = false; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap &= ~tmp; + if (subpage->dirty_bitmap == 0) + last = true; + spin_unlock_irqrestore(&subpage->lock, flags); + return last; +} + +void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + bool last; + + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + if (last) + clear_page_dirty_for_io(page); +} + +void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap |= tmp; + set_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap &= ~tmp; + if (subpage->writeback_bitmap == 0) + end_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, + PageDirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, + PageWriteback); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index b86a4881475d..bfd626e955be 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -20,6 +20,8 @@ struct btrfs_subpage { spinlock_t lock; u16 uptodate_bitmap; u16 error_bitmap; + u16 dirty_bitmap; + u16 writeback_bitmap; union { /* * Structures only used by metadata @@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); +DECLARE_BTRFS_SUBPAGE_OPS(dirty); +DECLARE_BTRFS_SUBPAGE_OPS(writeback); + +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7a4ad86adee..4a396c1147f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . } #endif +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + /* * We only mark the transaction aborted and then set the file system read-only. * This will prevent new transactions from starting or trying to join this diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6eb1c50fa98c..436ac7b4b334 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_rescue_options, supported_rescue_options_show); +static ssize_t supported_sectorsizes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + + /* Only sectorsize == PAGE_SIZE is now supported */ + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + + return ret; +} +BTRFS_ATTR(static_feature, supported_sectorsizes, + supported_sectorsizes_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), + BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), NULL }; @@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); +static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + ssize_t ret; + + ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold); + + return ret; +} + +static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh <= 50 || thresh > 100) + return -EINVAL; + + fs_info->bg_reclaim_threshold = thresh; + + return len; +} +BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, + btrfs_bg_reclaim_threshold_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), + BTRFS_ATTR_PTR(, bg_reclaim_threshold), NULL, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index acff6bb49a97..f75de9f6c0ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; if (!trans->chunk_bytes_reserved) return; @@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, trans->chunk_bytes_reserved, NULL); + atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); + cond_wake_up(&cur_trans->chunk_reserve_wait); trans->chunk_bytes_reserved = 0; } @@ -383,6 +386,8 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); INIT_LIST_HEAD(&cur_trans->releasing_ebs); spin_lock_init(&cur_trans->releasing_ebs_lock); + atomic64_set(&cur_trans->chunk_bytes_reserved, 0); + init_waitqueue_head(&cur_trans->chunk_reserve_wait); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int force) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { @@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * lock. smp_wmb() makes sure that all the writes above are * done before we pop in the zero below */ - btrfs_init_reloc_root(trans, root); + ret = btrfs_init_reloc_root(trans, root); smp_mb__before_atomic(); clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } - return 0; + return ret; } @@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; @@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&fs_info->reloc_mutex); - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); mutex_unlock(&fs_info->reloc_mutex); - return 0; + return ret; } static inline int is_transaction_blocked(struct btrfs_transaction *trans) @@ -741,7 +748,16 @@ got_it: * Thus it need to be called after current->journal_info initialized, * or we can deadlock. */ - btrfs_record_root_in_trans(h, root); + ret = btrfs_record_root_in_trans(h, root); + if (ret) { + /* + * The transaction handle is fully initialized and linked with + * other structures so it needs to be ended in case of errors, + * not just freed. + */ + btrfs_end_transaction(h); + return ERR_PTR(ret); + } return h; @@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); @@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * recorded root will never be updated again, causing an outdated root * item. */ - record_root_in_trans(trans, src, 1); + ret = record_root_in_trans(trans, src, 1); + if (ret) + return ret; /* * btrfs_qgroup_inherit relies on a consistent view of the usage for the @@ -1509,7 +1529,7 @@ out: * insert_dir_item() */ if (!ret) - record_root_in_trans(trans, parent, 1); + ret = record_root_in_trans(trans, parent, 1); return ret; } @@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root, 0); - + ret = record_root_in_trans(trans, parent_root, 0); + if (ret) + goto fail; cur_time = current_time(parent_inode); /* @@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) */ BUG_ON(list_empty(&cur_trans->list)); - list_del_init(&cur_trans->list); if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); @@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_lock(&fs_info->trans_lock); } + + /* + * Now that we know no one else is still using the transaction we can + * remove the transaction from the list of transactions. This avoids + * the transaction kthread from cleaning up the transaction while some + * other task is still using it, which could result in a use-after-free + * on things like log trees, as it forces the transaction kthread to + * wait for this transaction to be cleaned up by us. + */ + list_del_init(&cur_trans->list); + spin_unlock(&fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6335716e513f..364cfbb4c5c5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -96,6 +96,13 @@ struct btrfs_transaction { spinlock_t releasing_ebs_lock; struct list_head releasing_ebs; + + /* + * The number of bytes currently reserved, by all transaction handles + * attached to this transaction, for metadata extents of the chunk tree. + */ + atomic64_t chunk_bytes_reserved; + wait_queue_head_t chunk_reserve_wait; }; #define __TRANS_FREEZABLE (1U << 0) @@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = inode->root->log_transid; - inode->last_log_commit = inode->root->last_log_commit; + inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f4ade821307d..a8b2e0d2c025 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf, key->offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + extent_err(leaf, slot, + "invalid extent flag, data has full backref set"); + return -EUCLEAN; + } } ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f1acc9aea9e..f67721d82e5d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3165,24 +3165,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx, NULL); - - mutex_lock(&log_root_tree->log_mutex); - - index2 = log_root_tree->log_transid % 2; - list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); - root_log_ctx.log_transid = log_root_tree->log_transid; - if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); if (!log_root_tree->node) { ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { - mutex_unlock(&log_root_tree->log_mutex); + mutex_unlock(&fs_info->tree_log_mutex); goto out; } } + mutex_unlock(&fs_info->tree_root->log_mutex); } + btrfs_init_log_ctx(&root_log_ctx, NULL); + + mutex_lock(&log_root_tree->log_mutex); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -4136,7 +4138,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } -static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +static int extent_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct extent_map *em1, *em2; @@ -6278,8 +6281,13 @@ again: } wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); - ret = walk_log_tree(trans, log, &wc); + ret = btrfs_record_root_in_trans(trans, wc.replay_dest); + if (ret) + /* The loop needs to continue due to the root refs */ + btrfs_handle_fs_error(fs_info, ret, + "failed to record the log root in transaction"); + else + ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c new file mode 100644 index 000000000000..8a3a14686d3e --- /dev/null +++ b/fs/btrfs/tree-mod-log.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "tree-mod-log.h" +#include "disk-io.h" + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 logical; + u64 seq; + enum btrfs_mod_log_op op; + + /* + * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS + * operations. + */ + int slot; + + /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ + u64 generation; + + /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ + struct btrfs_disk_key key; + u64 blockptr; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; +}; + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + write_lock(&fs_info->tree_mod_log_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } + write_unlock(&fs_info->tree_mod_log_lock); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct tree_mod_elem *tm; + u64 min_seq = BTRFS_SEQ_LAST; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); + elem->seq = 0; + + if (list_empty(&fs_info->tree_mod_seq_list)) { + clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } else { + struct btrfs_seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we cannot + * remove anything from the log. + */ + write_unlock(&fs_info->tree_mod_log_lock); + return; + } + min_seq = first->seq; + } + + /* + * Anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = rb_entry(node, struct tree_mod_elem, node); + if (tm->seq >= min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Key order of the log: + * node/leaf start address -> sequence + * + * The 'start address' is the logical address of the *new* root node for root + * replace operations, or the logical address of the affected block for all + * other operations. + */ +static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, + struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = rb_entry(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->logical < tm->logical) + new = &((*new)->rb_left); + else if (cur->logical > tm->logical) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns true if it can. Otherwise, it + * returns false with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * write unlock fs_info::tree_mod_log_lock. + */ +static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return true; + if (eb && btrfs_header_level(eb) == 0) + return true; + + write_lock(&fs_info->tree_mod_log_lock); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + write_unlock(&fs_info->tree_mod_log_lock); + return true; + } + + return false; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return false; + if (eb && btrfs_header_level(eb) == 0) + return false; + + return true; +} + +static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, + int slot, + enum btrfs_mod_log_op op, + gfp_t flags) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; + + tm->logical = eb->start; + if (op != BTRFS_MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(eb->fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + kfree(tm); + + return ret; +} + +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + bool locked = false; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + locked = true; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + if (ret) + goto free_tms; + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal) +{ + struct btrfs_fs_info *fs_info = old_root->fs_info; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = new_root->start; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = tree_mod_log_insert(fs_info, tm); + + write_unlock(&fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq, + bool smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = rb_entry(node, struct tree_mod_elem, node); + if (cur->logical < start) { + node = node->rb_left; + } else if (cur->logical > start) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* We want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* We want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * This returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). Any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, true); +} + +/* + * This returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). Any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, false); +} + +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items) +{ + struct btrfs_fs_info *fs_info = dst->fs_info; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; + int i; + bool locked = false; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = true; + + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return ret; +} + +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + + ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; +} + +/* + * Returns the logical address of the oldest predecessor of the given root. + * Entries older than time_seq are ignored. + */ +static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, + u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + bool looped = false; + + if (!time_seq) + return NULL; + + /* + * The very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). This has the logical address + * of the *new* root, making it the very first operation that's logged + * for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * If there are no tree operation for the oldest root, we simply + * return it. This should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * If there's an operation that's not a root replacement, we + * found the oldest version of our root. Normally, we'll find a + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = true; + } + + /* If there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + + +/* + * tm is a pointer to the first operation to rewind within eb. Then, all + * previous operations will be rewound (until we reach something older than + * time_seq). + */ +static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + read_lock(&fs_info->tree_mod_log_lock); + while (tm && tm->seq >= time_seq) { + /* + * All the operations are recorded with the operator used for + * the modification. As we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + fallthrough; + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case BTRFS_MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case BTRFS_MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case BTRFS_MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case BTRFS_MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case BTRFS_MOD_LOG_ROOT_REPLACE: + /* + * This operation is special. For roots, this must be + * handled explicitly before rewinding. + * For non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. In the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. We simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = rb_entry(next, struct tree_mod_elem, node); + if (tm->logical != first_tm->logical) + break; + } + read_unlock(&fs_info->tree_mod_log_lock); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewound, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); + btrfs_tree_read_lock(eb_rewin); + tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb_rewin; +} + +/* + * Rewind the state of @root's root node to the given @time_seq value. + * If there are no changes, the current root->root_node is returned. If anything + * changed in between, there's a fresh buffer allocated on which the rewind + * operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + int level; + + eb_root = btrfs_read_lock_root_node(root); + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + level = old_root->level; + } else { + logical = eb_root->start; + level = btrfs_header_level(eb_root); + } + + tm = tree_mod_log_search(fs_info, logical, time_seq); + if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + old = read_tree_block(fs_info, logical, root->root_key.objectid, + 0, level, NULL); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); + btrfs_warn(fs_info, + "failed to read tree block %llu from get_old_root", + logical); + } else { + struct tree_mod_elem *tm2; + + btrfs_tree_read_lock(old); + eb = btrfs_clone_extent_buffer(old); + /* + * After the lookup for the most recent tree mod operation + * above and before we locked and cloned the extent buffer + * 'old', a new tree mod log operation may have been added. + * So lookup for a more recent one to make sure the number + * of mod log operations we replay is consistent with the + * number of items we have in the cloned extent buffer, + * otherwise we can hit a BUG_ON when rewinding the extent + * buffer. + */ + tm2 = tree_mod_log_search(fs_info, logical, time_seq); + btrfs_tree_read_unlock(old); + free_extent_buffer(old); + ASSERT(tm2); + ASSERT(tm2 == tm || tm2->seq > tm->seq); + if (!tm2 || tm2->seq < tm->seq) { + free_extent_buffer(eb); + return NULL; + } + tm = tm2; + } + } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); + } else { + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); + if (tm) + tree_mod_log_rewind(fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) + level = tm->old_root.level; + else + level = btrfs_header_level(eb_root); + + free_extent_buffer(eb_root); + + return level; +} + +/* + * Return the lowest sequence number in the tree modification log. + * + * Return the sequence number of the oldest tree modification log user, which + * corresponds to the lowest sequence number of all existing users. If there are + * no users it returns 0. + */ +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) +{ + u64 ret = 0; + + read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct btrfs_seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + ret = elem->seq; + } + read_unlock(&fs_info->tree_mod_log_lock); + + return ret; +} diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h new file mode 100644 index 000000000000..12605d19621b --- /dev/null +++ b/fs/btrfs/tree-mod-log.h @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef BTRFS_TREE_MOD_LOG_H +#define BTRFS_TREE_MOD_LOG_H + +#include "ctree.h" + +/* Represents a tree mod log user. */ +struct btrfs_seq_list { + struct list_head list; + u64 seq; +}; + +#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define BTRFS_SEQ_LAST ((u64)-1) + +enum btrfs_mod_log_op { + BTRFS_MOD_LOG_KEY_REPLACE, + BTRFS_MOD_LOG_KEY_ADD, + BTRFS_MOD_LOG_KEY_REMOVE, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, + BTRFS_MOD_LOG_MOVE_KEYS, + BTRFS_MOD_LOG_ROOT_REPLACE, +}; + +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal); +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags); +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq); +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items); +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items); +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bc3b33efddc5..9a1ead0c4a31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return 0; } -static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +static int devid_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_device *dev1, *dev2; @@ -1458,8 +1459,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; - *hole_size = 0; - return 1; + *hole_size = false; + return true; } *hole_start += zone_size; @@ -3098,11 +3099,12 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_block_group *block_group; + u64 length; int ret; /* @@ -3117,7 +3119,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ - lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); + lockdep_assert_held(&fs_info->reclaim_bgs_lock); /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); @@ -3130,8 +3132,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (!block_group) return -ENOENT; btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + length = block_group->length; btrfs_put_block_group(block_group); + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); + if (ret) + btrfs_info(fs_info, + "failed to reset zone %llu after relocation", + chunk_offset); + } + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3172,10 +3189,10 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } BUG_ON(ret == 0); /* Corruption */ @@ -3183,7 +3200,7 @@ again: ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto error; if (ret > 0) @@ -3204,7 +3221,7 @@ again: else BUG_ON(ret); } - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (found_key.offset == 0) break; @@ -3744,10 +3761,10 @@ again: goto error; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } @@ -3761,7 +3778,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); ret = 0; break; } @@ -3771,7 +3788,7 @@ again: btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); break; } @@ -3788,12 +3805,12 @@ again: btrfs_release_path(path); if (!ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (counting) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3818,7 +3835,7 @@ again: count_meta < bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } @@ -3832,7 +3849,7 @@ again: ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } else if (ret == 1) { chunk_reserved = 1; @@ -3840,7 +3857,7 @@ again: } ret = btrfs_relocate_chunk(fs_info, found_key.offset); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; } else if (ret == -ETXTBSY) { @@ -4725,16 +4742,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_previous_item(root, path, 0, key.type); if (ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto done; ret = 0; @@ -4747,7 +4764,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } @@ -4756,7 +4773,7 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } @@ -4772,12 +4789,12 @@ again: */ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_relocate_chunk(fs_info, chunk_offset); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; } else if (ret) { @@ -4989,6 +5006,8 @@ static void init_alloc_chunk_ctl_policy_zoned( ctl->max_chunk_size = 2 * ctl->max_stripe_size; ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + } else { + BUG(); } /* We don't want a chunk larger than 10% of writable space */ @@ -6787,6 +6806,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) return div_u64(chunk_len, data_stripes); } +#if BITS_PER_LONG == 32 +/* + * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE + * can't be accessed on 32bit systems. + * + * This function do mount time check to reject the fs if it already has + * metadata chunk beyond that limit. + */ +static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return 0; + + if (logical + length < MAX_LFS_FILESIZE) + return 0; + + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; +} + +/* + * This is to give early warning for any metadata chunk reaching + * BTRFS_32BIT_EARLY_WARN_THRESHOLD. + * Although we can still access the metadata, it's not going to be possible + * once the limit is reached. + */ +static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return; + + if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) + return; + + btrfs_warn_32bit_limit(fs_info); +} +#endif + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -6797,6 +6856,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 logical; u64 length; u64 devid; + u64 type; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; int ret; @@ -6804,8 +6864,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +#if BITS_PER_LONG == 32 + ret = check_32bit_meta_chunk(fs_info, logical, length, type); + if (ret < 0) + return ret; + warn_32bit_meta_chunk(fs_info, logical, length, type); +#endif + /* * Only need to verify chunk item if we're reading from sys chunk array, * as chunk item in tree block is already verified by tree-checker. @@ -6849,10 +6917,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - map->type = btrfs_chunk_type(leaf, chunk); + map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(map->type, em->len, + em->orig_block_len = calc_stripe_length(type, em->len, map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = @@ -7448,6 +7516,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, int item_size; int i, ret, slot; + if (!device->fs_info->dev_root) + return 0; + key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; @@ -7998,7 +8069,7 @@ static int relocating_repair_kthread(void *data) return -EBUSY; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); /* Ensure block group still exists */ cache = btrfs_lookup_block_group(fs_info, target); @@ -8020,7 +8091,7 @@ static int relocating_repair_kthread(void *data) out: if (cache) btrfs_put_block_group(cache); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d4c3e0dd32b8..9c0d84e5ec06 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f972b75a9ab..70b23a0d03b1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,30 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. + * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* + * Maximum supported zone size. Currently, SMR disks have a zone size of + * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not + * expect the zone size to become larger than 8GiB in the near future. + */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, } /* - * The following zones are reserved as the circular buffer on ZONED btrfs. - * - The primary superblock: zones 0 and 1 - * - The first copy: zones 16 and 17 - * - The second copy: zones 1024 or zone at 256GB which is minimum, and - * the following one + * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + u64 zone; + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { - case 0: return 0; - case 1: return 16; - case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - return 0; + ASSERT(zone <= U32_MAX); + + return (u32)zone; } /* @@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) zone_sectors = bdev_zone_sectors(bdev); } - nr_sectors = bdev_nr_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->max_zone_append_size = (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; @@ -311,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) { + btrfs_err(fs_info, "zoned: device %pg does not support zone append", + bdev); + ret = -EINVAL; + goto out; + } + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 61e969652fe1..5e41a74a9cb2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -9,6 +9,12 @@ #include "disk-io.h" #include "block-group.h" +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_RECLAIM_THRESH 75 + struct btrfs_zoned_device_info { /* * Number of zones, zone size and types of zones if bdev is a diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..c2e052c0fc5d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) pgoff_t index; int sizebits; - sizebits = -1; - do { - sizebits++; - } while ((size << sizebits) < PAGE_SIZE); - + sizebits = PAGE_SHIFT - __ffs(size); index = block >> sizebits; /* diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 891dedda5905..2227dc2d5498 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -7,6 +7,7 @@ cachefiles-y := \ bind.o \ daemon.o \ interface.o \ + io.o \ key.o \ main.o \ namei.o \ diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index dfb14dbddf51..38bb7764b454 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) cache->mnt = path.mnt; root = path.dentry; + ret = -EINVAL; + if (mnt_user_ns(path.mnt) != &init_user_ns) { + pr_warn("File cache on idmapped mounts not supported"); + goto error_unsupported; + } + /* check parameters */ ret = -EOPNOTSUPP; if (d_is_negative(root) || diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 5efa6a3702c0..da3948fdb615 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* * dispose of a reference to an object */ -static void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object; struct fscache_cache *cache; @@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, .check_consistency = cachefiles_check_consistency, + .begin_read_operation = cachefiles_begin_read_operation, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cf9bd6401c2d..4ed83aa5253b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, */ extern const struct fscache_cache_ops cachefiles_cache_ops; +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why); + /* * key.c */ @@ -218,6 +221,12 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); /* + * rdwr2.c + */ +extern int cachefiles_begin_read_operation(struct netfs_read_request *, + struct fscache_retrieval *); + +/* * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c new file mode 100644 index 000000000000..b13fb45fc3f3 --- /dev/null +++ b/fs/cachefiles/io.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* kiocb-using read/write + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/netfs.h> +#include "internal.h" + +struct cachefiles_kiocb { + struct kiocb iocb; + refcount_t ki_refcnt; + loff_t start; + union { + size_t skipped; + size_t len; + }; + netfs_io_terminated_t term_func; + void *term_func_priv; + bool was_async; +}; + +static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) +{ + if (refcount_dec_and_test(&ki->ki_refcnt)) { + fput(ki->iocb.ki_filp); + kfree(ki); + } +} + +/* + * Handle completion of a read from the cache. + */ +static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + + _enter("%ld,%ld", ret, ret2); + + if (ki->term_func) { + if (ret >= 0) + ret += ki->skipped; + ki->term_func(ki->term_func_priv, ret, ki->was_async); + } + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a read from the cache. + */ +static int cachefiles_read(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + bool seek_data, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter), skipped = 0; + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + /* If the caller asked us to seek for data before doing the read, then + * we should do that now. If we find a gap, we fill it with zeros. + */ + if (seek_data) { + loff_t off = start_pos, off2; + + off2 = vfs_llseek(file, off, SEEK_DATA); + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { + skipped = 0; + ret = off2; + goto presubmission_error; + } + + if (off2 == -ENXIO || off2 >= start_pos + len) { + /* The region is beyond the EOF or there's no more data + * in the region, so clear the rest of the buffer and + * return success. + */ + iov_iter_zero(len, iter); + skipped = len; + ret = 0; + goto presubmission_error; + } + + skipped = off2 - off; + iov_iter_zero(skipped, iter); + } + + ret = -ENOBUFS; + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos + skipped; + ki->iocb.ki_flags = IOCB_DIRECT; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->skipped = skipped; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_read_complete; + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_read_complete(&ki->iocb, ret, 0); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, ret < 0 ? ret : skipped, false); + return ret; +} + +/* + * Handle completion of a write to the cache. + */ +static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); + + _enter("%ld,%ld", ret, ret2); + + /* Tell lockdep we inherited freeze protection from submission thread */ + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + + if (ki->term_func) + ki->term_func(ki->term_func_priv, ret, ki->was_async); + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a write to the cache. + */ +static int cachefiles_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct inode *inode; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->start = start_pos; + ki->len = len; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_write_complete; + + /* Open-code file_start_write here to grab freeze protection, which + * will be released by another thread in aio_complete_rw(). Fool + * lockdep by telling it the lock got released so that it doesn't + * complain about the held lock when we return to userspace. + */ + inode = file_inode(file); + __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_write_complete(&ki->iocb, ret, 0); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, -ENOMEM, false); + return -ENOMEM; +} + +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, + loff_t i_size) +{ + struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct file *file = subreq->rreq->cache_resources.cache_priv2; + loff_t off, to; + + _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (!file) + goto cache_fail_nosec; + + if (subreq->start >= i_size) + return NETFS_FILL_WITH_ZEROES; + + cachefiles_begin_secure(cache, &saved_cred); + + off = vfs_llseek(file, subreq->start, SEEK_DATA); + if (off < 0 && off >= (loff_t)-MAX_ERRNO) { + if (off == (loff_t)-ENXIO) + goto download_and_store; + goto cache_fail; + } + + if (off >= subreq->start + subreq->len) + goto download_and_store; + + if (off > subreq->start) { + off = round_up(off, cache->bsize); + subreq->len = off - subreq->start; + goto download_and_store; + } + + to = vfs_llseek(file, subreq->start, SEEK_HOLE); + if (to < 0 && to >= (loff_t)-MAX_ERRNO) + goto cache_fail; + + if (to < subreq->start + subreq->len) { + if (subreq->start + subreq->len >= i_size) + to = round_up(to, cache->bsize); + else + to = round_down(to, cache->bsize); + subreq->len = to - subreq->start; + } + + cachefiles_end_secure(cache, saved_cred); + return NETFS_READ_FROM_CACHE; + +download_and_store: + if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0) + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); +cache_fail: + cachefiles_end_secure(cache, saved_cred); +cache_fail_nosec: + return NETFS_DOWNLOAD_FROM_SERVER; +} + +/* + * Prepare for a write to occur. + */ +static int cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size) +{ + loff_t start = *_start; + size_t len = *_len, down; + + /* Round to DIO size */ + down = start - round_down(start, PAGE_SIZE); + *_start = start - down; + *_len = round_up(down + len, PAGE_SIZE); + return 0; +} + +/* + * Clean up an operation. + */ +static void cachefiles_end_operation(struct netfs_cache_resources *cres) +{ + struct fscache_retrieval *op = cres->cache_priv; + struct file *file = cres->cache_priv2; + + _enter(""); + + if (file) + fput(file); + if (op) { + fscache_op_complete(&op->op, false); + fscache_put_retrieval(op); + } + + _leave(""); +} + +static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { + .end_operation = cachefiles_end_operation, + .read = cachefiles_read, + .write = cachefiles_write, + .prepare_read = cachefiles_prepare_read, + .prepare_write = cachefiles_prepare_write, +}; + +/* + * Open the cache file when beginning a cache operation. + */ +int cachefiles_begin_read_operation(struct netfs_read_request *rreq, + struct fscache_retrieval *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct path path; + struct file *file; + + _enter(""); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + path.mnt = cache->mnt; + path.dentry = object->backer; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_inode(object->backer), cache->cache_cred); + if (IS_ERR(file)) + return PTR_ERR(file); + if (!S_ISREG(file_inode(file)->i_mode)) + goto error_file; + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + pr_notice("Cache does not support read_iter and write_iter\n"); + goto error_file; + } + + fscache_get_retrieval(op); + rreq->cache_resources.cache_priv = op; + rreq->cache_resources.cache_priv2 = file; + rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; + rreq->cookie_debug_id = object->fscache.debug_id; + _leave(""); + return 0; + +error_file: + fput(file); + return -EIO; +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index e027c718ca01..8ffc40e84a59 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, container_of(wait, struct cachefiles_one_read, monitor); struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; - struct wait_bit_key *key = _key; + struct wait_page_key *key = _key; struct page *page = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->flags, key->bit_nr); + key->page, key->bit_nr); - if (key->flags != &page->flags || - key->bit_nr != PG_locked) + if (key->page != page || key->bit_nr != PG_locked) return 0; _debug("--- monitor %p %lx ---", page, page->flags); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 570731c4d019..3c03fa37cac4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3358,7 +3358,13 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(grant->mode); + umode_t mode = le32_to_cpu(grant->mode); + + if (inode_wrong_type(inode, mode)) + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + else + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83d9358854fb..f7a790ed62c4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -677,6 +677,8 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); + if (IS_ERR(inode)) + return PTR_ERR(inode); dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", dentry, dentry, inode); BUG_ON(!d_unhashed(dentry)); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e088843a7734..f22156ee7306 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -248,9 +248,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, ihold(inode); } else { /* mds does not support lookup snapped inode */ - err = -EOPNOTSUPP; - inode = NULL; + inode = ERR_PTR(-EOPNOTSUPP); } + } else { + inode = ERR_PTR(-ESTALE); } ceph_mdsc_put_request(req); @@ -261,8 +262,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (!inode) - return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 156f849f5385..689e3ffd29d7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,9 +78,21 @@ struct inode *ceph_get_snapdir(struct inode *parent) struct inode *inode = ceph_get_inode(parent->i_sb, vino); struct ceph_inode_info *ci = ceph_inode(inode); - BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) return inode; + + if (!S_ISDIR(parent->i_mode)) { + pr_warn_once("bad snapdir parent type (mode=0%o)\n", + parent->i_mode); + return ERR_PTR(-ENOTDIR); + } + + if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + pr_warn_once("bad snapdir inode type (mode=0%o)\n", + inode->i_mode); + return ERR_PTR(-ENOTDIR); + } + inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; @@ -757,11 +769,32 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, bool queue_trunc = false; bool new_version = false; bool fill_inline = false; + umode_t mode = le32_to_cpu(info->mode); + dev_t rdev = le32_to_cpu(info->rdev); dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* Once I_NEW is cleared, we can't change type or dev numbers */ + if (inode->i_state & I_NEW) { + inode->i_mode = mode; + } else { + if (inode_wrong_type(inode, mode)) { + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + return -ESTALE; + } + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { + pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); + return -ESTALE; + } + } + info_caps = le32_to_cpu(info->cap.caps); /* prealloc new cap struct */ @@ -815,8 +848,6 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* update inode */ - inode->i_rdev = le32_to_cpu(info->rdev); /* directories have fl_stripe_unit set to zero */ if (le32_to_cpu(info->layout.fl_stripe_unit)) inode->i_blkbits = @@ -828,7 +859,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(info->mode); + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, @@ -926,7 +957,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, case S_IFCHR: case S_IFSOCK: inode->i_blkbits = PAGE_SHIFT; - init_special_inode(inode, inode->i_mode, inode->i_rdev); + init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ceph_file_iops; break; case S_IFREG: diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index fe03cbdae959..bf52e9326ebe 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,6 +18,7 @@ config CIFS select CRYPTO_AES select CRYPTO_LIB_DES select KEYS + select DNS_RESOLVER help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" depends on CIFS - select DNS_RESOLVER help Enables an upcall mechanism for CIFS which accesses userspace helper utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets @@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS - select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 5213b20843b5..3ee3b7de4ded 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ + dns_resolve.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 88a7958170ee..68e8e5b27841 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -17,15 +17,14 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "fs_context.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" #endif -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif void cifs_dump_mem(char *label, void *data, int length) @@ -118,10 +117,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) tcon->ses->server->ops->dump_share_caps(m, tcon); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(m, " Witness"); -#endif if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); @@ -490,10 +487,8 @@ skip_rdma: spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); - -#ifdef CONFIG_CIFS_SWN_UPCALL cifs_swn_dump(m); -#endif + /* BB add code to dump additional info such as TCP session info now */ return 0; } @@ -702,6 +697,7 @@ static const struct proc_ops cifs_lookup_cache_proc_ops; static const struct proc_ops traceSMB_proc_ops; static const struct proc_ops cifs_security_flags_proc_ops; static const struct proc_ops cifs_linux_ext_proc_ops; +static const struct proc_ops cifs_mount_params_proc_ops; void cifs_proc_init(void) @@ -726,6 +722,8 @@ cifs_proc_init(void) proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_ops); + proc_create("mount_params", 0444, proc_fs_cifs, &cifs_mount_params_proc_ops); + #ifdef CONFIG_CIFS_DFS_UPCALL proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif @@ -764,6 +762,7 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("mount_params", proc_fs_cifs); #ifdef CONFIG_CIFS_DFS_UPCALL remove_proc_entry("dfscache", proc_fs_cifs); @@ -1023,6 +1022,51 @@ static const struct proc_ops cifs_security_flags_proc_ops = { .proc_release = single_release, .proc_write = cifs_security_flags_proc_write, }; + +/* To make it easier to debug, can help to show mount params */ +static int cifs_mount_params_proc_show(struct seq_file *m, void *v) +{ + const struct fs_parameter_spec *p; + const char *type; + + for (p = smb3_fs_parameters; p->name; p++) { + /* cannot use switch with pointers... */ + if (!p->type) { + if (p->flags == fs_param_neg_with_no) + type = "noflag"; + else + type = "flag"; + } else if (p->type == fs_param_is_bool) + type = "bool"; + else if (p->type == fs_param_is_u32) + type = "u32"; + else if (p->type == fs_param_is_u64) + type = "u64"; + else if (p->type == fs_param_is_string) + type = "string"; + else + type = "unknown"; + + seq_printf(m, "%s:%s\n", p->name, type); + } + + return 0; +} + +static int cifs_mount_params_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_mount_params_proc_show, NULL); +} + +static const struct proc_ops cifs_mount_params_proc_ops = { + .proc_open = cifs_mount_params_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + /* No need for write for now */ + /* .proc_write = cifs_mount_params_proc_write, */ +}; + #else inline void cifs_proc_init(void) { diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b1ce4efb591..c87c37cf2914 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -270,7 +270,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, char *mountdata; char *devname; - devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + devname = kstrdup(fullpath, GFP_KERNEL); if (!devname) return ERR_PTR(-ENOMEM); @@ -302,6 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; struct cifs_tcon *tcon; + void *page; char *full_path, *root_path; unsigned int xid; int rc; @@ -324,10 +325,13 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) goto cdda_exit; } + page = alloc_dentry_path(); /* always use tree name prefix */ - full_path = build_path_from_dentry_optional_prefix(mntpt, true); - if (full_path == NULL) - goto cdda_exit; + full_path = build_path_from_dentry_optional_prefix(mntpt, page, true); + if (IS_ERR(full_path)) { + mnt = ERR_CAST(full_path); + goto free_full_path; + } convert_delimiter(full_path, '\\'); @@ -385,7 +389,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) free_root_path: kfree(root_path); free_full_path: - kfree(full_path); + free_dentry_path(page); cdda_exit: cifs_dbg(FYI, "leaving %s\n" , __func__); return mnt; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index aa77edc12212..2a5325a7ae49 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -81,5 +81,9 @@ struct cifs_sb_info { * (cifs_autodisable_serverino) in order to match new mounts. */ bool mnt_cifs_serverino_autodisabled; + /* + * Available once the mount has completed. + */ + struct dentry *root; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index f2d730fffccb..d829b8bf833e 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -248,7 +248,7 @@ nlmsg_fail: /* * Try to find a matching registration for the tcon's server name and share name. - * Calls to this funciton must be protected by cifs_swnreg_idr_mutex. + * Calls to this function must be protected by cifs_swnreg_idr_mutex. * TODO Try to avoid memory allocations */ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) diff --git a/fs/cifs/cifs_swn.h b/fs/cifs/cifs_swn.h index 236ecd4959d5..8a9d2a5c9077 100644 --- a/fs/cifs/cifs_swn.h +++ b/fs/cifs/cifs_swn.h @@ -7,11 +7,13 @@ #ifndef _CIFS_SWN_H #define _CIFS_SWN_H +#include "cifsglob.h" struct cifs_tcon; struct sk_buff; struct genl_info; +#ifdef CONFIG_CIFS_SWN_UPCALL extern int cifs_swn_register(struct cifs_tcon *tcon); extern int cifs_swn_unregister(struct cifs_tcon *tcon); @@ -22,4 +24,29 @@ extern void cifs_swn_dump(struct seq_file *m); extern void cifs_swn_check(void); +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) +{ + if (server->use_swn_dstaddr) { + server->dstaddr = server->swn_dstaddr; + return true; + } + return false; +} + +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) +{ + server->use_swn_dstaddr = false; +} + +#else + +static inline int cifs_swn_register(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_unregister(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_notify(struct sk_buff *s, struct genl_info *i) { return 0; } +static inline void cifs_swn_dump(struct seq_file *m) {} +static inline void cifs_swn_check(void) {} +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) { return false; } +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) {} + +#endif /* CONFIG_CIFS_SWN_UPCALL */ #endif /* _CIFS_SWN_H */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 9d29eb9660c2..784407f9280f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1094,11 +1094,9 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, struct cifs_ace *pnntace = NULL; char *nacl_base = NULL; u32 num_aces = 0; - __u64 nmode; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ - nmode = *pnmode; nacl_base = (char *)pndacl; nsize = sizeof(struct cifs_acl); @@ -1118,7 +1116,6 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { pntace = (struct cifs_ace *) (acl_base + size); - pnntace = (struct cifs_ace *) (nacl_base + nsize); if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { /* Place the new ACEs in between existing explicit and inherited */ @@ -1131,14 +1128,17 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } /* If it's any one of the ACE we're replacing, skip! */ - if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || (compare_sids(&pntace->sid, pownersid) == 0) || (compare_sids(&pntace->sid, pgrpsid) == 0) || (compare_sids(&pntace->sid, &sid_everyone) == 0) || - (compare_sids(&pntace->sid, &sid_authusers) == 0)) { + (compare_sids(&pntace->sid, &sid_authusers) == 0))) { goto next_ace; } + /* update the pointer to the next ACE to populate*/ + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += cifs_copy_ace(pnntace, pntace, NULL); num_aces++; @@ -1649,7 +1649,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. Allocate * memory for the smb header, set security descriptor request security - * descriptor parameters, and secuirty descriptor itself + * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 099ad9f3660b..5f2c139143a7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -217,8 +217,11 @@ cifs_read_super(struct super_block *sb) rc = super_setup_bdi(sb); if (rc) goto out_no_root; - /* tune readahead according to rsize */ - sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; + /* tune readahead according to rsize if readahead size not set on mount */ + if (cifs_sb->ctx->rasize) + sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; + else + sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ @@ -257,6 +260,29 @@ out_no_root: static void cifs_kill_sb(struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + struct cached_fid *cfid; + + /* + * We ned to release all dentries for the cached directories + * before we kill the sb. + */ + if (cifs_sb->root) { + dput(cifs_sb->root); + cifs_sb->root = NULL; + } + tcon = cifs_sb_master_tcon(cifs_sb); + if (tcon) { + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + kill_anon_super(sb); cifs_umount(cifs_sb); } @@ -476,7 +502,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); else { convert_delimiter(devname, '/'); - seq_puts(m, devname); + /* escape all spaces in share names */ + seq_escape(m, devname, " \t"); kfree(devname); } return 0; @@ -625,6 +652,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize); seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize); seq_printf(s, ",bsize=%u", cifs_sb->ctx->bsize); + if (cifs_sb->ctx->rasize) + seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); seq_printf(s, ",echo_interval=%lu", @@ -655,10 +684,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",multichannel,max_channels=%zu", tcon->ses->chan_max); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(s, ",witness"); -#endif return 0; } @@ -833,6 +860,12 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } + /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */ + kfree(cifs_sb->ctx->UNC); + cifs_sb->ctx->UNC = NULL; + kfree(cifs_sb->ctx->prepath); + cifs_sb->ctx->prepath = NULL; + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); if (rc) { root = ERR_PTR(rc); @@ -887,6 +920,9 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, if (IS_ERR(root)) goto out_super; + if (cifs_sb) + cifs_sb->root = dget(root); + cifs_dbg(FYI, "dentry root is: %p\n", root); return root; @@ -1527,10 +1563,6 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ - INIT_LIST_HEAD(&GlobalDnotifyReqList); - INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ /* * Initialize Global counters */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0d7ef150dbb2..6beddb108ba0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.31" +#define CIFS_VERSION "2.32" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 31fc8695abd6..b23a0ee8c6f8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -495,7 +495,7 @@ struct smb_version_operations { struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, + const char *full_path, umode_t mode, dev_t device_number); /* version specific fiemap implementation */ @@ -919,8 +919,8 @@ struct cifs_ses { bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; - __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; - __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; @@ -988,10 +988,12 @@ struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; bool has_lease:1; + unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; + struct dentry *dentry; struct work_struct lease_break; struct smb2_file_all_info file_all_info; }; @@ -1070,6 +1072,7 @@ struct cifs_tcon { bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ bool no_lease:1; /* Do not request leases on files or directories */ + bool use_witness:1; /* use witness protocol */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1094,9 +1097,6 @@ struct cifs_tcon { int remap:2; struct list_head ulist; /* cache update list */ #endif -#ifdef CONFIG_CIFS_SWN_UPCALL - bool use_witness:1; /* use witness protocol */ -#endif }; /* @@ -1283,8 +1283,6 @@ struct cifs_aio_ctx { bool direct_io; }; -struct cifs_readdata; - /* asynchronous read support */ struct cifs_readdata { struct kref refcount; @@ -1318,8 +1316,6 @@ struct cifs_readdata { struct page **pages; }; -struct cifs_writedata; - /* asynchronous write support */ struct cifs_writedata { struct kref refcount; @@ -1798,9 +1794,8 @@ require use of the stronger protocol */ * * Semaphores * ---------- - * sesSem operations on smb session - * tconSem operations on tree connection - * fh_sem file handle reconnection operations + * cifsInodeInfo->lock_sem protects: + * the list of locks held by the inode * ****************************************************************************/ @@ -1831,13 +1826,6 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -/* Outstanding dir notify requests */ -GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; -/* DirNotify response queue */ -GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ - /* * Global transaction id (XID) information */ @@ -1881,19 +1869,9 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ -GLOBAL_EXTERN struct rb_root uidtree; -GLOBAL_EXTERN struct rb_root gidtree; -GLOBAL_EXTERN spinlock_t siduidlock; -GLOBAL_EXTERN spinlock_t sidgidlock; -GLOBAL_EXTERN struct rb_root siduidtree; -GLOBAL_EXTERN struct rb_root sidgidtree; -GLOBAL_EXTERN spinlock_t uidsidlock; -GLOBAL_EXTERN spinlock_t gidsidlock; - void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); -extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 64fe5a47b5e8..b53a87db282f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -147,6 +147,11 @@ */ #define SMB3_SIGN_KEY_SIZE (16) +/* + * Size of the smb3 encryption/decryption keys + */ +#define SMB3_ENC_DEC_KEY_SIZE (32) + #define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) @@ -1898,7 +1903,7 @@ typedef struct smb_com_transaction2_fnext_req { __le16 InformationLevel; __u32 ResumeKey; __le16 SearchFlags; - char ResumeFileName[1]; + char ResumeFileName[]; } __attribute__((packed)) TRANSACTION2_FNEXT_REQ; typedef struct smb_com_transaction2_fnext_rsp { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 75ce6f742b8d..a79d50001fbf 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,9 +69,20 @@ extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); -extern char *build_path_from_dentry(struct dentry *); +extern const char *build_path_from_dentry(struct dentry *, void *); extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, - bool prefix); + void *page, bool prefix); +static inline void *alloc_dentry_path(void) +{ + return __getname(); +} + +static inline void free_dentry_path(void *page) +{ + if (page) + __putname(page); +} + extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, @@ -184,7 +195,7 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock); -extern int cifs_posix_open(char *full_path, struct inode **inode, +extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, unsigned int xid); @@ -194,7 +205,7 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb); extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *, struct cifs_sb_info *); -extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); @@ -207,7 +218,7 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path, __u32 dosattr); + unsigned int xid, const char *full_path, __u32 dosattr); extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); @@ -358,11 +369,6 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, bool delete_file, __u16 fid, __u32 pid_of_opener); -#if 0 -extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, - char *fileName, __u16 dos_attributes, - const struct nls_table *nls_codepage); -#endif /* possibly unneeded function */ extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb, bool set_allocation); @@ -504,12 +510,6 @@ extern int generate_smb311signingkey(struct cifs_ses *); extern int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); #endif /* CIFS_WEAK_PW_HASH */ -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, - const int notify_subdirs, const __u16 netfid, - __u32 filter, struct file *file, int multishot, - const struct nls_table *nls_codepage); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, const char *fromName, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c279527aae92..41f74163cc1c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -114,7 +114,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_shroot_lease_locked(&tcon->crfid); + close_cached_dir_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); @@ -5917,56 +5917,6 @@ SetTimesRetry: return rc; } -/* Can not be used to set time stamps yet (due to old DOS time format) */ -/* Can be used to set attributes */ -#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug - handling it anyway and NT4 was what we thought it would be needed for - Do not delete it until we prove whether needed for Win9x though */ -int -CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, - __u16 dos_attrs, const struct nls_table *nls_codepage) -{ - SETATTR_REQ *pSMB = NULL; - SETATTR_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - - cifs_dbg(FYI, "In SetAttrLegacy\n"); - -SetAttrLgcyRetry: - rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - ConvertToUTF16((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage); - name_len++; /* trailing null */ - name_len *= 2; - } else { - name_len = copy_path_name(pSMB->fileName, fileName); - } - pSMB->attr = cpu_to_le16(dos_attrs); - pSMB->BufferFormat = 0x04; - inc_rfc1001_len(pSMB, name_len + 1); - pSMB->ByteCount = cpu_to_le16(name_len + 1); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) - cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); - - cifs_buf_release(pSMB); - - if (rc == -EAGAIN) - goto SetAttrLgcyRetry; - - return rc; -} -#endif /* temporarily unneeded SetAttr legacy function */ - static void cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, const struct cifs_unix_set_info_args *args) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eec8a2052da2..121d8b4535b0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -62,9 +62,7 @@ #include "dfs_cache.h" #endif #include "fs_context.h" -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -87,7 +85,6 @@ static void cifs_prune_tlinks(struct work_struct *work); * * This should be called with server->srv_mutex held. */ -#ifdef CONFIG_CIFS_DFS_UPCALL static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) { int rc; @@ -124,6 +121,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return !rc ? -1 : 0; } +#ifdef CONFIG_CIFS_DFS_UPCALL /* These functions must be called with server->srv_mutex held */ static void reconn_set_next_dfs_target(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, @@ -314,25 +312,34 @@ cifs_reconnect(struct TCP_Server_Info *server) mutex_lock(&server->srv_mutex); -#ifdef CONFIG_CIFS_SWN_UPCALL - if (server->use_swn_dstaddr) { - server->dstaddr = server->swn_dstaddr; - } else { -#endif + if (!cifs_swn_set_server_dstaddr(server)) { #ifdef CONFIG_CIFS_DFS_UPCALL + if (cifs_sb && cifs_sb->origin_fullpath) /* * Set up next DFS target server (if any) for reconnect. If DFS * feature is disabled, then we will retry last server we * connected to before. */ reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); + else { #endif + /* + * Resolve the hostname again to make sure that IP address is up-to-date. + */ + rc = reconn_set_ipaddr_from_hostname(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } -#ifdef CONFIG_CIFS_SWN_UPCALL +#ifdef CONFIG_CIFS_DFS_UPCALL } #endif + + } + if (cifs_rdma_enabled(server)) rc = smbd_reconnect(server); else @@ -348,9 +355,7 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_SWN_UPCALL - server->use_swn_dstaddr = false; -#endif + cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); } } while (server->tcpStatus == CifsNeedReconnect); @@ -415,10 +420,8 @@ cifs_echo_request(struct work_struct *work) cifs_dbg(FYI, "Unable to send echo request to server: %s\n", server->hostname); -#ifdef CONFIG_CIFS_SWN_UPCALL /* Check witness registrations */ cifs_swn_check(); -#endif requeue_echo: queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval); @@ -1775,9 +1778,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) * for the request. */ if (is_domain && ses->domainName) { - ctx->domainname = kstrndup(ses->domainName, - strlen(ses->domainName), - GFP_KERNEL); + ctx->domainname = kstrdup(ses->domainName, GFP_KERNEL); if (!ctx->domainname) { cifs_dbg(FYI, "Unable to allocate %zd bytes for domain\n", len); @@ -1994,7 +1995,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) return; } -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) { int rc; @@ -2004,7 +2004,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) __func__, rc); } } -#endif list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2166,9 +2165,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) } tcon->use_resilient = true; } -#ifdef CONFIG_CIFS_SWN_UPCALL + tcon->use_witness = false; - if (ctx->witness) { + if (IS_ENABLED(CONFIG_CIFS_SWN_UPCALL) && ctx->witness) { if (ses->server->vals->protocol_id >= SMB30_PROT_ID) { if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) { /* @@ -2194,7 +2193,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } } -#endif /* If the user really knows what they are doing they can override */ if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) { @@ -3411,8 +3409,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } /* Save mount options */ - mntdata = kstrndup(cifs_sb->ctx->mount_options, - strlen(cifs_sb->ctx->mount_options), GFP_KERNEL); + mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL); if (!mntdata) { rc = -ENOMEM; goto error; @@ -3485,7 +3482,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * links, the prefix path is included in both and may be changed during reconnect. See * cifs_tree_connect(). */ - cifs_sb->origin_fullpath = kstrndup(full_path, strlen(full_path), GFP_KERNEL); + cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL); if (!cifs_sb->origin_fullpath) { rc = -ENOMEM; goto error; @@ -3862,9 +3859,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->sectype = master_tcon->ses->sectype; ctx->sign = master_tcon->ses->sign; ctx->seal = master_tcon->seal; -#ifdef CONFIG_CIFS_SWN_UPCALL ctx->witness = master_tcon->use_witness; -#endif rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 098b4bc8da59..b1fa30fefe1f 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -81,23 +81,24 @@ static void refresh_cache_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static int get_normalized_path(const char *path, char **npath) +static int get_normalized_path(const char *path, const char **npath) { if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) return -EINVAL; if (*path == '\\') { - *npath = (char *)path; + *npath = path; } else { - *npath = kstrndup(path, strlen(path), GFP_KERNEL); - if (!*npath) + char *s = kstrdup(path, GFP_KERNEL); + if (!s) return -ENOMEM; - convert_delimiter(*npath, '\\'); + convert_delimiter(s, '\\'); + *npath = s; } return 0; } -static inline void free_normalized_path(const char *path, char *npath) +static inline void free_normalized_path(const char *path, const char *npath) { if (path != npath) kfree(npath); @@ -358,7 +359,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed) t = kmalloc(sizeof(*t), GFP_ATOMIC); if (!t) return ERR_PTR(-ENOMEM); - t->name = kstrndup(name, strlen(name), GFP_ATOMIC); + t->name = kstrdup(name, GFP_ATOMIC); if (!t->name) { kfree(t); return ERR_PTR(-ENOMEM); @@ -419,7 +420,7 @@ static struct cache_entry *alloc_cache_entry(const char *path, if (!ce) return ERR_PTR(-ENOMEM); - ce->path = kstrndup(path, strlen(path), GFP_KERNEL); + ce->path = kstrdup(path, GFP_KERNEL); if (!ce->path) { kmem_cache_free(cache_slab, ce); return ERR_PTR(-ENOMEM); @@ -531,7 +532,7 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha char *s, *e; char sep; - npath = kstrndup(path, strlen(path), GFP_KERNEL); + npath = kstrdup(path, GFP_KERNEL); if (!npath) return ERR_PTR(-ENOMEM); @@ -641,7 +642,7 @@ static int __update_cache_entry(const char *path, if (ce->tgthint) { s = ce->tgthint->name; - th = kstrndup(s, strlen(s), GFP_ATOMIC); + th = kstrdup(s, GFP_ATOMIC); if (!th) return -ENOMEM; } @@ -786,11 +787,11 @@ static int setup_referral(const char *path, struct cache_entry *ce, memset(ref, 0, sizeof(*ref)); - ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC); + ref->path_name = kstrdup(path, GFP_ATOMIC); if (!ref->path_name) return -ENOMEM; - ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC); + ref->node_name = kstrdup(target, GFP_ATOMIC); if (!ref->node_name) { rc = -ENOMEM; goto err_free_path; @@ -828,7 +829,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) goto err_free_it; } - it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC); + it->it_name = kstrdup(t->name, GFP_ATOMIC); if (!it->it_name) { kfree(it); rc = -ENOMEM; @@ -882,7 +883,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -936,7 +937,7 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -991,7 +992,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1053,7 +1054,7 @@ int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1111,7 +1112,7 @@ int dfs_cache_get_tgt_referral(const char *path, struct dfs_info3_param *ref) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; if (!it || !ref) @@ -1166,7 +1167,7 @@ int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fu if (!vi) return -ENOMEM; - vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + vi->fullpath = kstrdup(fullpath, GFP_KERNEL); if (!vi->fullpath) { rc = -ENOMEM; goto err_free_vi; @@ -1484,7 +1485,7 @@ static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) { int rc = 0; unsigned int xid; - char *path, *npath; + const char *path, *npath; struct cache_entry *ce; struct cifs_ses *root_ses = NULL, *ses; struct dfs_info3_param *refs = NULL; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a3fb81e0ba17..c85aff838305 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -78,31 +78,31 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } /* Note: caller must free return buffer */ -char * -build_path_from_dentry(struct dentry *direntry) +const char * +build_path_from_dentry(struct dentry *direntry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); bool prefix = tcon->Flags & SMB_SHARE_IS_IN_DFS; - return build_path_from_dentry_optional_prefix(direntry, + return build_path_from_dentry_optional_prefix(direntry, page, prefix); } char * -build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) +build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + bool prefix) { - struct dentry *temp; - int namelen; int dfsplen; int pplen = 0; - char *full_path; - char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - unsigned seq; + char dirsep = CIFS_DIR_SEP(cifs_sb); + char *s; + + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); - dirsep = CIFS_DIR_SEP(cifs_sb); if (prefix) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else @@ -111,86 +111,39 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; -cifs_bp_rename_retry: - namelen = dfsplen + pplen; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - namelen += (1 + temp->d_name.len); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - return NULL; - } - } - rcu_read_unlock(); - - full_path = kmalloc(namelen+1, GFP_ATOMIC); - if (full_path == NULL) - return full_path; - full_path[namelen] = 0; /* trailing null */ - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - spin_lock(&temp->d_lock); - namelen -= 1 + temp->d_name.len; - if (namelen < 0) { - spin_unlock(&temp->d_lock); - break; - } else { - full_path[namelen] = dirsep; - strncpy(full_path + namelen + 1, temp->d_name.name, - temp->d_name.len); - cifs_dbg(FYI, "name: %s\n", full_path + namelen); - } - spin_unlock(&temp->d_lock); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - kfree(full_path); - return NULL; - } - } - rcu_read_unlock(); - if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { - cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", - namelen, dfsplen); - /* presumably this is only possible if racing with a rename - of one of the parent directories (we can not lock the dentries - above us to prevent this, but retrying should be harmless) */ - kfree(full_path); - goto cifs_bp_rename_retry; - } - /* DIR_SEP already set for byte 0 / vs \ but not for - subsequent slashes in prepath which currently must - be entered the right way - not sure if there is an alternative - since the '\' is a valid posix character so we can not switch - those safely to '/' if any are found in the middle of the prepath */ - /* BB test paths to Windows with '/' in the midst of prepath */ - + s = dentry_path_raw(direntry, page, PAGE_SIZE); + if (IS_ERR(s)) + return s; + if (!s[1]) // for root we want "", not "/" + s++; + if (s < (char *)page + pplen + dfsplen) + return ERR_PTR(-ENAMETOOLONG); if (pplen) { - int i; - cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); - memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); - full_path[dfsplen] = dirsep; - for (i = 0; i < pplen-1; i++) - if (full_path[dfsplen+1+i] == '/') - full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + s -= pplen; + memcpy(s + 1, cifs_sb->prepath, pplen - 1); + *s = '/'; } + if (dirsep != '/') { + /* BB test paths to Windows with '/' in the midst of prepath */ + char *p; + for (p = s; *p; p++) + if (*p == '/') + *p = dirsep; + } if (dfsplen) { - strncpy(full_path, tcon->treeName, dfsplen); + s -= dfsplen; + memcpy(s, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; + if (s[i] == '\\') + s[i] = '/'; } } } - return full_path; + return s; } /* @@ -233,7 +186,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, int desired_access; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = tlink_tcon(tlink); - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; @@ -244,9 +198,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (tcon->ses->server->oplocks) *oplock = REQ_OPLOCK; - full_path = build_path_from_dentry(direntry); - if (!full_path) - return -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + free_dentry_path(page); + return PTR_ERR(full_path); + } if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -418,15 +374,16 @@ cifs_create_get_file_info: if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((*oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = inode->i_gid; - else - newinode->i_gid = current_fsgid(); + if ((*oplock & CIFS_CREATE_ACTION) && S_ISREG(newinode->i_mode)) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } } } } @@ -448,7 +405,7 @@ cifs_create_set_dentry: out: kfree(buf); - kfree(full_path); + free_dentry_path(page); return rc; out_err: @@ -619,7 +576,8 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page; if (!old_valid_dev(device_number)) return -EINVAL; @@ -629,13 +587,13 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, if (IS_ERR(tlink)) return PTR_ERR(tlink); + page = alloc_dentry_path(); tcon = tlink_tcon(tlink); - xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mknod_out; } @@ -644,7 +602,7 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, device_number); mknod_out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -660,7 +618,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *pTcon; struct inode *newInode = NULL; - char *full_path = NULL; + const char *full_path; + void *page; xid = get_xid(); @@ -687,11 +646,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { cifs_put_tlink(tlink); free_xid(xid); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } if (d_really_is_positive(direntry)) { @@ -727,7 +688,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } newInode = ERR_PTR(rc); } - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return d_splice_alias(newInode, direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 26de4329d161..639c59596d4f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -112,7 +112,7 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } -int cifs_posix_open(char *full_path, struct inode **pinode, +int cifs_posix_open(const char *full_path, struct inode **pinode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, unsigned int xid) { @@ -165,7 +165,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, goto posix_open_ret; } } else { - cifs_fattr_to_inode(*pinode, &fattr); + cifs_revalidate_mapping(*pinode); + rc = cifs_fattr_to_inode(*pinode, &fattr); } posix_open_ret: @@ -174,7 +175,7 @@ posix_open_ret: } static int -cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, +cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, struct cifs_fid *fid, unsigned int xid) { @@ -529,7 +530,8 @@ int cifs_open(struct inode *inode, struct file *file) struct cifs_tcon *tcon; struct tcon_link *tlink; struct cifsFileInfo *cfile = NULL; - char *full_path = NULL; + void *page; + const char *full_path; bool posix_open_ok = false; struct cifs_fid fid; struct cifs_pending_open open; @@ -545,9 +547,10 @@ int cifs_open(struct inode *inode, struct file *file) tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -639,7 +642,7 @@ int cifs_open(struct inode *inode, struct file *file) } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -688,7 +691,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) struct TCP_Server_Info *server; struct cifsInodeInfo *cinode; struct inode *inode; - char *full_path = NULL; + void *page; + const char *full_path; int desired_access; int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; @@ -698,9 +702,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) mutex_lock(&cfile->fh_mutex); if (!cfile->invalidHandle) { mutex_unlock(&cfile->fh_mutex); - rc = 0; free_xid(xid); - return rc; + return 0; } inode = d_inode(cfile->dentry); @@ -714,12 +717,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) * called and if the server was down that means we end up here, and we * can never tell if the caller already has the rename_sem. */ - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { mutex_unlock(&cfile->fh_mutex); + free_dentry_path(page); free_xid(xid); - return rc; + return PTR_ERR(full_path); } cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", @@ -837,7 +841,7 @@ reopen_success: cifs_relock_file(cfile); reopen_error_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2068,34 +2072,31 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, int flags, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return cifs_get_writable_file(cinode, flags, ret_file); } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } @@ -2103,35 +2104,32 @@ int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); return *ret_file ? 0 : -ENOENT; } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 892f51a21278..3e0d016849e3 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -137,6 +137,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("min_enc_offload", Opt_min_enc_offload), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), + fsparam_u32("rasize", Opt_rasize), fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), @@ -188,8 +189,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { {} }; -int -cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -203,7 +204,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported!\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; @@ -238,7 +239,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) ctx->nullauth = 1; break; default: - cifs_dbg(VFS, "bad security option: %s\n", value); + cifs_errorf(fc, "bad security option: %s\n", value); return 1; } @@ -254,8 +255,8 @@ static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_err, NULL } }; -int -cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -291,7 +292,7 @@ cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) ctx->cache_rw = true; break; default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); + cifs_errorf(fc, "bad cache= option: %s\n", value); return 1; } return 0; @@ -339,7 +340,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx } static int -cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) +cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; @@ -347,24 +348,24 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case Smb_1: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); ctx->ops = &smb1_operations; ctx->vals = &smb1_values; break; case Smb_20: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n"); return 1; } ctx->ops = &smb20_operations; @@ -372,10 +373,10 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) break; #else case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); return 1; case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n"); return 1; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ case Smb_21: @@ -403,7 +404,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smbdefault_values; break; default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + cifs_errorf(fc, "Unknown vers= option specified: %s\n", value); return 1; } return 0; @@ -430,7 +431,7 @@ int smb3_parse_opt(const char *options, const char *key, char **val) if (nval == p) continue; *nval++ = 0; - *val = kstrndup(nval, strlen(nval), GFP_KERNEL); + *val = kstrdup(nval, GFP_KERNEL); rc = !*val ? -ENOMEM : 0; goto out; } @@ -588,14 +589,14 @@ static int smb3_fs_context_validate(struct fs_context *fc) struct smb3_fs_context *ctx = smb3_fc2context(fc); if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { - cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); + cifs_errorf(fc, "SMB Direct requires Version >=3.0\n"); return -EOPNOTSUPP; } #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (ctx->multiuser) { - cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); + cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); return -1; } #endif @@ -605,13 +606,13 @@ static int smb3_fs_context_validate(struct fs_context *fc) if (!ctx->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); + cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n"); return -1; } /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { - cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); + cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n"); return -ENOENT; } @@ -684,49 +685,50 @@ static void smb3_fs_context_free(struct fs_context *fc) * Compare the old and new proposed context during reconfigure * and check if the changes are compatible. */ -static int smb3_verify_reconfigure_ctx(struct smb3_fs_context *new_ctx, +static int smb3_verify_reconfigure_ctx(struct fs_context *fc, + struct smb3_fs_context *new_ctx, struct smb3_fs_context *old_ctx) { if (new_ctx->posix_paths != old_ctx->posix_paths) { - cifs_dbg(VFS, "can not change posixpaths during remount\n"); + cifs_errorf(fc, "can not change posixpaths during remount\n"); return -EINVAL; } if (new_ctx->sectype != old_ctx->sectype) { - cifs_dbg(VFS, "can not change sec during remount\n"); + cifs_errorf(fc, "can not change sec during remount\n"); return -EINVAL; } if (new_ctx->multiuser != old_ctx->multiuser) { - cifs_dbg(VFS, "can not change multiuser during remount\n"); + cifs_errorf(fc, "can not change multiuser during remount\n"); return -EINVAL; } if (new_ctx->UNC && (!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) { - cifs_dbg(VFS, "can not change UNC during remount\n"); + cifs_errorf(fc, "can not change UNC during remount\n"); return -EINVAL; } if (new_ctx->username && (!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) { - cifs_dbg(VFS, "can not change username during remount\n"); + cifs_errorf(fc, "can not change username during remount\n"); return -EINVAL; } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { - cifs_dbg(VFS, "can not change password during remount\n"); + cifs_errorf(fc, "can not change password during remount\n"); return -EINVAL; } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { - cifs_dbg(VFS, "can not change domainname during remount\n"); + cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } if (new_ctx->nodename && (!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) { - cifs_dbg(VFS, "can not change nodename during remount\n"); + cifs_errorf(fc, "can not change nodename during remount\n"); return -EINVAL; } if (new_ctx->iocharset && (!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) { - cifs_dbg(VFS, "can not change iocharset during remount\n"); + cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } @@ -747,7 +749,7 @@ static int smb3_reconfigure(struct fs_context *fc) struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); int rc; - rc = smb3_verify_reconfigure_ctx(ctx, cifs_sb->ctx); + rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx); if (rc) return rc; @@ -933,13 +935,33 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, */ if ((result.uint_32 < CIFS_MAX_MSGSIZE) || (result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) { - cifs_dbg(VFS, "%s: Invalid blocksize\n", + cifs_errorf(fc, "%s: Invalid blocksize\n", __func__); goto cifs_parse_mount_err; } ctx->bsize = result.uint_32; ctx->got_bsize = true; break; + case Opt_rasize: + /* + * readahead size realistically should never need to be + * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M + * (perhaps an exception should be considered in the + * for the case of a large number of channels + * when multichannel is negotiated) since that would lead + * to plenty of parallel I/O in flight to the server. + * Note that smaller read ahead sizes would + * hurt performance of common tools like cp and scp + * which often trigger sequential i/o with read ahead + */ + if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) || + (result.uint_32 < CIFS_DEFAULT_IOSIZE)) { + cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n", + __func__, result.uint_32, SMB3_DEFAULT_IOSIZE); + goto cifs_parse_mount_err; + } + ctx->rasize = result.uint_32; + break; case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; @@ -951,25 +973,25 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; if (ctx->acregmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acregmax too large\n"); + cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_acdirmax: ctx->acdirmax = HZ * result.uint_32; if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acdirmax too large\n"); + cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_actimeo: if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "timeout too large\n"); + cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || (ctx->acregmax != CIFS_DEF_ACTIMEO)) { - cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n"); break; } ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; @@ -982,7 +1004,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { - cifs_dbg(VFS, "%s: Invalid max_credits value\n", + cifs_errorf(fc, "%s: Invalid max_credits value\n", __func__); goto cifs_parse_mount_err; } @@ -990,7 +1012,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_channels: if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) { - cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n", + cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n", __func__, CIFS_MAX_CHANNELS); goto cifs_parse_mount_err; } @@ -999,7 +1021,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { - cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n"); goto cifs_parse_mount_err; } break; @@ -1010,23 +1032,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case 0: break; case -ENOMEM: - cifs_dbg(VFS, "Unable to allocate memory for devname\n"); + cifs_errorf(fc, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: - cifs_dbg(VFS, "Malformed UNC in devname\n"); + cifs_errorf(fc, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: - cifs_dbg(VFS, "Unknown error parsing devname\n"); + cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } ctx->source = kstrdup(param->string, GFP_KERNEL); if (ctx->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } fc->source = kstrdup(param->string, GFP_KERNEL); if (fc->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } break; @@ -1046,7 +1068,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } ctx->username = kstrdup(param->string, GFP_KERNEL); if (ctx->username == NULL) { - cifs_dbg(VFS, "OOM when copying username string\n"); + cifs_errorf(fc, "OOM when copying username string\n"); goto cifs_parse_mount_err; } break; @@ -1058,7 +1080,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = kstrdup(param->string, GFP_KERNEL); if (ctx->password == NULL) { - cifs_dbg(VFS, "OOM when copying password string\n"); + cifs_errorf(fc, "OOM when copying password string\n"); goto cifs_parse_mount_err; } break; @@ -1085,7 +1107,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->domainname); ctx->domainname = kstrdup(param->string, GFP_KERNEL); if (ctx->domainname == NULL) { - cifs_dbg(VFS, "OOM when copying domainname string\n"); + cifs_errorf(fc, "OOM when copying domainname string\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); @@ -1109,7 +1131,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->iocharset); ctx->iocharset = kstrdup(param->string, GFP_KERNEL); if (ctx->iocharset == NULL) { - cifs_dbg(VFS, "OOM when copying iocharset string\n"); + cifs_errorf(fc, "OOM when copying iocharset string\n"); goto cifs_parse_mount_err; } } @@ -1175,30 +1197,32 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ - if (cifs_parse_smb_version(param->string, ctx, is_smb3) != 0) + if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0) goto cifs_parse_mount_err; ctx->got_version = true; break; case Opt_sec: - if (cifs_parse_security_flavors(param->string, ctx) != 0) + if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_cache: - if (cifs_parse_cache_flavor(param->string, ctx) != 0) + if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_witness: #ifndef CONFIG_CIFS_SWN_UPCALL - cifs_dbg(VFS, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); + cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); goto cifs_parse_mount_err; #endif ctx->witness = true; pr_warn_once("Witness protocol support is experimental\n"); break; case Opt_rootfs: -#ifdef CONFIG_CIFS_ROOT - ctx->rootfs = true; +#ifndef CONFIG_CIFS_ROOT + cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n"); + goto cifs_parse_mount_err; #endif + ctx->rootfs = true; break; case Opt_posixpaths: if (result.negated) @@ -1288,7 +1312,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE - cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); + cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif ctx->fsc = true; @@ -1309,15 +1333,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (result.negated) { ctx->nopersistent = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } else { ctx->persistent = true; if ((ctx->nopersistent) || (ctx->resilient)) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1328,8 +1350,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } else { ctx->resilient = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1377,7 +1398,9 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); - ctx->bsize = 1024 * 1024; /* can improve cp performance significantly */ + /* By default 4MB read ahead size, 1MB block size */ + ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance significantly */ + ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */ /* * default to SFM style remapping of seven reserved characters diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 87dd1f7168f2..2a71c8e411ac 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -13,7 +13,12 @@ #include <linux/parser.h> #include <linux/fs_parser.h> -#define cifs_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) +/* Log errors in fs_context (new mount api) but also in dmesg (old style) */ +#define cifs_errorf(fc, fmt, ...) \ + do { \ + errorf(fc, fmt, ## __VA_ARGS__); \ + cifs_dbg(VFS, fmt, ## __VA_ARGS__); \ + } while (0) enum smb_version { Smb_1 = 1, @@ -115,6 +120,7 @@ enum cifs_param { Opt_dirmode, Opt_min_enc_offload, Opt_blocksize, + Opt_rasize, Opt_rsize, Opt_wsize, Opt_actimeo, @@ -230,6 +236,7 @@ struct smb3_fs_context { /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; unsigned int bsize; + unsigned int rasize; unsigned int rsize; unsigned int wsize; unsigned int min_offload; @@ -257,10 +264,6 @@ struct smb3_fs_context { extern const struct fs_parameter_spec smb3_fs_parameters[]; -extern int cifs_parse_cache_flavor(char *value, - struct smb3_fs_context *ctx); -extern int cifs_parse_security_flavors(char *value, - struct smb3_fs_context *ctx); extern int smb3_init_fs_context(struct fs_context *fc); extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx); extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7c61bc9573c0..002d864b8f7b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -157,12 +157,18 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } /* populate an inode with info from a cifs_fattr struct */ -void +int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (!(inode->i_state & I_NEW) && + unlikely(inode_wrong_type(inode, fattr->cf_mode))) { + CIFS_I(inode)->time = 0; /* force reval */ + return -ESTALE; + } + cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); @@ -219,6 +225,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_flags |= S_AUTOMOUNT; if (inode->i_state & I_NEW) cifs_set_ops(inode); + return 0; } void @@ -363,7 +370,7 @@ cifs_get_file_info_unix(struct file *filp) rc = 0; } - cifs_fattr_to_inode(inode, &fattr); + rc = cifs_fattr_to_inode(inode, &fattr); free_xid(xid); return rc; } @@ -426,14 +433,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } /* if filetype is different, return error */ - if (unlikely(((*pinode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*pinode)->time = 0; /* force reval */ - rc = -ESTALE; - goto cgiiu_exit; - } - - cifs_fattr_to_inode(*pinode, &fattr); + rc = cifs_fattr_to_inode(*pinode, &fattr); } cgiiu_exit: @@ -783,7 +783,8 @@ cifs_get_file_info(struct file *filp) */ fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - cifs_fattr_to_inode(inode, &fattr); + /* if filetype is different, return error */ + rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: free_xid(xid); return rc; @@ -1100,16 +1101,8 @@ handle_mnt_opt: rc = -ESTALE; goto out; } - /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_buf_release(smb1_backup_rsp_buf); @@ -1215,14 +1208,7 @@ smb311_posix_get_inode_info(struct inode **inode, } /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_put_tlink(tlink); @@ -1249,7 +1235,7 @@ cifs_find_inode(struct inode *inode, void *opaque) return 0; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + if (inode_wrong_type(inode, fattr->cf_mode)) return 0; /* if it's not a directory or has no dentries, then flag it */ @@ -1317,6 +1303,7 @@ retry_iget5_locked: } } + /* can't fail - see cifs_find_inode() */ cifs_fattr_to_inode(inode, fattr); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; @@ -1408,7 +1395,7 @@ out: int cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, - char *full_path, __u32 dosattr) + const char *full_path, __u32 dosattr) { bool set_time = false; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1609,7 +1596,8 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; @@ -1629,6 +1617,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) server = tcon->ses->server; xid = get_xid(); + page = alloc_dentry_path(); if (tcon->nodelete) { rc = -EACCES; @@ -1637,9 +1626,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) /* Unlink can be called from rename so we can not take the * sb->s_vfs_rename_mutex here */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto unlink_out; } @@ -1713,7 +1702,7 @@ out_reval: cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: - kfree(full_path); + free_dentry_path(page); kfree(attrs); free_xid(xid); cifs_put_tlink(tlink); @@ -1740,6 +1729,16 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (rc) return rc; + if (!S_ISDIR(inode->i_mode)) { + /* + * mkdir succeeded, but another client has managed to remove the + * sucker and replace it with non-directory. Return success, + * but don't leave the child in dcache. + */ + iput(inode); + d_drop(dentry); + return 0; + } /* * setting nlink not necessary except in cases where we failed to get it * from the server or was set bogus. Also, since this is a brand new @@ -1791,7 +1790,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, } } d_instantiate(dentry, inode); - return rc; + return 0; } static int @@ -1866,7 +1865,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path; + const char *full_path; + void *page; cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n", mode, inode); @@ -1879,9 +1879,10 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mkdir_out; } @@ -1924,7 +1925,7 @@ mkdir_out: * attributes are invalid now. */ CIFS_I(inode)->time = 0; - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -1938,16 +1939,17 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct cifsInodeInfo *cifsInode; cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rmdir_exit; } @@ -1997,7 +1999,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) current_time(inode); rmdir_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2072,8 +2074,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -2091,21 +2093,19 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); xid = get_xid(); - /* - * we already have the rename sem so we do not need to - * grab it again here to protect the path integrity - */ - from_name = build_path_from_dentry(source_dentry); - if (from_name == NULL) { - rc = -ENOMEM; + from_name = build_path_from_dentry(source_dentry, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); goto cifs_rename_exit; } - to_name = build_path_from_dentry(target_dentry); - if (to_name == NULL) { - rc = -ENOMEM; + to_name = build_path_from_dentry(target_dentry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_rename_exit; } @@ -2177,18 +2177,21 @@ unlink_target: cifs_rename_exit: kfree(info_buf_source); - kfree(from_name); - kfree(to_name); + free_dentry_path(page2); + free_dentry_path(page1); free_xid(xid); cifs_put_tlink(tlink); return rc; } static bool -cifs_inode_needs_reval(struct inode *inode) +cifs_dentry_needs_reval(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct cached_fid *cfid = NULL; if (cifs_i->time == 0) return true; @@ -2199,6 +2202,16 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; + if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { + mutex_lock(&cfid->fid_mutex); + if (cfid->time && cifs_i->time > cfid->time) { + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + return false; + } + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + } /* * depending on inode type, check if attribute caching disabled for * files or directories @@ -2297,10 +2310,10 @@ cifs_zap_mapping(struct inode *inode) int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; - struct inode *inode = file_inode(filp); + struct dentry *dentry = file_dentry(filp); struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; if (tlink_tcon(cfile->tlink)->unix_ext) @@ -2317,22 +2330,22 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) int rc = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; - char *full_path = NULL; + const char *full_path; + void *page; int count = 0; if (inode == NULL) return -ENOENT; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; xid = get_xid(); - /* can not safely grab the rename sem here if rename calls revalidate - since that would deadlock */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2351,7 +2364,7 @@ again: if (rc == -EAGAIN && count++ < 10) goto again; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; @@ -2395,7 +2408,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) && + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE | STATX_BLOCKS)) && !CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); @@ -2522,7 +2535,7 @@ void cifs_setsize(struct inode *inode, loff_t offset) static int cifs_set_file_size(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path) + unsigned int xid, const char *full_path) { int rc; struct cifsFileInfo *open_file; @@ -2585,6 +2598,14 @@ set_size_out: if (rc == 0) { cifsInode->server_eof = attrs->ia_size; cifs_setsize(inode, attrs->ia_size); + /* + * i_blocks is not related to (i_size / i_blksize), but instead + * 512 byte (2**9) size is required for calculating num blocks. + * Until we can query the server for actual allocation size, + * this is best estimate we have for blocks allocated for a file + * Number of blocks must be rounded up so size 1 is not 0 blocks + */ + inode->i_blocks = (512 - 1 + attrs->ia_size) >> 9; /* * The man page of truncate says if the size changed, @@ -2605,7 +2626,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) { int rc; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2626,9 +2648,9 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (rc < 0) goto out; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2740,7 +2762,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) cifsInode->time = 0; out: kfree(args); - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2756,7 +2778,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifsFileInfo *wfile; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); int rc = -EACCES; __u32 dosattr = 0; __u64 mode = NO_CHANGE_64; @@ -2770,16 +2793,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid |= ATTR_FORCE; rc = setattr_prepare(&init_user_ns, direntry, attrs); - if (rc < 0) { - free_xid(xid); - return rc; - } + if (rc < 0) + goto cifs_setattr_exit; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - free_xid(xid); - return rc; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto cifs_setattr_exit; } /* @@ -2929,8 +2949,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) mark_inode_dirty(inode); cifs_setattr_exit: - kfree(full_path); free_xid(xid); + free_dentry_path(page); return rc; } @@ -2953,12 +2973,3 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, /* BB: add cifs_setattr_legacy for really old servers */ return rc; } - -#if 0 -void cifs_delete_inode(struct inode *inode) -{ - cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); - /* may have to add back in if and when safe distributed caching of - directories added e.g. via FindNotify */ -} -#endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index dcde44ff6cf9..08d99fec593e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -42,13 +42,16 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct dentry *dentry = filep->f_path.dentry; - unsigned char *path; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL, root_path; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + free_dentry_path(page); + return PTR_ERR(path); + } cifs_dbg(FYI, "%s %s\n", __func__, path); @@ -73,7 +76,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, ici_exit: if (utf16_path != &root_path) kfree(utf16_path); - kfree(path); + free_dentry_path(page); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 7c5878a645d9..616e1bc0cc0a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, { int rc = -EACCES; unsigned int xid; - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -524,11 +524,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, tcon = tlink_tcon(tlink); xid = get_xid(); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); - from_name = build_path_from_dentry(old_file); - to_name = build_path_from_dentry(direntry); - if ((from_name == NULL) || (to_name == NULL)) { - rc = -ENOMEM; + from_name = build_path_from_dentry(old_file, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); + goto cifs_hl_exit; + } + to_name = build_path_from_dentry(direntry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_hl_exit; } @@ -587,8 +593,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, } cifs_hl_exit: - kfree(from_name); - kfree(to_name); + free_dentry_path(page1); + free_dentry_path(page2); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -600,7 +606,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, { int rc = -ENOMEM; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; char *target_path = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -620,11 +627,13 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(direntry); - if (!full_path) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { free_xid(xid); cifs_put_tlink(tlink); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -649,7 +658,7 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, &target_path, reparse_point); } - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); if (rc != 0) { @@ -669,7 +678,8 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct inode *newinode = NULL; xid = get_xid(); @@ -681,9 +691,9 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } pTcon = tlink_tcon(tlink); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto symlink_exit; } @@ -719,7 +729,7 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } } symlink_exit: - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 82e176720ca6..c15a90e422be 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1180,7 +1180,7 @@ int update_super_prepath(struct cifs_tcon *tcon, char *prefix) kfree(cifs_sb->prepath); if (prefix && *prefix) { - cifs_sb->prepath = kstrndup(prefix, strlen(prefix), GFP_ATOMIC); + cifs_sb->prepath = kstrdup(prefix, GFP_ATOMIC); if (!cifs_sb->prepath) { rc = -ENOMEM; goto out; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 80bf4c6f4c7b..63bfc533c9fb 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -119,9 +119,7 @@ retry: /* update inode in place * if both i_ino and i_mode didn't change */ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && - (inode->i_mode & S_IFMT) == - (fattr->cf_mode & S_IFMT)) { - cifs_fattr_to_inode(inode, fattr); + cifs_fattr_to_inode(inode, fattr) == 0) { dput(dentry); return; } @@ -384,7 +382,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, static int initiate_cifs_search(const unsigned int xid, struct file *file, - char *full_path) + const char *full_path) { __u16 search_flags; int rc = 0; @@ -704,7 +702,7 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char *full_path, + struct file *file, const char *full_path, char **current_entry, int *num_to_ret) { __u16 search_flags; @@ -942,13 +940,14 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); xid = get_xid(); - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rddir2_exit; } @@ -1043,7 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index e31b939e628c..3b83839fc2c2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -926,9 +926,7 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, 0); if (!rc) { - *symlinkinfo = kstrndup(referral.node_name, - strlen(referral.node_name), - GFP_KERNEL); + *symlinkinfo = kstrdup(referral.node_name, GFP_KERNEL); free_dfs_info_param(&referral); if (!*symlinkinfo) rc = -ENOMEM; @@ -1027,7 +1025,7 @@ cifs_can_echo(struct TCP_Server_Info *server) static int cifs_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 99a1951a01ec..d9a990c99121 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -58,6 +58,7 @@ #define SMB2_HMACSHA256_SIZE (32) #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +#define SMB3_GCM128_CRYPTKEY_SIZE (16) #define SMB3_GCM256_CRYPTKEY_SIZE (32) /* Maximum buffer size value we can send with 1 credit */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index a718dc77e604..9a61209a283e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -512,7 +512,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; @@ -525,11 +524,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; /* If it is a root and its handle is cached then use it */ - if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc) - goto out; - + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); + if (!rc) { if (tcon->crfid.file_all_info_is_valid) { move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); @@ -540,7 +536,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(cfid); + close_cached_dir(cfid); goto out; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b50164e2c88d..06d555d4da9a 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -667,6 +667,7 @@ smb2_is_valid_lease_break(char *buffer) !memcmp(rsp->LeaseKey, tcon->crfid.fid->lease_key, SMB2_LEASE_KEY_SIZE)) { + tcon->crfid.time = 0; INIT_WORK(&tcon->crfid.lease_break, smb2_cached_lease_break); queue_work(cifsiod_wq, @@ -754,8 +755,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } } spin_unlock(&cifs_tcp_ses_lock); - cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); - return false; + cifs_dbg(FYI, "No file id matched, oplock break ignored\n"); + return true; } void diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9bae7e8deb09..dd0eb665b680 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -690,17 +690,21 @@ smb2_close_cached_fid(struct kref *ref) cfid->is_valid = false; cfid->file_all_info_is_valid = false; cfid->has_lease = false; + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } } } -void close_shroot(struct cached_fid *cfid) +void close_cached_dir(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); kref_put(&cfid->refcount, smb2_close_cached_fid); mutex_unlock(&cfid->fid_mutex); } -void close_shroot_lease_locked(struct cached_fid *cfid) +void close_cached_dir_lease_locked(struct cached_fid *cfid) { if (cfid->has_lease) { cfid->has_lease = false; @@ -708,10 +712,10 @@ void close_shroot_lease_locked(struct cached_fid *cfid) } } -void close_shroot_lease(struct cached_fid *cfid) +void close_cached_dir_lease(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); - close_shroot_lease_locked(cfid); + close_cached_dir_lease_locked(cfid); mutex_unlock(&cfid->fid_mutex); } @@ -721,13 +725,15 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot_lease(cfid); + close_cached_dir_lease(cfid); } /* - * Open the directory at the root of a share + * Open the and cache a directory handle. + * Only supported for the root handle. */ -int open_shroot(unsigned int xid, struct cifs_tcon *tcon, +int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, struct cached_fid **cfid) { @@ -745,6 +751,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; + struct dentry *dentry; + + if (tcon->nohandlecache) + return -ENOTSUPP; + + if (cifs_sb->root == NULL) + return -ENOENT; + + if (strlen(path)) + return -ENOENT; + + dentry = cifs_sb->root; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { @@ -830,11 +848,9 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, }; /* - * caller expects this func to set pfid to a valid - * cached root, so we copy the existing one and get a - * reference. + * caller expects this func to set the fid in crfid to valid + * cached root, so increment the refcount. */ - memcpy(pfid, tcon->crfid.fid, sizeof(*pfid)); kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); @@ -867,13 +883,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); tcon->crfid.tcon = tcon; tcon->crfid.is_valid = true; + tcon->crfid.dentry = dentry; + dget(dentry); kref_init(&tcon->crfid.refcount); /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { + /* + * See commit 2f94a3125b87. Increment the refcount when we + * get a lease for root, release it if lease break occurs + */ kref_get(&tcon->crfid.refcount); tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, @@ -892,6 +913,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&tcon->crfid.file_all_info)) tcon->crfid.file_all_info_is_valid = true; + tcon->crfid.time = jiffies; + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -905,6 +928,22 @@ oshr_free: return rc; } +int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid) +{ + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + *cfid = &tcon->crfid; + kref_get(&tcon->crfid.refcount); + mutex_unlock(&tcon->crfid.fid_mutex); + return 0; + } + mutex_unlock(&tcon->crfid.fid_mutex); + return -ENOENT; +} + static void smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) @@ -914,7 +953,6 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - bool no_cached_open = tcon->nohandlecache; struct cached_fid *cfid = NULL; oparms.tcon = tcon; @@ -924,14 +962,12 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) { + rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid); + if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + else rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - } else { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc == 0) - memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); - } if (rc) return; @@ -945,10 +981,10 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, FS_VOLUME_INFORMATION); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ - if (no_cached_open) + if (cfid == NULL) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(cfid); + close_cached_dir(cfid); } static void @@ -1531,7 +1567,10 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); - if (rc) { + if (rc == -EOPNOTSUPP) { + pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + goto req_res_key_exit; + } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); goto req_res_key_exit; } @@ -1763,18 +1802,14 @@ smb2_ioctl_query_info(const unsigned int xid, } iqinf_exit: - kfree(vars); - kfree(buffer); - SMB2_open_free(&rqst[0]); - if (qi.flags & PASSTHRU_FSCTL) - SMB2_ioctl_free(&rqst[1]); - else - SMB2_query_info_free(&rqst[1]); - - SMB2_close_free(&rqst[2]); + cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); + kfree(buffer); return rc; e_fault: @@ -2038,6 +2073,7 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; + struct inode *inode; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -2054,10 +2090,21 @@ smb2_duplicate_extents(const unsigned int xid, cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n", src_off, dest_off, len); - rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); - if (rc) - goto duplicate_extents_out; + inode = d_inode(trgtfile->dentry); + if (inode->i_size < dest_off + len) { + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + /* + * Although also could set plausible allocation size (i_blocks) + * here in addition to setting the file size, in reflink + * it is likely that the target file is sparse. Its allocation + * size will be queried on next revalidate, but it is important + * to make sure that file's cached size is updated immediately + */ + cifs_setsize(inode, dest_off + len); + } rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, @@ -2205,22 +2252,23 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct smb3_notify notify; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_parms oparms; struct cifs_fid fid; struct cifs_tcon *tcon; - unsigned char *path = NULL; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; - - cifs_sb = CIFS_SB(inode->i_sb); + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + rc = PTR_ERR(path); + goto notify_exit; + } - utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb); + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) { rc = -ENOMEM; goto notify_exit; @@ -2252,7 +2300,7 @@ smb3_notify(const unsigned int xid, struct file *pfile, cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); notify_exit: - kfree(path); + free_dentry_path(page); kfree(utf16_path); return rc; } @@ -3640,6 +3688,77 @@ out: return rc; } +static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode) || + off + len >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + rc = smb2_copychunk_range(xid, cfile, cfile, off + len, + i_size_read(file->f_inode) - off - len, off); + if (rc < 0) + goto out; + + eof = cpu_to_le64(i_size_read(file->f_inode) - len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + +static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + __u64 count; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + count = i_size_read(file->f_inode) - off; + eof = cpu_to_le64(i_size_read(file->f_inode) + len); + + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); + if (rc < 0) + goto out; + + rc = smb3_zero_range(file, tcon, off, len, 1); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offset, int whence) { struct cifsFileInfo *wrcfile, *cfile = file->private_data; @@ -3811,6 +3930,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, return smb3_zero_range(file, tcon, off, len, false); } else if (mode == FALLOC_FL_KEEP_SIZE) return smb3_simple_falloc(file, tcon, off, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + return smb3_collapse_range(file, tcon, off, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + return smb3_insert_range(file, tcon, off, len); else if (mode == 0) return smb3_simple_falloc(file, tcon, off, len, false); @@ -4158,7 +4281,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) if (ses->Suid == ses_id) { ses_enc_key = enc ? ses->smb3encryptionkey : ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -4166,7 +4289,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) } spin_unlock(&cifs_tcp_ses_lock); - return 1; + return -EAGAIN; } /* * Encrypt or decrypt @rqst message. @rqst[0] has the following format: @@ -4185,7 +4308,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; - u8 key[SMB3_SIGN_KEY_SIZE]; + u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; char *iv; unsigned int iv_len; @@ -4209,10 +4332,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); else - rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); @@ -4955,7 +5079,7 @@ smb2_next_header(char *buf) static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2199a9bfae8f..e36c2a867783 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1857,7 +1857,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot_lease(&tcon->crfid); + close_cached_dir_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, (void **) &req, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index a5a9e33c0d73..6442dc1c292b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -144,7 +144,7 @@ struct smb2_transform_hdr { } __packed; /* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr { +struct smb2_compression_transform_hdr_unchained { __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ __le32 OriginalCompressedSegmentSize; __le16 CompressionAlgorithm; @@ -160,10 +160,17 @@ struct compression_payload_header { __le16 CompressionAlgorithm; __le16 Flags; __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional */ + /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ } __packed; /* See MS-SMB2 2.2.42.2 */ +struct smb2_compression_transform_hdr_chained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + /* struct compression_payload_header[] */ +} __packed; + +/* See MS-SMB2 2.2.42.2.2 */ struct compression_pattern_payload_v1 { __le16 Pattern; __le16 Reserved1; @@ -181,7 +188,11 @@ struct smb2_rdma_transform { __le32 Reserved2; } __packed; -struct smb2_rdma_encryption_transform { +/* TransformType */ +#define SMB2_RDMA_TRANSFORM_TYPE_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_TYPE_SIGNING 0x0002 + +struct smb2_rdma_crypto_transform { __le16 TransformType; __le16 SignatureLength; __le16 NonceLength; @@ -409,13 +420,29 @@ struct smb2_netname_neg_context { } __packed; /* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 * and 2.2.4.1.5 */ +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + /* RDMA Transform IDs */ #define SMB2_RDMA_TRANSFORM_NONE 0x0000 #define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 struct smb2_rdma_transform_capabilities_context { __le16 ContextType; /* 7 */ @@ -427,6 +454,11 @@ struct smb2_rdma_transform_capabilities_context { __le16 RDMATransformIds[]; } __packed; +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + /* Signing algorithms */ #define SIGNING_ALG_HMAC_SHA256 0 #define SIGNING_ALG_AES_CMAC 1 @@ -634,7 +666,8 @@ struct smb2_tree_connect_rsp { #define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 #define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 #define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0004FF33 +#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 /* Possible share capabilities */ #define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ @@ -1390,7 +1423,11 @@ struct smb2_lock_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; - __le32 Reserved; + /* + * The least significant four bits are the index, the other 28 bits are + * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 + */ + __le32 LockSequenceNumber; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ /* Followed by at least one */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a2eb34a8d9c9..a5f87b02cfaf 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -69,12 +69,16 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); -extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - struct cached_fid **cfid); -extern void close_shroot(struct cached_fid *cfid); -extern void close_shroot_lease(struct cached_fid *cfid); -extern void close_shroot_lease_locked(struct cached_fid *cfid); +extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); +extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid); +extern void close_cached_dir(struct cached_fid *cfid); +extern void close_cached_dir_lease(struct cached_fid *cfid); +extern void close_cached_dir_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index ebccd71cc60a..e6fa76ab70be 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -298,7 +298,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; - __u8 L[4] = {0, 0, 0, 128}; + __u8 L128[4] = {0, 0, 0, 128}; + __u8 L256[4] = {0, 0, 1, 0}; int rc = 0; unsigned char prfhash[SMB2_HMACSHA256_SIZE]; unsigned char *hashptr = prfhash; @@ -354,8 +355,14 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L, 4); + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L256, 4); + } else { + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L128, 4); + } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; @@ -390,6 +397,9 @@ generate_smb3signingkey(struct cifs_ses *ses, const struct derivation_triplet *ptriplet) { int rc; +#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS + struct TCP_Server_Info *server = ses->server; +#endif /* * All channels use the same encryption/decryption keys but @@ -422,11 +432,11 @@ generate_smb3signingkey(struct cifs_ses *ses, rc = generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, ses->smb3encryptionkey, - SMB3_SIGN_KEY_SIZE); + SMB3_ENC_DEC_KEY_SIZE); rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, - SMB3_SIGN_KEY_SIZE); + SMB3_ENC_DEC_KEY_SIZE); if (rc) return rc; } @@ -442,14 +452,23 @@ generate_smb3signingkey(struct cifs_ses *ses, */ cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid), &ses->Suid); + cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type); cifs_dbg(VFS, "Session Key %*ph\n", SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); cifs_dbg(VFS, "Signing Key %*ph\n", SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); - cifs_dbg(VFS, "ServerIn Key %*ph\n", - SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey); - cifs_dbg(VFS, "ServerOut Key %*ph\n", - SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey); + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } else { + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } #endif return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 007d99437c77..c1725b55f364 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1196,9 +1196,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + mutex_lock(&server->srv_mutex); smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); + mutex_unlock(&server->srv_mutex); + } for (i = 0; i < num_rqst; i++) { rc = wait_for_response(server, midQ[i]); @@ -1266,7 +1269,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; + mutex_lock(&server->srv_mutex); smb311_update_preauth_hash(ses, &iov, 1); + mutex_unlock(&server->srv_mutex); } out: diff --git a/fs/cifs/unc.c b/fs/cifs/unc.c index 394aa00cea40..f6fc5e343ea4 100644 --- a/fs/cifs/unc.c +++ b/fs/cifs/unc.c @@ -50,7 +50,6 @@ char *extract_sharename(const char *unc) { const char *src; char *delim, *dst; - int len; /* skip double chars at the beginning */ src = unc + 2; @@ -60,10 +59,9 @@ char *extract_sharename(const char *unc) if (!delim) return ERR_PTR(-EINVAL); delim++; - len = strlen(delim); /* caller has to free the memory */ - dst = kstrndup(delim, len, GFP_KERNEL); + dst = kstrdup(delim, GFP_KERNEL); if (!dst) return ERR_PTR(-ENOMEM); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 41a611e76bb7..e351b945135b 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -53,7 +53,7 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL }; static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -77,7 +77,7 @@ static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -112,7 +112,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -120,10 +121,11 @@ static int cifs_xattr_set(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } /* return dos attributes as pseudo xattr */ @@ -235,7 +237,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -297,7 +299,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -305,10 +308,11 @@ static int cifs_xattr_get(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -401,7 +405,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -EOPNOTSUPP; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -414,7 +418,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; @@ -425,10 +430,11 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto list_ea_exit; } /* return dos attributes as pseudo xattr */ @@ -442,7 +448,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/coda/file.c b/fs/coda/file.c index 128d63df5bfb..ef5ca22bfb3e 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -175,10 +175,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) ret = call_mmap(vma->vm_file, vma); if (ret) { - /* if call_mmap fails, our caller will put coda_file so we - * should drop the reference to the host_file that we got. + /* if call_mmap fails, our caller will put host_file so we + * should drop the reference to the coda_file that we got. */ - fput(host_file); + fput(coda_file); kfree(cvm_ops); } else { /* here we add redirects for the open/close vm_operations */ diff --git a/fs/coredump.c b/fs/coredump.c index 1c0fdc1aa70b..2868e3e171ae 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -809,6 +809,16 @@ void do_coredump(const kernel_siginfo_t *siginfo) } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); + /* + * Ensures that file size is big enough to contain the current + * file postion. This prevents gdb from complaining about + * a truncated file if the last "write" to the file was + * dump_skip. + */ + if (cprm.to_skip) { + cprm.to_skip--; + dump_emit(&cprm, "", 1); + } file_end_write(cprm.file); } if (ispipe && core_pipe_limit) @@ -835,7 +845,7 @@ fail: * do on a core-file: use only these functions to write out all the * necessary info. */ -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) { struct file *file = cprm->file; loff_t pos = file->f_pos; @@ -855,9 +865,8 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 1; } -EXPORT_SYMBOL(dump_emit); -int dump_skip(struct coredump_params *cprm, size_t nr) +static int __dump_skip(struct coredump_params *cprm, size_t nr) { static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; @@ -869,13 +878,35 @@ int dump_skip(struct coredump_params *cprm, size_t nr) return 1; } else { while (nr > PAGE_SIZE) { - if (!dump_emit(cprm, zeroes, PAGE_SIZE)) + if (!__dump_emit(cprm, zeroes, PAGE_SIZE)) return 0; nr -= PAGE_SIZE; } - return dump_emit(cprm, zeroes, nr); + return __dump_emit(cprm, zeroes, nr); } } + +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} EXPORT_SYMBOL(dump_skip); #ifdef CONFIG_ELF_CORE @@ -902,11 +933,11 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, stop = !dump_emit(cprm, kaddr, PAGE_SIZE); kunmap_local(kaddr); put_page(page); + if (stop) + return 0; } else { - stop = !dump_skip(cprm, PAGE_SIZE); + dump_skip(cprm, PAGE_SIZE); } - if (stop) - return 0; } return 1; } @@ -914,33 +945,16 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->pos & (align - 1); + unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1); if (align & (align - 1)) return 0; - return mod ? dump_skip(cprm, align - mod) : 1; + if (mod) + cprm->to_skip += align - mod; + return 1; } EXPORT_SYMBOL(dump_align); /* - * Ensures that file size is big enough to contain the current file - * postion. This prevents gdb from complaining about a truncated file - * if the last "write" to the file was dump_skip. - */ -void dump_truncate(struct coredump_params *cprm) -{ - struct file *file = cprm->file; - loff_t offset; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - offset = file->f_op->llseek(file, 0, SEEK_CUR); - if (i_size_read(file->f_mapping->host) < offset) - do_truncate(file_mnt_user_ns(file), file->f_path.dentry, - offset, 0, file); - } -} -EXPORT_SYMBOL(dump_truncate); - -/* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. * In that way we ensure that the core dump is fully interpretable later diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index a5f5c30368a2..2d0c8922f635 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -14,16 +14,30 @@ config FS_ENCRYPTION F2FS and UBIFS make use of this feature. # Filesystems supporting encryption must select this if FS_ENCRYPTION. This -# allows the algorithms to be built as modules when all the filesystems are. +# allows the algorithms to be built as modules when all the filesystems are, +# whereas selecting them from FS_ENCRYPTION would force them to be built-in. +# +# Note: this option only pulls in the algorithms that filesystem encryption +# needs "by default". If userspace will use "non-default" encryption modes such +# as Adiantum encryption, then those other modes need to be explicitly enabled +# in the crypto API; see Documentation/filesystems/fscrypt.rst for details. +# +# Also note that this option only pulls in the generic implementations of the +# algorithms, not any per-architecture optimized implementations. It is +# strongly recommended to enable optimized implementations too. It is safe to +# disable these generic implementations if corresponding optimized +# implementations will always be available too; for this reason, these are soft +# dependencies ('imply' rather than 'select'). Only disable these generic +# implementations if you're sure they will never be needed, though. config FS_ENCRYPTION_ALGS tristate - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_CTS - select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 - select CRYPTO_XTS + imply CRYPTO_AES + imply CRYPTO_CBC + imply CRYPTO_CTS + imply CRYPTO_ECB + imply CRYPTO_HMAC + imply CRYPTO_SHA512 + imply CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/d_path.c b/fs/d_path.c index a69e2cd36e6e..270d62133996 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *d, char *buf, int buflen) +static char *__dentry_path(const struct dentry *d, char *buf, int buflen) { - struct dentry *dentry; + const struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; @@ -347,7 +347,7 @@ restart: *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { - struct dentry *parent = dentry->d_parent; + const struct dentry *parent = dentry->d_parent; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); @@ -371,13 +371,13 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } -char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); -char *dentry_path(struct dentry *dentry, char *buf, int buflen) +char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { char *p = NULL; char *retval; diff --git a/fs/dcache.c b/fs/dcache.c index 7d24ff7eb206..cf871a81f4fd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); +const struct qstr dotdot_name = QSTR_INIT("..", 2); +EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 686e0ad28788..e813acfaa6e8 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -773,7 +773,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { - char buf[3]; + char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); @@ -789,7 +789,6 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, else buf[0] = 'N'; buf[1] = '\n'; - buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); @@ -865,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_bool); +ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + char *str, *copy = NULL; + int copy_len, len; + ssize_t ret; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + str = *(char **)file->private_data; + len = strlen(str) + 1; + copy = kmalloc(len, GFP_KERNEL); + if (!copy) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + copy_len = strscpy(copy, str, len); + debugfs_file_put(dentry); + if (copy_len < 0) { + kfree(copy); + return copy_len; + } + + copy[copy_len] = '\n'; + + ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len); + kfree(copy); + + return ret; +} + +static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* This is really only for read-only strings */ + return -EINVAL; +} + +static const struct file_operations fops_str = { + .read = debugfs_read_file_str, + .write = debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_ro = { + .read = debugfs_read_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_wo = { + .write = debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_str - create a debugfs file that is used to read and write a string value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be + * returned. + * + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will + * be returned. + */ +void debugfs_create_str(const char *name, umode_t mode, + struct dentry *parent, char **value) +{ + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, + &fops_str_ro, &fops_str_wo); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 22e86ae4dd5a..1d252164d97b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -35,7 +35,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; +static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down diff --git a/fs/direct-io.c b/fs/direct-io.c index b61491bf3166..b2e86e739d7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, struct buffer_head *map_bh) { int ret = 0; + int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ if (dio->op == REQ_OP_WRITE) { /* @@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: /* - * If sdio->boundary then we want to schedule the IO now to + * If boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (sdio->boundary) { + if (boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 49c5f9407098..88d95d96e36c 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, bool (*check_cb)(unsigned int x), + int *info_field, int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,8 +137,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_cb && check_cb(x)) - return -EINVAL; + if (check_cb) { + rc = check_cb(x); + if (rc) + return rc; + } *cl_field = x; *info_field = x; @@ -161,17 +164,53 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -static bool dlm_check_zero(unsigned int x) +static int dlm_check_protocol_and_dlm_running(unsigned int x) +{ + switch (x) { + case 0: + /* TCP */ + break; + case 1: + /* SCTP */ + break; + default: + return -EINVAL; + } + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero_and_dlm_running(unsigned int x) +{ + if (!x) + return -EINVAL; + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero(unsigned int x) { - return !x; + if (!x) + return -EINVAL; + + return 0; } -static bool dlm_check_buffer_size(unsigned int x) +static int dlm_check_buffer_size(unsigned int x) { - return (x < DEFAULT_BUFFER_SIZE); + if (x < DEFAULT_BUFFER_SIZE) + return -EINVAL; + + return 0; } -CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running); CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); @@ -179,7 +218,7 @@ CLUSTER_ATTR(toss_secs, dlm_check_zero); CLUSTER_ATTR(scan_secs, dlm_check_zero); CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); -CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); CLUSTER_ATTR(timewarn_cs, dlm_check_zero); CLUSTER_ATTR(waitwarn_us, NULL); @@ -688,6 +727,7 @@ static ssize_t comm_mark_show(struct config_item *item, char *buf) static ssize_t comm_mark_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_comm *comm; unsigned int mark; int rc; @@ -695,7 +735,15 @@ static ssize_t comm_mark_store(struct config_item *item, const char *buf, if (rc) return rc; - config_item_to_comm(item)->mark = mark; + if (mark == 0) + mark = dlm_config.ci_mark; + + comm = config_item_to_comm(item); + rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); + if (rc) + return rc; + + comm->mark = mark; return len; } @@ -870,24 +918,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -void dlm_comm_mark(int nodeid, unsigned int *mark) -{ - struct dlm_comm *cm; - - cm = get_comm(nodeid); - if (!cm) { - *mark = dlm_config.ci_mark; - return; - } - - if (cm->mark) - *mark = cm->mark; - else - *mark = dlm_config.ci_mark; - - put_comm(cm); -} - int dlm_our_nodeid(void) { return local_comm ? local_comm->nodeid : 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index c210250a2581..d2cd4bd20313 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -48,7 +48,6 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d6bbccb0ed15..d5bd990bcab8 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -542,6 +542,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) { kfree(ri); + ++*pos; return NULL; } tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 002123efc6b0..b93df39d0915 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3541,8 +3541,6 @@ static int _create_message(struct dlm_ls *ls, int mb_len, if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); - ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 561dcad08ad6..c14cf2b7faab 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -404,12 +404,6 @@ static int threads_start(void) return error; } -static void threads_stop(void) -{ - dlm_scand_stop(); - dlm_lowcomms_stop(); -} - static int new_lockspace(const char *name, const char *cluster, uint32_t flags, int lvblen, const struct dlm_lockspace_ops *ops, void *ops_arg, @@ -702,8 +696,11 @@ int dlm_new_lockspace(const char *name, const char *cluster, ls_count++; if (error > 0) error = 0; - if (!ls_count) - threads_stop(); + if (!ls_count) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + dlm_lowcomms_stop(); + } out: mutex_unlock(&ls_lock); return error; @@ -788,6 +785,11 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_recoverd_stop(ls); + if (ls_count == 1) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + } + dlm_callback_stop(ls); remove_lockspace(ls); @@ -880,7 +882,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - threads_stop(); + dlm_lowcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 372c34ff8594..166e36fcf3e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -102,6 +102,9 @@ struct listen_connection { struct work_struct rwork; }; +#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) +#define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) + /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; @@ -116,6 +119,7 @@ struct writequeue_entry { struct dlm_node_addr { struct list_head list; int nodeid; + int mark; int addr_count; int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; @@ -134,7 +138,7 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -static int dlm_allow_conn; +int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -303,7 +307,8 @@ static int addr_compare(const struct sockaddr_storage *x, } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out, bool try_new_addr) + struct sockaddr *sa_out, bool try_new_addr, + unsigned int *mark) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -331,6 +336,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, if (!na->addr_count) return -ENOENT; + *mark = na->mark; + if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); @@ -350,7 +357,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, return 0; } -static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, + unsigned int *mark) { struct dlm_node_addr *na; int rv = -EEXIST; @@ -364,6 +372,7 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) for (addr_i = 0; addr_i < na->addr_count; addr_i++) { if (addr_compare(na->addr[addr_i], addr)) { *nodeid = na->nodeid; + *mark = na->mark; rv = 0; goto unlock; } @@ -412,6 +421,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) new_node->nodeid = nodeid; new_node->addr[0] = new_addr; new_node->addr_count = 1; + new_node->mark = dlm_config.ci_mark; list_add(&new_node->list, &dlm_node_addrs); spin_unlock(&dlm_node_addrs_spin); return 0; @@ -519,6 +529,23 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; } +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) +{ + struct dlm_node_addr *na; + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (!na) { + spin_unlock(&dlm_node_addrs_spin); + return -ENOENT; + } + + na->mark = mark; + spin_unlock(&dlm_node_addrs_spin); + + return 0; +} + static void lowcomms_error_report(struct sock *sk) { struct connection *con; @@ -685,10 +712,7 @@ static void shutdown_connection(struct connection *con) { int ret; - if (cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } + flush_work(&con->swork); mutex_lock(&con->sock_mutex); /* nothing to shutdown */ @@ -867,7 +891,7 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); - if (addr_to_nodeid(&peeraddr, &nodeid)) { + if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { unsigned char *b=(unsigned char *)&peeraddr; log_print("connect from non cluster node"); print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, @@ -876,9 +900,6 @@ static int accept_from_sock(struct listen_connection *con) return -1; } - dlm_comm_mark(nodeid, &mark); - sock_set_mark(newsock->sk, mark); - log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -892,6 +913,8 @@ static int accept_from_sock(struct listen_connection *con) goto accept_err; } + sock_set_mark(newsock->sk, mark); + mutex_lock(&newcon->sock_mutex); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -908,16 +931,18 @@ static int accept_from_sock(struct listen_connection *con) result = dlm_con_init(othercon, nodeid); if (result < 0) { kfree(othercon); + mutex_unlock(&newcon->sock_mutex); goto accept_err; } + lockdep_set_subclass(&othercon->sock_mutex, 1); newcon->othercon = othercon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); } - mutex_lock_nested(&othercon->sock_mutex, 1); + mutex_lock(&othercon->sock_mutex); add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); @@ -930,6 +955,7 @@ static int accept_from_sock(struct listen_connection *con) addcon = newcon; } + set_bit(CF_CONNECTED, &addcon->flags); mutex_unlock(&newcon->sock_mutex); /* @@ -1015,8 +1041,6 @@ static void sctp_connect_to_sock(struct connection *con) struct socket *sock; unsigned int mark; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -1029,7 +1053,7 @@ static void sctp_connect_to_sock(struct connection *con) } memset(&daddr, 0, sizeof(daddr)); - result = nodeid_to_addr(con->nodeid, &daddr, NULL, true); + result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out; @@ -1104,13 +1128,11 @@ out: static void tcp_connect_to_sock(struct connection *con) { struct sockaddr_storage saddr, src_addr; + unsigned int mark; int addr_len; struct socket *sock = NULL; - unsigned int mark; int result; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1125,15 +1147,15 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - sock_set_mark(sock->sk, mark); - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; } + sock_set_mark(sock->sk, mark); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1330,70 +1352,72 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, { struct writequeue_entry *entry; - entry = kmalloc(sizeof(struct writequeue_entry), allocation); + entry = kzalloc(sizeof(*entry), allocation); if (!entry) return NULL; - entry->page = alloc_page(allocation); + entry->page = alloc_page(allocation | __GFP_ZERO); if (!entry->page) { kfree(entry); return NULL; } - entry->offset = 0; - entry->len = 0; - entry->end = 0; - entry->users = 0; entry->con = con; + entry->users = 1; return entry; } -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct writequeue_entry *new_wq_entry(struct connection *con, int len, + gfp_t allocation, char **ppc) { - struct connection *con; struct writequeue_entry *e; - int offset = 0; - if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) { - BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN); - log_print("failed to allocate a buffer of size %d", len); - return NULL; + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_last_entry(&con->writequeue, struct writequeue_entry, list); + if (DLM_WQ_REMAIN_BYTES(e) >= len) { + *ppc = page_address(e->page) + e->end; + e->end += len; + e->users++; + spin_unlock(&con->writequeue_lock); + + return e; + } } + spin_unlock(&con->writequeue_lock); - con = nodeid2con(nodeid, allocation); - if (!con) + e = new_writequeue_entry(con, allocation); + if (!e) return NULL; + *ppc = page_address(e->page); + e->end += len; + spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.prev, struct writequeue_entry, list); - if ((&e->list == &con->writequeue) || - (PAGE_SIZE - e->end < len)) { - e = NULL; - } else { - offset = e->end; - e->end += len; - e->users++; - } + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); - if (e) { - got_one: - *ppc = page_address(e->page) + offset; - return e; - } + return e; +}; - e = new_writequeue_entry(con, allocation); - if (e) { - spin_lock(&con->writequeue_lock); - offset = e->end; - e->end += len; - e->users++; - list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); - goto got_one; +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + + if (len > DEFAULT_BUFFER_SIZE || + len < sizeof(struct dlm_header)) { + BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + log_print("failed to allocate a buffer of size %d", len); + WARN_ON(1); + return NULL; } - return NULL; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + return new_wq_entry(con, len, allocation, ppc); } void dlm_lowcomms_commit_buffer(void *mh) @@ -1406,7 +1430,8 @@ void dlm_lowcomms_commit_buffer(void *mh) users = --e->users; if (users) goto out; - e->len = e->end - e->offset; + + e->len = DLM_WQ_LENGTH_BYTES(e); spin_unlock(&con->writequeue_lock); queue_work(send_workqueue, &con->swork); @@ -1432,11 +1457,10 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - if ((struct list_head *) e == &con->writequeue) + if (list_empty(&con->writequeue)) break; + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); @@ -1589,6 +1613,29 @@ static int work_start(void) return 0; } +static void shutdown_conn(struct connection *con) +{ + if (con->shutdown_action) + con->shutdown_action(con); +} + +void dlm_lowcomms_shutdown(void) +{ + /* Set all the flags to prevent any + * socket activity. + */ + dlm_allow_conn = 0; + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + + dlm_close_sock(&listen_con.sock); + + foreach_conn(shutdown_conn); +} + static void _stop_conn(struct connection *con, bool and_other) { mutex_lock(&con->sock_mutex); @@ -1610,12 +1657,6 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } -static void shutdown_conn(struct connection *con) -{ - if (con->shutdown_action) - con->shutdown_action(con); -} - static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); @@ -1672,19 +1713,6 @@ static void work_flush(void) void dlm_lowcomms_stop(void) { - /* Set all the flags to prevent any - socket activity. - */ - dlm_allow_conn = 0; - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - - dlm_close_sock(&listen_con.sock); - - foreach_conn(shutdown_conn); work_flush(); foreach_conn(free_conn); work_stop(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 0918f9376489..48bbc4e18761 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -14,13 +14,18 @@ #define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +/* switch to check if dlm is running */ +extern int dlm_allow_conn; + int dlm_lowcomms_start(void); +void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_lowcomms_commit_buffer(void *mh); int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fde3a6afe4be..1c6654a21ec4 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,8 +22,6 @@ * into packets and sends them to the comms layer. */ -#include <asm/unaligned.h> - #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" @@ -45,13 +43,22 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than this otherwise we - * cannot deliver this message to upper layers + /* no message should be more than DEFAULT_BUFFER_SIZE or + * less than dlm_header size. + * + * Some messages does not have a 8 byte length boundary yet + * which can occur in a unaligned memory access of some dlm + * messages. However this problem need to be fixed at the + * sending side, for now it seems nobody run into architecture + * related issues yet but it slows down some processing. + * Fixing this issue should be scheduled in future by doing + * the next major version bump. */ - msglen = get_unaligned_le16(&hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE) { - log_print("received invalid length header: %u, will abort message parsing", - msglen); + msglen = le16_to_cpu(hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE || + msglen < sizeof(struct dlm_header)) { + log_print("received invalid length header: %u from node %d, will abort message parsing", + msglen, nodeid); return -EBADMSG; } @@ -84,15 +91,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) goto skip; } - /* for aligned memory access, we just copy current message - * to begin of the buffer which contains already parsed buffer - * data and should provide align access for upper layers - * because the start address of the buffer has a aligned - * address. This memmove can be removed when the upperlayer - * is capable of unaligned memory access. - */ - memmove(buf, ptr, msglen); - dlm_receive_buffer((union dlm_packet *)buf, nodeid); + dlm_receive_buffer((union dlm_packet *)ptr, nodeid); skip: ret += msglen; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 73ddee5159d7..f5b1bd65728d 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -41,7 +41,6 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, to_nodeid, type, len); return -ENOBUFS; } - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; @@ -462,7 +461,6 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e6ac78c62ca4..495fb4514d09 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -262,10 +262,7 @@ struct ecryptfs_inode_info { * vfsmount too. */ struct ecryptfs_dentry_info { struct path lower_path; - union { - struct ecryptfs_crypt_stat *crypt_stat; - struct rcu_head rcu; - }; + struct rcu_head rcu; }; /** @@ -496,12 +493,6 @@ ecryptfs_set_superblock_lower(struct super_block *sb, ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; } -static inline struct ecryptfs_dentry_info * -ecryptfs_dentry_to_private(struct dentry *dentry) -{ - return (struct ecryptfs_dentry_info *)dentry->d_fsdata; -} - static inline void ecryptfs_set_dentry_private(struct dentry *dentry, struct ecryptfs_dentry_info *dentry_info) @@ -515,12 +506,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } -static inline struct vfsmount * -ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) -{ - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; -} - static inline struct path * ecryptfs_dentry_to_lower_path(struct dentry *dentry) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 18e9285fbb4c..0a1ab1db1450 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -18,22 +18,22 @@ #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/fileattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" -static struct dentry *lock_parent(struct dentry *dentry) +static int lock_parent(struct dentry *dentry, + struct dentry **lower_dentry, + struct inode **lower_dir) { - struct dentry *dir; + struct dentry *lower_dir_dentry; - dir = dget_parent(dentry); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - return dir; -} + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + *lower_dir = d_inode(lower_dir_dentry); + *lower_dentry = ecryptfs_dentry_to_lower(dentry); -static void unlock_dir(struct dentry *dir) -{ - inode_unlock(d_inode(dir)); - dput(dir); + inode_lock_nested(*lower_dir, I_MUTEX_PARENT); + return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL; } static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) @@ -127,32 +127,29 @@ static int ecryptfs_interpose(struct dentry *lower_dentry, static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct dentry *lower_dir_dentry; - struct inode *lower_dir_inode; + struct dentry *lower_dentry; + struct inode *lower_dir; int rc; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - lower_dir_inode = d_inode(lower_dir_dentry); - inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); dget(lower_dentry); // don't even try to make the lower negative - if (lower_dentry->d_parent != lower_dir_dentry) - rc = -EINVAL; - else if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry, - NULL); + if (!rc) { + if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry, + NULL); + } if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; } - fsstack_copy_attr_times(dir, lower_dir_inode); + fsstack_copy_attr_times(dir, lower_dir); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode->i_ctime = dir->i_ctime; out_unlock: dput(lower_dentry); - inode_unlock(lower_dir_inode); + inode_unlock(lower_dir); if (!rc) d_drop(dentry); return rc; @@ -176,13 +173,13 @@ ecryptfs_do_create(struct inode *directory_inode, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; struct inode *inode; - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode, true); + rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_create(&init_user_ns, lower_dir, + lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -192,14 +189,13 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry), - lower_dentry, NULL); + vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL); goto out_lock; } - fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(directory_inode, lower_dir); + fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); return inode; } @@ -430,32 +426,28 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, { struct dentry *lower_old_dentry; struct dentry *lower_new_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; u64 file_size_save; int rc; file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); - lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); - dget(lower_old_dentry); - dget(lower_new_dentry); - lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, &init_user_ns, - d_inode(lower_dir_dentry), lower_new_dentry, NULL); + rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); + if (!rc) + rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir, + lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); set_nlink(d_inode(old_dentry), ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - unlock_dir(lower_dir_dentry); - dput(lower_new_dentry); - dput(lower_old_dentry); + inode_unlock(lower_dir); return rc; } @@ -470,14 +462,14 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; char *encoded_symname; size_t encoded_symlen; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (rc) + goto out_lock; mount_crypt_stat = &ecryptfs_superblock_to_private( dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, @@ -486,7 +478,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -494,11 +486,10 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); out_lock: - unlock_dir(lower_dir_dentry); - dput(lower_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -509,22 +500,22 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_mkdir(&init_user_ns, lower_dir, + lower_dentry, mode); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); - set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); + set_nlink(dir, lower_dir->i_nlink); out: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -533,29 +524,24 @@ out: static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; - struct inode *lower_dir_inode; + struct inode *lower_dir; int rc; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - lower_dir_inode = d_inode(lower_dir_dentry); - - inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); dget(lower_dentry); // don't even try to make the lower negative - if (lower_dentry->d_parent != lower_dir_dentry) - rc = -EINVAL; - else if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry); + if (!rc) { + if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry); + } if (!rc) { clear_nlink(d_inode(dentry)); - fsstack_copy_attr_times(dir, lower_dir_inode); - set_nlink(dir, lower_dir_inode->i_nlink); + fsstack_copy_attr_times(dir, lower_dir); + set_nlink(dir, lower_dir->i_nlink); } dput(lower_dentry); - inode_unlock(lower_dir_inode); + inode_unlock(lower_dir); if (!rc) d_drop(dentry); return rc; @@ -567,21 +553,21 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode, dev); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_mknod(&init_user_ns, lower_dir, + lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); out: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -1118,6 +1104,23 @@ out: return rc; } +static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); +} + +static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + int rc; + + rc = vfs_fileattr_set(&init_user_ns, lower_dentry, fa); + fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); + + return rc; +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1139,6 +1142,8 @@ const struct inode_operations ecryptfs_dir_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .listxattr = ecryptfs_listxattr, + .fileattr_get = ecryptfs_fileattr_get, + .fileattr_set = ecryptfs_fileattr_set, }; const struct inode_operations ecryptfs_main_iops = { @@ -1146,6 +1151,8 @@ const struct inode_operations ecryptfs_main_iops = { .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr, .listxattr = ecryptfs_listxattr, + .fileattr_get = ecryptfs_fileattr_get, + .fileattr_set = ecryptfs_fileattr_set, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index e6bc0302643b..d57ee15874f9 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -106,86 +106,9 @@ out_free: return size; } -static inline unsigned int efivarfs_getflags(struct inode *inode) -{ - unsigned int i_flags; - unsigned int flags = 0; - - i_flags = inode->i_flags; - if (i_flags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - return flags; -} - -static int -efivarfs_ioc_getxflags(struct file *file, void __user *arg) -{ - struct inode *inode = file->f_mapping->host; - unsigned int flags = efivarfs_getflags(inode); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - -static int -efivarfs_ioc_setxflags(struct file *file, void __user *arg) -{ - struct inode *inode = file->f_mapping->host; - unsigned int flags; - unsigned int i_flags = 0; - unsigned int oldflags = efivarfs_getflags(inode); - int error; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - if (flags & ~FS_IMMUTABLE_FL) - return -EOPNOTSUPP; - - if (flags & FS_IMMUTABLE_FL) - i_flags |= S_IMMUTABLE; - - - error = mnt_want_write_file(file); - if (error) - return error; - - inode_lock(inode); - - error = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (error) - goto out; - - inode_set_flags(inode, i_flags, S_IMMUTABLE); -out: - inode_unlock(inode); - mnt_drop_write_file(file); - return error; -} - -static long -efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) -{ - void __user *arg = (void __user *)p; - - switch (cmd) { - case FS_IOC_GETFLAGS: - return efivarfs_ioc_getxflags(file, arg); - case FS_IOC_SETFLAGS: - return efivarfs_ioc_setxflags(file, arg); - } - - return -ENOTTY; -} - const struct file_operations efivarfs_file_operations = { .open = simple_open, .read = efivarfs_file_read, .write = efivarfs_file_write, .llseek = no_llseek, - .unlocked_ioctl = efivarfs_file_ioctl, }; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 14e2947975fd..939e5e242b98 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -10,9 +10,12 @@ #include <linux/kmemleak.h> #include <linux/slab.h> #include <linux/uuid.h> +#include <linux/fileattr.h> #include "internal.h" +static const struct inode_operations efivarfs_file_inode_operations; + struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable) @@ -26,6 +29,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: + inode->i_op = &efivarfs_file_inode_operations; inode->i_fop = &efivarfs_file_operations; break; case S_IFDIR: @@ -138,3 +142,43 @@ const struct inode_operations efivarfs_dir_inode_operations = { .unlink = efivarfs_unlink, .create = efivarfs_create, }; + +static int +efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + unsigned int i_flags; + unsigned int flags = 0; + + i_flags = d_inode(dentry)->i_flags; + if (i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + + fileattr_fill_flags(fa, flags); + + return 0; +} + +static int +efivarfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + unsigned int i_flags = 0; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + if (fa->flags & ~FS_IMMUTABLE_FL) + return -EOPNOTSUPP; + + if (fa->flags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + + inode_set_flags(d_inode(dentry), i_flags, S_IMMUTABLE); + + return 0; +} + +static const struct inode_operations efivarfs_file_inode_operations = { + .fileattr_get = efivarfs_fileattr_get, + .fileattr_set = efivarfs_fileattr_set, +}; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 74b0aaa7114c..858b3339f381 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -76,17 +76,3 @@ config EROFS_FS_ZIP If you don't want to enable compression feature, say N. -config EROFS_FS_CLUSTER_PAGE_LIMIT - int "EROFS Cluster Pages Hard Limit" - depends on EROFS_FS_ZIP - range 1 256 - default "1" - help - Indicates maximum # of pages of a compressed - physical cluster. - - For example, if files in a image were compressed - into 8k-unit, hard limit should not be configured - less than 2. Otherwise, the image will be refused - to mount on this kernel. - diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index af159539fc1b..1f9aced49070 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1249e74b3bf0..ebac756cb2a3 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -109,21 +109,6 @@ err_out: return err; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) -{ - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { - int err = z_erofs_map_blocks_iter(inode, map, flags); - - if (map->mpage) { - put_page(map->mpage); - map->mpage = NULL; - } - return err; - } - return erofs_map_blocks_flatmode(inode, map, flags); -} - static inline struct bio *erofs_read_raw_page(struct bio *bio, struct address_space *mapping, struct page *page, @@ -159,7 +144,7 @@ submit_bio_retry: erofs_blk_t blknr; unsigned int blkoff; - err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); if (err) goto err_out; @@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return 0; } - if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) return erofs_blknr(map.m_pa); return 0; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1cb1ffd10569..88e33addf229 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -28,6 +28,42 @@ struct z_erofs_decompressor { char *name; }; +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int size) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + u16 distance; + + if (lz4) { + if (size < sizeof(struct z_erofs_lz4_cfgs)) { + erofs_err(sb, "invalid lz4 cfgs, size=%u", size); + return -EINVAL; + } + distance = le16_to_cpu(lz4->max_distance); + + sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); + if (!sbi->lz4.max_pclusterblks) { + sbi->lz4.max_pclusterblks = 1; /* reserved case */ + } else if (sbi->lz4.max_pclusterblks > + Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_err(sb, "too large lz4 pclusterblks %u", + sbi->lz4.max_pclusterblks); + return -EINVAL; + } else if (sbi->lz4.max_pclusterblks >= 2) { + erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!"); + } + } else { + distance = le16_to_cpu(dsb->u1.lz4_max_distance); + sbi->lz4.max_pclusterblks = 1; + } + + sbi->lz4.max_distance_pages = distance ? + DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : + LZ4_MAX_DISTANCE_PAGES; + return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); +} + static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { @@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; + unsigned int lz4_max_distance_pages = + EROFS_SB(rq->sb)->lz4.max_distance_pages; void *kaddr = NULL; unsigned int i, j, top; @@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *const page = rq->out[i]; struct page *victim; - if (j >= LZ4_MAX_DISTANCE_PAGES) + if (j >= lz4_max_distance_pages) j = 0; /* 'valid' bounced can only be tested after a complete round */ if (test_bit(j, bounced)) { - DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); - DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); - availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + DBG_BUGON(i < lz4_max_distance_pages); + DBG_BUGON(top >= lz4_max_distance_pages); + availables[top++] = rq->out[i - lz4_max_distance_pages]; } if (page) { @@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, GFP_KERNEL); - if (!victim) - return -ENOMEM; + victim = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, - u8 *src, unsigned int pageofs_in) +static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, + void *inpage, unsigned int *inputmargin, int *maptype, + bool support_0padding) { - /* - * if in-place decompression is ongoing, those decompressed - * pages should be copied in order to avoid being overlapped. - */ - struct page **in = rq->in; - u8 *const tmp = erofs_get_pcpubuf(0); - u8 *tmpp = tmp; - unsigned int inlen = rq->inputsize - pageofs_in; - unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); - - while (tmpp < tmp + inlen) { - if (!src) - src = kmap_atomic(*in); - memcpy(tmpp, src + pageofs_in, count); - kunmap_atomic(src); - src = NULL; - tmpp += count; - pageofs_in = 0; - count = PAGE_SIZE; + unsigned int nrpages_in, nrpages_out; + unsigned int ofull, oend, inputsize, total, i, j; + struct page **in; + void *src, *tmp; + + inputsize = rq->inputsize; + nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT; + oend = rq->pageofs_out + rq->outputsize; + ofull = PAGE_ALIGN(oend); + nrpages_out = ofull >> PAGE_SHIFT; + + if (rq->inplace_io) { + if (rq->partial_decoding || !support_0padding || + ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize)) + goto docopy; + + for (i = 0; i < nrpages_in; ++i) { + DBG_BUGON(rq->in[i] == NULL); + for (j = 0; j < nrpages_out - nrpages_in + i; ++j) + if (rq->out[j] == rq->in[i]) + goto docopy; + } + } + + if (nrpages_in <= 1) { + *maptype = 0; + return inpage; + } + kunmap_atomic(inpage); + might_sleep(); + src = erofs_vm_map_ram(rq->in, nrpages_in); + if (!src) + return ERR_PTR(-ENOMEM); + *maptype = 1; + return src; + +docopy: + /* Or copy compressed data which can be overlapped to per-CPU buffer */ + in = rq->in; + src = erofs_get_pcpubuf(nrpages_in); + if (!src) { + DBG_BUGON(1); + kunmap_atomic(inpage); + return ERR_PTR(-EFAULT); + } + + tmp = src; + total = rq->inputsize; + while (total) { + unsigned int page_copycnt = + min_t(unsigned int, total, PAGE_SIZE - *inputmargin); + + if (!inpage) + inpage = kmap_atomic(*in); + memcpy(tmp, inpage + *inputmargin, page_copycnt); + kunmap_atomic(inpage); + inpage = NULL; + tmp += page_copycnt; + total -= page_copycnt; ++in; + *inputmargin = 0; } - return tmp; + *maptype = 2; + return src; } static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) { - unsigned int inputmargin, inlen; - u8 *src; - bool copied, support_0padding; - int ret; - - if (rq->inputsize > PAGE_SIZE) - return -EOPNOTSUPP; + unsigned int inputmargin; + u8 *headpage, *src; + bool support_0padding; + int ret, maptype; - src = kmap_atomic(*rq->in); + DBG_BUGON(*rq->in == NULL); + headpage = kmap_atomic(*rq->in); inputmargin = 0; support_0padding = false; /* decompression inplace is only safe when 0padding is enabled */ - if (EROFS_SB(rq->sb)->feature_incompat & - EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { support_0padding = true; - while (!src[inputmargin & ~PAGE_MASK]) + while (!headpage[inputmargin & ~PAGE_MASK]) if (!(++inputmargin & ~PAGE_MASK)) break; if (inputmargin >= rq->inputsize) { - kunmap_atomic(src); + kunmap_atomic(headpage); return -EIO; } } - copied = false; - inlen = rq->inputsize - inputmargin; - if (rq->inplace_io) { - const uint oend = (rq->pageofs_out + - rq->outputsize) & ~PAGE_MASK; - const uint nr = PAGE_ALIGN(rq->pageofs_out + - rq->outputsize) >> PAGE_SHIFT; - - if (rq->partial_decoding || !support_0padding || - rq->out[nr - 1] != rq->in[0] || - rq->inputsize - oend < - LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { - src = generic_copy_inplace_data(rq, src, inputmargin); - inputmargin = 0; - copied = true; - } - } + rq->inputsize -= inputmargin; + src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, + support_0padding); + if (IS_ERR(src)) + return PTR_ERR(src); /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, - inlen, rq->outputsize, - rq->outputsize); + rq->inputsize, rq->outputsize, rq->outputsize); else ret = LZ4_decompress_safe(src + inputmargin, out, - inlen, rq->outputsize); + rq->inputsize, rq->outputsize); if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", - ret, inlen, inputmargin, rq->outputsize); + ret, rq->inputsize, inputmargin, rq->outputsize); WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, - 16, 1, src + inputmargin, inlen, true); + 16, 1, src + inputmargin, rq->inputsize, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, 16, 1, out, rq->outputsize, true); @@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) ret = -EIO; } - if (copied) - erofs_put_pcpubuf(src); - else + if (maptype == 0) { kunmap_atomic(src); + } else if (maptype == 1) { + vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT); + } else if (maptype == 2) { + erofs_put_pcpubuf(src); + } else { + DBG_BUGON(1); + return -EFAULT; + } return ret; } @@ -234,57 +304,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; - int ret, i; + int ret; - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } + /* two optimized fast paths only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE) { + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(0); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; + /* + * For the case of small output size (especially much less + * than PAGE_SIZE), memcpy the decompressed data rather than + * compressed data is preferred. + */ + if (rq->outputsize <= PAGE_SIZE * 7 / 8) { + dst = erofs_get_pcpubuf(1); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + rq->inplace_io = false; + ret = alg->decompress(rq, dst); + if (!ret) + copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, + rq->outputsize); + + erofs_put_pcpubuf(dst); + return ret; + } } + /* general decoding path which can be used for all cases */ ret = alg->prepare_destpages(rq, pagepool); - if (ret < 0) { + if (ret < 0) return ret; - } else if (ret) { + if (ret) { dst = page_address(*rq->out); dst_maptype = 1; goto dstmap_out; } - i = 0; - while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1); - - /* retry two more times (totally 3 times) */ - if (dst || ++i >= 3) - break; - vm_unmap_aliases(); - } - + dst = erofs_vm_map_ram(rq->out, nrpages_out); if (!dst) return -ENOMEM; - dst_maptype = 2; dstmap_out: diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 9ad1615f4474..8739d3adf51f 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -18,15 +18,22 @@ * be incompatible with this kernel version. */ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 -#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING +#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 +#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_ALL_FEATURE_INCOMPAT \ + (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ + EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER) -/* 128-byte erofs on-disk super block */ +#define EROFS_SB_EXTSLOT_SIZE 16 + +/* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c(super_block) */ __le32 feature_compat; __u8 blkszbits; /* support block_size == PAGE_SIZE only */ - __u8 reserved; + __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __le16 root_nid; /* nid of root directory */ __le64 inos; /* total valid ino # (== f_files - f_favail) */ @@ -39,7 +46,13 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; + union { + /* bitmap for available compression algorithms */ + __le16 available_compr_algs; + /* customized sliding window size instead of 64k by default */ + __le16 lz4_max_distance; + } __packed u1; + __u8 reserved2[42]; }; /* @@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_VERSION_BIT 0 #define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_ALL \ + ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* maximum supported size of a physical compression cluster */ +#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_MAX }; +#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) + +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lz4_cfgs { + __le16 max_distance; + __le16 max_pclusterblks; + u8 reserved[10]; +} __packed; /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; * (4B) + 2B + (4B) if compacted 2B is on. + * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) + * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) */ -#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 - -#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) +#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 struct z_erofs_map_header { __le32 h_reserved1; @@ -214,9 +243,7 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-4 : (physical - logical) cluster bits of head 1: - * For example, if logical clustersize = 4096, 1 for 8192. - * bit 5-7 : (physical - logical) cluster bits of head 2. + * bit 3-7 : reserved. */ __u8 h_clusterbits; }; @@ -259,6 +286,13 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* + * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the + * compressed block count of a compressed extent (in logical clusters, aka. + * block count of a pcluster). + */ +#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) + struct z_erofs_vle_decompressed_index { __le16 di_advise; /* where to decompress in the head cluster */ diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 119fdce1b520..7ed2d7391692 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode, dic = page_address(page) + *ofs; ifmt = le16_to_cpu(dic->i_format); + if (ifmt & ~EROFS_I_ALL) { + erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + ifmt, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 351dae524a0c..f92e3e32b9f4 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -50,6 +50,8 @@ struct erofs_fs_context { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; + /* strategy of sync decompression (false - auto, true - force on) */ + bool readahead_sync_decompress; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -57,6 +59,14 @@ struct erofs_fs_context { unsigned int mount_opt; }; +/* all filesystem-wide lz4 configurations */ +struct erofs_sb_lz4_info { + /* # of pages needed for EROFS lz4 rolling decompression */ + u16 max_distance_pages; + /* maximum possible blocks for pclusters in the filesystem */ + u16 max_pclusterblks; +}; + struct erofs_sb_info { #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -67,9 +77,12 @@ struct erofs_sb_info { struct xarray managed_pslots; unsigned int shrinker_run_no; + u16 available_compr_algs; /* pseudo inode to manage cached pages */ struct inode *managed_cache; + + struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ u32 blocks; u32 meta_blkaddr; @@ -80,6 +93,7 @@ struct erofs_sb_info { /* inode slot unit size in bit shift */ unsigned char islotbits; + u32 sb_size; /* total superblock size */ u32 build_time_nsec; u64 build_time; @@ -182,12 +196,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return v; } #endif /* !CONFIG_SMP */ - -/* hard limit of pages per compressed cluster */ -#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) -#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES -#else -#define EROFS_PCPUBUF_NR_PAGES 0 #endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ @@ -216,6 +224,17 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); } +#define EROFS_FEATURE_FUNCS(name, compat, feature) \ +static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ +{ \ + return sbi->feature_##compat & EROFS_FEATURE_##feature; \ +} + +EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) +EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) + /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 #define EROFS_I_Z_INITED_BIT 1 @@ -244,7 +263,6 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; - unsigned char z_physical_clusterbits[2]; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -287,7 +305,7 @@ extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; /* - * Logical to physical block mapping, used by erofs_map_blocks() + * Logical to physical block mapping * * Different with other file systems, it is used for 2 access modes: * @@ -334,7 +352,7 @@ struct erofs_map_blocks { struct page *mpage; }; -/* Flags used by erofs_map_blocks() */ +/* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 /* zmap.c */ @@ -356,8 +374,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, /* data.c */ struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); -int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); - /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) { @@ -386,23 +402,30 @@ int erofs_namei(struct inode *dir, struct qstr *name, /* dir.c */ extern const struct file_operations erofs_dir_fops; -/* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); - -#if (EROFS_PCPUBUF_NR_PAGES > 0) -void *erofs_get_pcpubuf(unsigned int pagenr); -#define erofs_put_pcpubuf(buf) do { \ - (void)&(buf); \ - preempt_enable(); \ -} while (0) -#else -static inline void *erofs_get_pcpubuf(unsigned int pagenr) +static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { - return ERR_PTR(-EOPNOTSUPP); + int retried = 0; + + while (1) { + void *p = vm_map_ram(pages, count, -1); + + /* retry two more times (totally 3 times) */ + if (p || ++retried >= 3) + return p; + vm_unmap_aliases(); + } + return NULL; } -#define erofs_put_pcpubuf(buf) do {} while (0) -#endif +/* pcpubuf.c */ +void *erofs_get_pcpubuf(unsigned int requiredpages); +void erofs_put_pcpubuf(void *ptr); +int erofs_pcpubuf_growsize(unsigned int nrpages); +void erofs_pcpubuf_init(void); +void erofs_pcpubuf_exit(void); + +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -421,6 +444,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page); +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -428,6 +454,16 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} +static inline int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len) +{ + if (lz4 || dsb->u1.lz4_max_distance) { + erofs_err(sb, "lz4 algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} #endif /* !CONFIG_EROFS_FS_ZIP */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c new file mode 100644 index 000000000000..6c885575128a --- /dev/null +++ b/fs/erofs/pcpubuf.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Gao Xiang <xiang@kernel.org> + * + * For low-latency decompression algorithms (e.g. lz4), reserve consecutive + * per-CPU virtual memory (in pages) in advance to store such inplace I/O + * data if inplace decompression is failed (due to unmet inplace margin for + * example). + */ +#include "internal.h" + +struct erofs_pcpubuf { + raw_spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); + +void *erofs_get_pcpubuf(unsigned int requiredpages) + __acquires(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); + + raw_spin_lock(&pcb->lock); + /* check if the per-CPU buffer is too small */ + if (requiredpages > pcb->nrpages) { + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); + /* (for sparse checker) pretend pcb->lock is still taken */ + __acquire(pcb->lock); + return NULL; + } + return pcb->ptr; +} + +void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); + + DBG_BUGON(pcb->ptr != ptr); + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); +} + +/* the next step: support per-CPU page buffers hotplug */ +int erofs_pcpubuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(pcb_resize_mutex); + static unsigned int pcb_nrpages; + LIST_HEAD(pagepool); + int delta, cpu, ret, i; + + mutex_lock(&pcb_resize_mutex); + delta = nrpages - pcb_nrpages; + ret = 0; + /* avoid shrinking pcpubuf, since no idea how many fses rely on */ + if (delta <= 0) + goto out; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + struct page **pages, **oldpages; + void *ptr, *old_ptr; + + pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + break; + } + + for (i = 0; i < nrpages; ++i) { + pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); + if (!pages[i]) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + } + ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + raw_spin_lock(&pcb->lock); + old_ptr = pcb->ptr; + pcb->ptr = ptr; + oldpages = pcb->pages; + pcb->pages = pages; + i = pcb->nrpages; + pcb->nrpages = nrpages; + raw_spin_unlock(&pcb->lock); + + if (!oldpages) { + DBG_BUGON(old_ptr); + continue; + } + + if (old_ptr) + vunmap(old_ptr); +free_pagearray: + while (i) + list_add(&oldpages[--i]->lru, &pagepool); + kfree(oldpages); + if (ret) + break; + } + pcb_nrpages = nrpages; + put_pages_list(&pagepool); +out: + mutex_unlock(&pcb_resize_mutex); + return ret; +} + +void erofs_pcpubuf_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + raw_spin_lock_init(&pcb->lock); + } +} + +void erofs_pcpubuf_exit(void) +{ + int cpu, i; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + if (pcb->ptr) { + vunmap(pcb->ptr); + pcb->ptr = NULL; + } + if (!pcb->pages) + continue; + + for (i = 0; i < pcb->nrpages; ++i) + if (pcb->pages[i]) + put_page(pcb->pages[i]); + kfree(pcb->pages); + pcb->pages = NULL; + } +} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index d5a6b9b888a5..bbf3bbd908e0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -122,6 +122,136 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } +#ifdef CONFIG_EROFS_FS_ZIP +/* read variable-sized metadata, offset will be aligned by 4-byte */ +static void *erofs_read_metadata(struct super_block *sb, struct page **pagep, + erofs_off_t *offset, int *lengthp) +{ + struct page *page = *pagep; + u8 *buffer, *ptr; + int len, i, cnt; + erofs_blk_t blk; + + *offset = round_up(*offset, 4); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) + goto err_nullpage; + } + + ptr = kmap(page); + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + if (!len) + len = U16_MAX + 1; + buffer = kmalloc(len, GFP_KERNEL); + if (!buffer) { + buffer = ERR_PTR(-ENOMEM); + goto out; + } + *offset += sizeof(__le16); + *lengthp = len; + + for (i = 0; i < len; i += cnt) { + cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + kfree(buffer); + goto err_nullpage; + } + ptr = kmap(page); + } + memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + *offset += cnt; + } +out: + kunmap(page); + *pagep = page; + return buffer; +err_nullpage: + *pagep = NULL; + return page; +} + +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi; + struct page *page; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret; + + sbi = EROFS_SB(sb); + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EINVAL; + } + + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + page = NULL; + alg = 0; + ret = 0; + + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &page, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto err; + } + + switch (alg) { + case Z_EROFS_COMPRESSION_LZ4: + ret = z_erofs_load_lz4_config(sb, dsb, data, size); + break; + default: + DBG_BUGON(1); + ret = -EFAULT; + } + kfree(data); + if (ret) + goto err; + } +err: + if (page) { + unlock_page(page); + put_page(page); + } + return ret; +} +#else +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + if (dsb->u1.available_compr_algs) { + erofs_err(sb, "try to load compressed fs when compression is disabled"); + return -EINVAL; + } + return 0; +} +#endif + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -149,7 +279,7 @@ static int erofs_read_superblock(struct super_block *sb) } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); - if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; @@ -166,6 +296,12 @@ static int erofs_read_superblock(struct super_block *sb) if (!check_layout_compatibility(sb, dsb)) goto out; + sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; + if (sbi->sb_size > EROFS_BLKSIZ) { + erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", + sbi->sb_size); + goto out; + } sbi->blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR @@ -187,7 +323,12 @@ static int erofs_read_superblock(struct super_block *sb) ret = -EFSCORRUPTED; goto out; } - ret = 0; + + /* parse on-disk compression configurations */ + if (erofs_sb_has_compr_cfgs(sbi)) + ret = erofs_load_compr_cfgs(sb, dsb); + else + ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); out: kunmap(page); put_page(page); @@ -200,6 +341,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) #ifdef CONFIG_EROFS_FS_ZIP ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; ctx->max_sync_decompress_pages = 3; + ctx->readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(ctx, XATTR_USER); @@ -513,6 +655,7 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; @@ -542,6 +685,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes are safe before cache is destroyed. */ rcu_barrier(); kmem_cache_destroy(erofs_inode_cachep); + erofs_pcpubuf_exit(); } /* get filesystem statistics */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index de9986d2f82f..6758c5b19f7c 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) return page; } -#if (EROFS_PCPUBUF_NR_PAGES > 0) -static struct { - u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; -} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; - -void *erofs_get_pcpubuf(unsigned int pagenr) -{ - preempt_disable(); - return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; -} -#endif - #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3851e1a64f73..78e4b598ecca 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -11,6 +11,93 @@ #include <trace/events/erofs.h> /* + * since pclustersize is variable for big pcluster feature, introduce slab + * pools implementation for different pcluster sizes. + */ +struct z_erofs_pcluster_slab { + struct kmem_cache *slab; + unsigned int maxpages; + char name[48]; +}; + +#define _PCLP(n) { .maxpages = n } + +static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { + _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) +}; + +static void z_erofs_destroy_pcluster_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + if (!pcluster_pool[i].slab) + continue; + kmem_cache_destroy(pcluster_pool[i].slab); + pcluster_pool[i].slab = NULL; + } +} + +static int z_erofs_create_pcluster_pool(void) +{ + struct z_erofs_pcluster_slab *pcs; + struct z_erofs_pcluster *a; + unsigned int size; + + for (pcs = pcluster_pool; + pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { + size = struct_size(a, compressed_pages, pcs->maxpages); + + sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); + pcs->slab = kmem_cache_create(pcs->name, size, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (pcs->slab) + continue; + + z_erofs_destroy_pcluster_pool(); + return -ENOMEM; + } + return 0; +} + +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + struct z_erofs_pcluster *pcl; + + if (nrpages > pcs->maxpages) + continue; + + pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + pcl->pclusterpages = nrpages; + return pcl; + } + return ERR_PTR(-EINVAL); +} + +static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + + if (pcl->pclusterpages > pcs->maxpages) + continue; + + kmem_cache_free(pcs->slab, pcl); + return; + } + DBG_BUGON(1); +} + +/* * a compressed_pages[] placeholder in order to avoid * being filled with file pages for in-place decompression. */ @@ -37,12 +124,11 @@ typedef tagptr1_t compressed_page_t; tagptr_fold(compressed_page_t, page, 1) static struct workqueue_struct *z_erofs_workqueue __read_mostly; -static struct kmem_cache *pcluster_cachep __read_mostly; void z_erofs_exit_zip_subsystem(void) { destroy_workqueue(z_erofs_workqueue); - kmem_cache_destroy(pcluster_cachep); + z_erofs_destroy_pcluster_pool(); } static inline int z_erofs_init_workqueue(void) @@ -59,32 +145,16 @@ static inline int z_erofs_init_workqueue(void) return z_erofs_workqueue ? 0 : -ENOMEM; } -static void z_erofs_pcluster_init_once(void *ptr) -{ - struct z_erofs_pcluster *pcl = ptr; - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - unsigned int i; - - mutex_init(&cl->lock); - cl->nr_pages = 0; - cl->vcnt = 0; - for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) - pcl->compressed_pages[i] = NULL; -} - int __init z_erofs_init_zip_subsystem(void) { - pcluster_cachep = kmem_cache_create("erofs_compress", - Z_EROFS_WORKGROUP_SIZE, 0, - SLAB_RECLAIM_ACCOUNT, - z_erofs_pcluster_init_once); - if (pcluster_cachep) { - if (!z_erofs_init_workqueue()) - return 0; - - kmem_cache_destroy(pcluster_cachep); - } - return -ENOMEM; + int err = z_erofs_create_pcluster_pool(); + + if (err) + return err; + err = z_erofs_init_workqueue(); + if (err) + z_erofs_destroy_pcluster_pool(); + return err; } enum z_erofs_collectmode { @@ -104,6 +174,12 @@ enum z_erofs_collectmode { * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| */ COLLECT_PRIMARY_HOOKED, + /* + * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * could be dispatched into bypass queue later due to uptodated managed + * pages. All related online pages cannot be reused for inplace I/O (or + * pagevec) since it can be directly decoded without I/O submission. + */ COLLECT_PRIMARY_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and @@ -128,7 +204,8 @@ struct z_erofs_collector { struct z_erofs_pcluster *pcl, *tailpcl; struct z_erofs_collection *cl; - struct page **compressedpages; + /* a pointer used to pick up inplace I/O pages */ + struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; enum z_erofs_collectmode mode; @@ -162,18 +239,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, enum z_erofs_cache_alloctype type, struct list_head *pagepool) { - const struct z_erofs_pcluster *pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - struct page **pages = clt->compressedpages; - pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + struct page **pages; + pgoff_t index; if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; - for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + pages = pcl->compressed_pages; + index = pcl->obj.index; + for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { struct page *page; compressed_page_t t; struct page *newpage = NULL; @@ -186,21 +264,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (page) { t = tag_compressed_page_justfound(page); - } else if (type == DELAYEDALLOC) { - t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); - } else if (type == TRYALLOC) { - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - goto dontalloc; - - set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - } else { /* DONTALLOC */ -dontalloc: - if (standalone) - clt->compressedpages = pages; + } else { + /* I/O is needed, no possible to decompress directly */ standalone = false; - continue; + switch (type) { + case DELAYEDALLOC: + t = tagptr_init(compressed_page_t, + PAGE_UNALLOCATED); + break; + case TRYALLOC: + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, + Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); + break; + default: /* DONTALLOC */ + continue; + } } if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) @@ -214,7 +296,11 @@ dontalloc: } } - if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + /* + * don't do inplace I/O if all compressed pages are available in + * managed cache since it can be moved to the bypass queue instead. + */ + if (standalone) clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } @@ -225,14 +311,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); struct address_space *const mapping = MNGD_MAPPING(sbi); - const unsigned int clusterpages = BIT(pcl->clusterbits); int i; /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. */ - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page = pcl->compressed_pages[i]; if (!page) @@ -257,13 +342,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - const unsigned int clusterpages = BIT(pcl->clusterbits); int ret = 0; /* 0 - busy */ if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { unsigned int i; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_pages[i] == page) { WRITE_ONCE(pcl->compressed_pages[i], NULL); ret = 1; @@ -279,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, - struct page *page) +static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, + struct page *page) { struct z_erofs_pcluster *const pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - while (clt->compressedpages < pcl->compressed_pages + clusterpages) { - if (!cmpxchg(clt->compressedpages++, NULL, page)) + while (clt->icpage_ptr > pcl->compressed_pages) + if (!cmpxchg(--clt->icpage_ptr, NULL, page)) return true; - } return false; } @@ -399,10 +481,10 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; - /* no available workgroup, let's allocate one */ - pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); - if (!pcl) - return -ENOMEM; + /* no available pcluster, let's allocate one */ + pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); + if (IS_ERR(pcl)) + return PTR_ERR(pcl); atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -416,25 +498,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, else pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; - pcl->clusterbits -= PAGE_SHIFT; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; cl = z_erofs_primarycollection(pcl); - - /* must be cleaned before freeing to slab */ - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); - cl->pageofs = map->m_la & ~PAGE_MASK; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ + mutex_init(&cl->lock); DBG_BUGON(!mutex_trylock(&cl->lock)); grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); @@ -458,7 +533,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, err_out: mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); + z_erofs_free_pcluster(pcl); return err; } @@ -502,9 +577,8 @@ out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); - clt->compressedpages = clt->pcl->compressed_pages; - if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ - clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + /* since file-backed online pages are traversed in reverse order */ + clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages; return 0; } @@ -517,9 +591,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head) struct z_erofs_collection *const cl = container_of(head, struct z_erofs_collection, rcu); - kmem_cache_free(pcluster_cachep, - container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, + primary_collection)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) @@ -706,9 +779,12 @@ err_out: goto out; } +static void z_erofs_decompressqueue_work(struct work_struct *work); static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, bool sync, int bios) { + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + /* wake up the caller thread for sync decompression */ if (sync) { unsigned long flags; @@ -720,8 +796,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, return; } - if (!atomic_add_return(bios, &io->pending_bios)) + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); + sbi->ctx.readahead_sync_decompress = true; + return; + } + z_erofs_decompressqueue_work(&io->u.work); } static bool z_erofs_page_is_invalidated(struct page *page) @@ -761,9 +844,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, struct list_head *pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - const unsigned int clusterpages = BIT(pcl->clusterbits); struct z_erofs_pagevec_ctor ctor; - unsigned int i, outputsize, llen, nr_pages; + unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, **compressed_pages, *page; @@ -843,7 +925,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = false; compressed_pages = pcl->compressed_pages; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { unsigned int pagenr; page = compressed_pages[i]; @@ -896,12 +978,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, partial = true; } + inputsize = pcl->pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { .sb = sb, .in = compressed_pages, .out = pages, .pageofs_out = cl->pageofs, - .inputsize = PAGE_SIZE, + .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -909,8 +992,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, }, pagepool); out: - /* must handle all compressed pages before endding pages */ - for (i = 0; i < clusterpages; ++i) { + /* must handle all compressed pages before ending pages */ + for (i = 0; i < pcl->pclusterpages; ++i) { page = compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) @@ -1213,7 +1296,7 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); cur = pcl->obj.index; - end = cur + BIT(pcl->clusterbits); + end = cur + pcl->pclusterpages; /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, @@ -1333,7 +1416,8 @@ static void z_erofs_readahead(struct readahead_control *rac) struct erofs_sb_info *const sbi = EROFS_I_SB(inode); unsigned int nr_pages = readahead_count(rac); - bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); + bool sync = (sbi->ctx.readahead_sync_decompress && + nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index b503b353d4ab..942ee69dff6a 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -10,6 +10,7 @@ #include "internal.h" #include "zpvec.h" +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 /* @@ -59,16 +60,17 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: compressed pages (including multi-usage pages) */ - struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; - /* A: lower limit of decompressed length and if full length or not */ unsigned int length; + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + /* I: compression algorithm format */ unsigned char algorithmformat; - /* I: bit shift of physical cluster size */ - unsigned char clusterbits; + + /* A: compressed pages (can be cached or inplaced pages) */ + struct page *compressed_pages[]; }; #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) @@ -82,8 +84,6 @@ struct z_erofs_pcluster { #define Z_EROFS_PCLUSTER_NIL (NULL) -#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) - struct z_erofs_decompressqueue { struct super_block *sb; atomic_t pending_bios; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 14d2de35110c..e62d813756f2 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -11,17 +11,16 @@ int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + if (!erofs_sb_has_big_pcluster(sbi) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; vi->z_algorithmtype[1] = 0; vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); } - inode->i_mapping->a_ops = &z_erofs_aops; return 0; } @@ -52,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); @@ -77,18 +77,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + - ((h->h_clusterbits >> 3) & 3); - - if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { - erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", - vi->z_physical_clusterbits[0], vi->nid); - err = -EOPNOTSUPP; + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_done; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; goto unmap_done; } - - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + - ((h->h_clusterbits >> 5) & 7); /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); @@ -111,7 +115,7 @@ struct z_erofs_maprecorder { u8 type; u16 clusterofs; u16 delta[2]; - erofs_blk_t pblk; + erofs_blk_t pblk, compressedlcs; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, @@ -174,6 +178,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = m->delta[0] & + ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: @@ -210,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int vcnt, base, lo, encodebits, nblk; int i; u8 *in, type; + bool big_pcluster; if (1 << amortizedshift == 4) vcnt = 2; @@ -218,6 +232,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -229,7 +244,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; - if (i + 1 != vcnt) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!big_pcluster) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + return 0; + } else if (i + 1 != (int)vcnt) { m->delta[0] = lo; return 0; } @@ -242,22 +265,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, in, encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; + else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + lo = 1; m->delta[0] = lo + 1; return 0; } m->clusterofs = lo; m->delta[0] = 0; /* figout out blkaddr (pblk) for HEAD lclusters */ - nblk = 1; - while (i > 0) { - --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) - i -= lo; - - if (i >= 0) + if (!big_pcluster) { + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + } else { + nblk = 0; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + --i; + nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + continue; + } + /* bigpcluster shouldn't have plain d0 == 1 */ + if (lo <= 1) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + i -= lo - 2; + continue; + } ++nblk; + } } in += (vcnt << amortizedshift) - sizeof(__le32); m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; @@ -381,6 +430,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return 0; } +static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, + unsigned int initial_lcn) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn; + int err; + + DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); + if (!(map->m_flags & EROFS_MAP_ZIPPED) || + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + map->m_plen = 1 << lclusterbits; + return 0; + } + + lcn = m->lcn + 1; + if (m->compressedlcs) + goto out; + if (lcn == initial_lcn) + goto err_bonus_cblkcnt; + + err = z_erofs_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (m->delta[0] != 1) + goto err_bonus_cblkcnt; + if (m->compressedlcs) + break; + fallthrough; + default: + erofs_err(m->inode->i_sb, + "cannot found CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } +out: + map->m_plen = m->compressedlcs << lclusterbits; + return 0; +err_bonus_cblkcnt: + erofs_err(m->inode->i_sb, + "bogus CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -392,6 +493,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, }; int err = 0; unsigned int lclusterbits, endoff; + unsigned long initial_lcn; unsigned long long ofs, end; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -410,10 +512,10 @@ int z_erofs_map_blocks_iter(struct inode *inode, lclusterbits = vi->z_logical_clusterbits; ofs = map->m_la; - m.lcn = ofs >> lclusterbits; + initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, m.lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn); if (err) goto unmap_out; @@ -443,7 +545,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, m.delta[0] = 1; fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - /* get the correspoinding first chunk */ + /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; @@ -457,10 +559,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, } map->m_llen = end - map->m_la; - map->m_plen = 1 << lclusterbits; map->m_pa = blknr_to_addr(m.pblk); map->m_flags |= EROFS_MAP_MAPPED; + err = z_erofs_get_extent_compressedlen(&m, initial_lcn); + if (err) + goto out; unmap_out: if (m.kaddr) kunmap_atomic(m.kaddr); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3196474cbe24..73138ea68342 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -366,8 +366,8 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) * * @ep: Pointer to the eventpoll context. * - * Returns: Returns a value different than zero if ready events are available, - * or zero otherwise. + * Return: a value different than %zero if ready events are available, + * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { @@ -1023,7 +1023,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_KCMP */ -/** +/* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * @@ -1035,10 +1035,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, * completed. * * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail either to the head, otherwise + * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * - * Returns %false if element has been already added to the list, %true + * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, @@ -1076,11 +1076,11 @@ static inline bool list_add_tail_lockless(struct list_head *new, return true; } -/** +/* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * - * Returns %false if epi element has been already chained, %true otherwise. + * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { @@ -1105,8 +1105,8 @@ static inline bool chain_epi_lockless(struct epitem *epi) * mechanism. It is called by the stored file descriptors when they * have events to report. * - * This callback takes a read lock in order not to content with concurrent - * events from another file descriptors, thus all modifications to ->rdllist + * This callback takes a read lock in order not to contend with concurrent + * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_scan_ready_list(), which stops all list modifications and guarantees * that lists state is seen correctly. @@ -1335,8 +1335,8 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth) * paths such that we will spend all our time waking up * eventpoll objects. * - * Returns: Returns zero if the proposed links don't create too many paths, - * -1 otherwise. + * Return: %zero if the proposed links don't create too many paths, + * %-1 otherwise. */ static int reverse_path_check(void) { @@ -1734,7 +1734,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) } /** - * ep_poll - Retrieves ready events, and delivers them to the caller supplied + * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. @@ -1747,7 +1747,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) * until at least one event has been retrieved (or an error * occurred). * - * Returns: Returns the number of ready events which have been fetched, or an + * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, @@ -1774,9 +1774,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, /* * This call is racy: We may or may not see events that are being added - * to the ready list under the lock (e.g., in IRQ callbacks). For, cases + * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under - * lock and will added to the wait queue. For, cases with a zero + * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ @@ -1869,15 +1869,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, /** * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure, does not violate the constraints, in + * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * - * @priv: Pointer to the epoll file to be currently checked. + * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Returns: Returns zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or -1 otherwise. + * Return: %zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { @@ -1919,14 +1919,14 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) - * into another epoll file (represented by @from) does not create + * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * - * @from: Pointer to the epoll we are inserting into. + * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * - * Returns: Returns zero if adding the epoll @to inside the epoll @from - * does not violate the constraints, or -1 otherwise. + * Return: %zero if adding the epoll @to inside the epoll @from + * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { @@ -2074,8 +2074,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, ep = f.file->private_data; /* - * When we insert an epoll file descriptor, inside another epoll file - * descriptor, there is the change of creating closed loops, which are + * When we insert an epoll file descriptor inside another epoll file + * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we @@ -2113,7 +2113,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, } /* - * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 761c79c3a4ba..cc5cffc4a769 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -141,11 +141,7 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi) kfree(sbi->vol_amap); } -/* - * If the value of "clu" is 0, it means cluster 2 which is the first cluster of - * the cluster heap. - */ -int exfat_set_bitmap(struct inode *inode, unsigned int clu) +int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; @@ -158,14 +154,10 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu) b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); set_bit_le(b, sbi->vol_amap[i]->b_data); - exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode)); + exfat_update_bh(sbi->vol_amap[i], sync); return 0; } -/* - * If the value of "clu" is 0, it means cluster 2 which is the first cluster of - * the cluster heap. - */ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; @@ -186,8 +178,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) int ret_discard; ret_discard = sb_issue_discard(sb, - exfat_cluster_to_sector(sbi, clu + - EXFAT_RESERVED_CLUSTERS), + exfat_cluster_to_sector(sbi, clu), (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); if (ret_discard == -EOPNOTSUPP) { @@ -273,3 +264,83 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count) *ret_count = count; return 0; } + +int exfat_trim_fs(struct inode *inode, struct fstrim_range *range) +{ + unsigned int trim_begin, trim_end, count, next_free_clu; + u64 clu_start, clu_end, trim_minlen, trimmed_total = 0; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int err = 0; + + clu_start = max_t(u64, range->start >> sbi->cluster_size_bits, + EXFAT_FIRST_CLUSTER); + clu_end = clu_start + (range->len >> sbi->cluster_size_bits) - 1; + trim_minlen = range->minlen >> sbi->cluster_size_bits; + + if (clu_start >= sbi->num_clusters || range->len < sbi->cluster_size) + return -EINVAL; + + if (clu_end >= sbi->num_clusters) + clu_end = sbi->num_clusters - 1; + + mutex_lock(&sbi->bitmap_lock); + + trim_begin = trim_end = exfat_find_free_bitmap(sb, clu_start); + if (trim_begin == EXFAT_EOF_CLUSTER) + goto unlock; + + next_free_clu = exfat_find_free_bitmap(sb, trim_end + 1); + if (next_free_clu == EXFAT_EOF_CLUSTER) + goto unlock; + + do { + if (next_free_clu == trim_end + 1) { + /* extend trim range for continuous free cluster */ + trim_end++; + } else { + /* trim current range if it's larger than trim_minlen */ + count = trim_end - trim_begin + 1; + if (count >= trim_minlen) { + err = sb_issue_discard(sb, + exfat_cluster_to_sector(sbi, trim_begin), + count * sbi->sect_per_clus, GFP_NOFS, 0); + if (err) + goto unlock; + + trimmed_total += count; + } + + /* set next start point of the free hole */ + trim_begin = trim_end = next_free_clu; + } + + if (next_free_clu >= clu_end) + break; + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto unlock; + } + + next_free_clu = exfat_find_free_bitmap(sb, next_free_clu + 1); + } while (next_free_clu != EXFAT_EOF_CLUSTER && + next_free_clu > trim_end); + + /* try to trim remainder */ + count = trim_end - trim_begin + 1; + if (count >= trim_minlen) { + err = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, trim_begin), + count * sbi->sect_per_clus, GFP_NOFS, 0); + if (err) + goto unlock; + + trimmed_total += count; + } + +unlock: + mutex_unlock(&sbi->bitmap_lock); + range->len = trimmed_total << sbi->cluster_size_bits; + + return err; +} diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 916797077aad..c4523648472a 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -4,6 +4,7 @@ */ #include <linux/slab.h> +#include <linux/compat.h> #include <linux/bio.h> #include <linux/buffer_head.h> @@ -146,7 +147,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent 0); *uni_name.name = 0x0; - exfat_get_uniname_from_ext_entry(sb, &dir, dentry, + exfat_get_uniname_from_ext_entry(sb, &clu, i, uni_name.name); exfat_utf16_to_nls(sb, &uni_name, dir_entry->namebuf.lfn, @@ -306,6 +307,10 @@ const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = exfat_iterate, + .unlocked_ioctl = exfat_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = exfat_compat_ioctl, +#endif .fsync = exfat_file_fsync, }; @@ -315,7 +320,7 @@ int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN); - ret = exfat_alloc_cluster(inode, 1, clu); + ret = exfat_alloc_cluster(inode, 1, clu, IS_DIRSYNC(inode)); if (ret) return ret; @@ -906,14 +911,19 @@ enum { }; /* - * return values: - * >= 0 : return dir entiry position with the name in dir - * -ENOENT : entry with the name does not exist - * -EIO : I/O error + * @ei: inode info of parent directory + * @p_dir: directory structure of parent directory + * @num_entries:entry size of p_uniname + * @hint_opt: If p_uniname is found, filled with optimized dir/entry + * for traversing cluster chain. + * @return: + * >= 0: file directory entry position where the name exists + * -ENOENT: entry with the name does not exist + * -EIO: I/O error */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type) + int num_entries, unsigned int type, struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; @@ -990,6 +1000,8 @@ rewind: if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; + hint_opt->clu = clu.dir; + hint_opt->eidx = i; if (type == TYPE_ALL || type == entry_type) { num_ext = ep->dentry.file.num_ext; step = DIRENT_STEP_STRM; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index fa21421a14d9..1d6da61157c9 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -238,6 +238,7 @@ struct exfat_sb_info { unsigned int used_clusters; /* number of used clusters */ struct mutex s_lock; /* superblock lock */ + struct mutex bitmap_lock; /* bitmap lock */ struct exfat_mount_options options; struct nls_table *nls_io; /* Charset used for input and display */ struct ratelimit_state ratelimit; @@ -388,7 +389,7 @@ int exfat_clear_volume_dirty(struct super_block *sb); #define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu) int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, - struct exfat_chain *p_chain); + struct exfat_chain *p_chain, bool sync_bmap); int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain); int exfat_ent_get(struct super_block *sb, unsigned int loc, unsigned int *content); @@ -407,10 +408,11 @@ int exfat_count_num_clusters(struct super_block *sb, /* balloc.c */ int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); -int exfat_set_bitmap(struct inode *inode, unsigned int clu); +int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync); void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); +int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; @@ -422,6 +424,9 @@ int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); +long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long exfat_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); /* namei.c */ extern const struct dentry_operations exfat_dentry_ops; @@ -452,7 +457,7 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type); + int num_entries, unsigned int type, struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, int entry, sector_t *sector, int *offset); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 7b2e8af17193..e949e563443c 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -151,13 +151,14 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, return 0; } -int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) +/* This function must be called with bitmap_lock held */ +static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) { - unsigned int num_clusters = 0; - unsigned int clu; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); int cur_cmap_i, next_cmap_i; + unsigned int num_clusters = 0; + unsigned int clu; /* invalid cluster number */ if (p_chain->dir == EXFAT_FREE_CLUSTER || @@ -230,6 +231,17 @@ dec_used_clus: return 0; } +int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) +{ + int ret = 0; + + mutex_lock(&EXFAT_SB(inode->i_sb)->bitmap_lock); + ret = __exfat_free_cluster(inode, p_chain); + mutex_unlock(&EXFAT_SB(inode->i_sb)->bitmap_lock); + + return ret; +} + int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, unsigned int *ret_clu) { @@ -308,7 +320,7 @@ release_bhs: } int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, - struct exfat_chain *p_chain) + struct exfat_chain *p_chain, bool sync_bmap) { int ret = -ENOSPC; unsigned int num_clusters = 0, total_cnt; @@ -328,6 +340,8 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, if (num_alloc > total_cnt - sbi->used_clusters) return -ENOSPC; + mutex_lock(&sbi->bitmap_lock); + hint_clu = p_chain->dir; /* find new cluster */ if (hint_clu == EXFAT_EOF_CLUSTER) { @@ -338,8 +352,10 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } hint_clu = exfat_find_free_bitmap(sb, sbi->clu_srch_ptr); - if (hint_clu == EXFAT_EOF_CLUSTER) - return -ENOSPC; + if (hint_clu == EXFAT_EOF_CLUSTER) { + ret = -ENOSPC; + goto unlock; + } } /* check cluster validation */ @@ -349,8 +365,10 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, hint_clu = EXFAT_FIRST_CLUSTER; if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) - return -EIO; + num_clusters)) { + ret = -EIO; + goto unlock; + } p_chain->flags = ALLOC_FAT_CHAIN; } } @@ -370,7 +388,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } /* update allocation bitmap */ - if (exfat_set_bitmap(inode, new_clu)) { + if (exfat_set_bitmap(inode, new_clu, sync_bmap)) { ret = -EIO; goto free_cluster; } @@ -400,6 +418,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, sbi->used_clusters += num_clusters; p_chain->size += num_clusters; + mutex_unlock(&sbi->bitmap_lock); return 0; } @@ -419,7 +438,9 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } free_cluster: if (num_clusters) - exfat_free_cluster(inode, p_chain); + __exfat_free_cluster(inode, p_chain); +unlock: + mutex_unlock(&sbi->bitmap_lock); return ret; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f783cf38dd8e..6af0191b648f 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -4,6 +4,7 @@ */ #include <linux/slab.h> +#include <linux/compat.h> #include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> @@ -350,6 +351,54 @@ out: return error; } +static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) +{ + struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + ret = exfat_trim_fs(inode, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + switch (cmd) { + case FITRIM: + return exfat_ioctl_fitrim(inode, arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long exfat_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return exfat_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; @@ -370,6 +419,10 @@ const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, + .unlocked_ioctl = exfat_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = exfat_compat_ioctl, +#endif .mmap = generic_file_mmap, .fsync = exfat_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 730373e0965a..1803ef3220fd 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -179,7 +179,8 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, return -EIO; } - ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu); + ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu, + inode_needs_sync(inode)); if (ret) return ret; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index d9e8ec689c55..24b41103d1cc 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -340,7 +340,7 @@ static int exfat_find_empty_entry(struct inode *inode, exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags); /* allocate a cluster */ - ret = exfat_alloc_cluster(inode, 1, &clu); + ret = exfat_alloc_cluster(inode, 1, &clu, IS_DIRSYNC(inode)); if (ret) return ret; @@ -596,6 +596,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_inode_info *ei = EXFAT_I(dir); struct exfat_dentry *ep, *ep2; struct exfat_entry_set_cache *es; + /* for optimized dir & entry to prevent long traverse of cluster chain */ + struct exfat_hint hint_opt; if (qname->len == 0) return -ENOENT; @@ -619,7 +621,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, /* search the file name for directories */ dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, - num_entries, TYPE_ALL); + num_entries, TYPE_ALL, &hint_opt); if (dentry < 0) return dentry; /* -error value */ @@ -628,6 +630,11 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->entry = dentry; info->num_subdirs = 0; + /* adjust cdir to the optimized value */ + cdir.dir = hint_opt.clu; + if (cdir.flags & ALLOC_NO_FAT_CHAIN) + cdir.size -= dentry / sbi->dentries_per_clu; + dentry = hint_opt.eidx; es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); if (!es) return -EIO; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index c6d8d2e53486..d38d17a77e76 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -752,6 +752,7 @@ static int exfat_init_fs_context(struct fs_context *fc) return -ENOMEM; mutex_init(&sbi->s_lock); + mutex_init(&sbi->bitmap_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 14aa45316ad2..14292dba3a12 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -190,13 +190,20 @@ fail: return false; } +/* + * Calls to ext2_get_page()/ext2_put_page() must be nested according to the + * rules documented in kmap_local_page()/kunmap_local(). + * + * NOTE: ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() + * and should be treated as a call to ext2_get_page() for nesting purposes. + */ static struct page * ext2_get_page(struct inode *dir, unsigned long n, - int quiet) + int quiet, void **page_addr) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { - kmap(page); + *page_addr = kmap_local_page(page); if (unlikely(!PageChecked(page))) { if (PageError(page) || !ext2_check_page(page, quiet)) goto fail; @@ -205,7 +212,7 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n, return page; fail: - ext2_put_page(page); + ext2_put_page(page, *page_addr); return ERR_PTR(-EIO); } @@ -276,7 +283,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; ext2_dirent *de; - struct page *page = ext2_get_page(inode, n, 0); + struct page *page = ext2_get_page(inode, n, 0, (void **)&kaddr); if (IS_ERR(page)) { ext2_error(sb, __func__, @@ -285,7 +292,6 @@ ext2_readdir(struct file *file, struct dir_context *ctx) ctx->pos += PAGE_SIZE - offset; return PTR_ERR(page); } - kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ext2_validate_entry(kaddr, offset, chunk_mask); @@ -300,7 +306,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (de->rec_len == 0) { ext2_error(sb, __func__, "zero-length directory entry"); - ext2_put_page(page); + ext2_put_page(page, kaddr); return -EIO; } if (de->inode) { @@ -312,13 +318,13 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), d_type)) { - ext2_put_page(page); + ext2_put_page(page, kaddr); return 0; } } ctx->pos += ext2_rec_len_from_disk(de->rec_len); } - ext2_put_page(page); + ext2_put_page(page, kaddr); } return 0; } @@ -332,9 +338,16 @@ ext2_readdir(struct file *file, struct dir_context *ctx) * Entry is guaranteed to be valid. * * On Success ext2_put_page() should be called on *res_page. + * + * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to + * the rules documented in kmap_local_page()/kunmap_local(). + * + * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and + * should be treated as a call to ext2_get_page() for nesting purposes. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, - const struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page, + void **res_page_addr) { const char *name = child->name; int namelen = child->len; @@ -344,12 +357,14 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; + void *page_addr; if (npages == 0) goto out; /* OFFSET_CACHE */ *res_page = NULL; + *res_page_addr = NULL; start = ei->i_dir_start_lookup; if (start >= npages) @@ -357,25 +372,25 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, n = start; do { char *kaddr; - page = ext2_get_page(dir, n, 0); + page = ext2_get_page(dir, n, 0, &page_addr); if (IS_ERR(page)) return ERR_CAST(page); - kaddr = page_address(page); + kaddr = page_addr; de = (ext2_dirent *) kaddr; kaddr += ext2_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { ext2_error(dir->i_sb, __func__, "zero-length directory entry"); - ext2_put_page(page); + ext2_put_page(page, page_addr); goto out; } if (ext2_match(namelen, name, de)) goto found; de = ext2_next_entry(de); } - ext2_put_page(page); + ext2_put_page(page, page_addr); if (++n >= npages) n = 0; @@ -393,6 +408,7 @@ out: found: *res_page = page; + *res_page_addr = page_addr; ei->i_dir_start_lookup = n; return de; } @@ -402,15 +418,24 @@ found: * (as a parameter - p). * * On Success ext2_put_page() should be called on *p. + * + * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to + * the rules documented in kmap_local_page()/kunmap_local(). + * + * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and + * should be treated as a call to ext2_get_page() for nesting purposes. */ -struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) +struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, + void **pa) { - struct page *page = ext2_get_page(dir, 0, 0); + void *page_addr; + struct page *page = ext2_get_page(dir, 0, 0, &page_addr); ext2_dirent *de = NULL; if (!IS_ERR(page)) { - de = ext2_next_entry((ext2_dirent *) page_address(page)); + de = ext2_next_entry((ext2_dirent *) page_addr); *p = page; + *pa = page_addr; } return de; } @@ -419,13 +444,14 @@ int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) { struct ext2_dir_entry_2 *de; struct page *page; + void *page_addr; - de = ext2_find_entry(dir, child, &page); + de = ext2_find_entry(dir, child, &page, &page_addr); if (IS_ERR(de)) return PTR_ERR(de); *ino = le32_to_cpu(de->inode); - ext2_put_page(page); + ext2_put_page(page, page_addr); return 0; } @@ -434,12 +460,12 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ext2_get_block); } -/* Releases the page */ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, struct inode *inode, int update_times) + struct page *page, void *page_addr, struct inode *inode, + int update_times) { loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); + (char *) de - (char *) page_addr; unsigned len = ext2_rec_len_from_disk(de->rec_len); int err; @@ -449,7 +475,6 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); err = ext2_commit_chunk(page, pos, len); - ext2_put_page(page); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; @@ -468,10 +493,10 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; + void *page_addr = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; loff_t pos; int err; @@ -481,14 +506,15 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) * to protect that region. */ for (n = 0; n <= npages; n++) { + char *kaddr; char *dir_end; - page = ext2_get_page(dir, n, 0); + page = ext2_get_page(dir, n, 0, &page_addr); err = PTR_ERR(page); if (IS_ERR(page)) goto out; lock_page(page); - kaddr = page_address(page); + kaddr = page_addr; dir_end = kaddr + ext2_last_byte(dir, n); de = (ext2_dirent *)kaddr; kaddr += PAGE_SIZE - reclen; @@ -519,14 +545,14 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) de = (ext2_dirent *) ((char *) de + rec_len); } unlock_page(page); - ext2_put_page(page); + ext2_put_page(page, page_addr); } BUG(); return -EINVAL; got_it: pos = page_offset(page) + - (char*)de - (char*)page_address(page); + (char *)de - (char *)page_addr; err = ext2_prepare_chunk(page, pos, rec_len); if (err) goto out_unlock; @@ -546,7 +572,7 @@ got_it: mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: - ext2_put_page(page); + ext2_put_page(page, page_addr); out: return err; out_unlock: @@ -556,7 +582,7 @@ out_unlock: /* * ext2_delete_entry deletes a directory entry by merging it with the - * previous entry. Page is up-to-date. Releases the page. + * previous entry. Page is up-to-date. */ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) { @@ -594,7 +620,6 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); out: - ext2_put_page(page); return err; } @@ -644,6 +669,7 @@ fail: */ int ext2_empty_dir (struct inode * inode) { + void *page_addr = NULL; struct page *page = NULL; unsigned long i, npages = dir_pages(inode); int dir_has_error = 0; @@ -651,14 +677,14 @@ int ext2_empty_dir (struct inode * inode) for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ext2_get_page(inode, i, dir_has_error); + page = ext2_get_page(inode, i, dir_has_error, &page_addr); if (IS_ERR(page)) { dir_has_error = 1; continue; } - kaddr = page_address(page); + kaddr = page_addr; de = (ext2_dirent *)kaddr; kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1); @@ -684,12 +710,12 @@ int ext2_empty_dir (struct inode * inode) } de = ext2_next_entry(de); } - ext2_put_page(page); + ext2_put_page(page, page_addr); } return 1; not_empty: - ext2_put_page(page); + ext2_put_page(page, page_addr); return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3309fb2d327a..b0a694820cb7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -283,8 +283,6 @@ static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags) /* * ioctl commands */ -#define EXT2_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT2_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT2_IOC_GETVERSION FS_IOC_GETVERSION #define EXT2_IOC_SETVERSION FS_IOC_SETVERSION #define EXT2_IOC_GETRSVSZ _IOR('f', 5, long) @@ -293,8 +291,6 @@ static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags) /* * ioctl commands in 32 bit emulation */ -#define EXT2_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT2_IOC32_SETFLAGS FS_IOC32_SETFLAGS #define EXT2_IOC32_GETVERSION FS_IOC32_GETVERSION #define EXT2_IOC32_SETVERSION FS_IOC32_SETVERSION @@ -742,14 +738,16 @@ extern int ext2_add_link (struct dentry *, struct inode *); extern int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino); extern int ext2_make_empty(struct inode *, struct inode *); -extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,const struct qstr *, struct page **); +extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *, + struct page **, void **res_page_addr); extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); extern int ext2_empty_dir (struct inode *); -extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); -extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); -static inline void ext2_put_page(struct page *page) +extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *, + struct inode *, int); +static inline void ext2_put_page(struct page *page, void *page_addr) { - kunmap(page); + kunmap_local(page_addr); put_page(page); } @@ -772,6 +770,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ +extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int ext2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 96044f5dbc0e..f98466acc672 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -204,4 +204,6 @@ const struct inode_operations ext2_file_inode_operations = { .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, + .fileattr_get = ext2_fileattr_get, + .fileattr_set = ext2_fileattr_set, }; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index b399cbb7022d..e8340bf09b10 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -16,69 +16,51 @@ #include <linux/mount.h> #include <asm/current.h> #include <linux/uaccess.h> +#include <linux/fileattr.h> - -long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); - struct ext2_inode_info *ei = EXT2_I(inode); - unsigned int flags; - unsigned short rsv_window_size; - int ret; + struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); - ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + fileattr_fill_flags(fa, ei->i_flags & EXT2_FL_USER_VISIBLE); - switch (cmd) { - case EXT2_IOC_GETFLAGS: - flags = ei->i_flags & EXT2_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case EXT2_IOC_SETFLAGS: { - unsigned int oldflags; + return 0; +} - ret = mnt_want_write_file(filp); - if (ret) - return ret; +int ext2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct ext2_inode_info *ei = EXT2_I(inode); - if (!inode_owner_or_capable(&init_user_ns, inode)) { - ret = -EACCES; - goto setflags_out; - } + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - if (get_user(flags, (int __user *) arg)) { - ret = -EFAULT; - goto setflags_out; - } + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; - flags = ext2_mask_flags(inode->i_mode, flags); + ei->i_flags = (ei->i_flags & ~EXT2_FL_USER_MODIFIABLE) | + (fa->flags & EXT2_FL_USER_MODIFIABLE); - inode_lock(inode); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - inode_unlock(inode); - ret = -EPERM; - goto setflags_out; - } - oldflags = ei->i_flags; + ext2_set_inode_flags(inode); + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); - ret = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (ret) { - inode_unlock(inode); - goto setflags_out; - } + return 0; +} - flags = flags & EXT2_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; - ei->i_flags = flags; - ext2_set_inode_flags(inode); - inode->i_ctime = current_time(inode); - inode_unlock(inode); +long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned short rsv_window_size; + int ret; - mark_inode_dirty(inode); -setflags_out: - mnt_drop_write_file(filp); - return ret; - } + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); case EXT2_IOC_SETVERSION: { @@ -163,12 +145,6 @@ long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case EXT2_IOC32_GETFLAGS: - cmd = EXT2_IOC_GETFLAGS; - break; - case EXT2_IOC32_SETFLAGS: - cmd = EXT2_IOC_SETFLAGS; - break; case EXT2_IOC32_GETVERSION: cmd = EXT2_IOC_GETVERSION; break; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3367384d344d..1f69b81655b6 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -81,11 +81,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns struct dentry *ext2_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); ino_t ino; int res; - res = ext2_inode_by_name(d_inode(child), &dotdot, &ino); + res = ext2_inode_by_name(d_inode(child), &dotdot_name, &ino); if (res) return ERR_PTR(res); @@ -281,19 +280,21 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; + void *page_addr; int err; err = dquot_initialize(dir); if (err) goto out; - de = ext2_find_entry(dir, &dentry->d_name, &page); + de = ext2_find_entry(dir, &dentry->d_name, &page, &page_addr); if (IS_ERR(de)) { err = PTR_ERR(de); goto out; } err = ext2_delete_entry (de, page); + ext2_put_page(page, page_addr); if (err) goto out; @@ -328,8 +329,10 @@ static int ext2_rename (struct user_namespace * mnt_userns, struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; + void *dir_page_addr; struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; + void *old_page_addr; struct ext2_dir_entry_2 * old_de; int err; @@ -344,7 +347,8 @@ static int ext2_rename (struct user_namespace * mnt_userns, if (err) goto out; - old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page, + &old_page_addr); if (IS_ERR(old_de)) { err = PTR_ERR(old_de); goto out; @@ -352,12 +356,13 @@ static int ext2_rename (struct user_namespace * mnt_userns, if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ext2_dotdot(old_inode, &dir_page); + dir_de = ext2_dotdot(old_inode, &dir_page, &dir_page_addr); if (!dir_de) goto out_old; } if (new_inode) { + void *page_addr; struct page *new_page; struct ext2_dir_entry_2 *new_de; @@ -365,12 +370,14 @@ static int ext2_rename (struct user_namespace * mnt_userns, if (dir_de && !ext2_empty_dir (new_inode)) goto out_dir; - new_de = ext2_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = ext2_find_entry(new_dir, &new_dentry->d_name, + &new_page, &page_addr); if (IS_ERR(new_de)) { err = PTR_ERR(new_de); goto out_dir; } - ext2_set_link(new_dir, new_de, new_page, old_inode, 1); + ext2_set_link(new_dir, new_de, new_page, page_addr, old_inode, 1); + ext2_put_page(new_page, page_addr); new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -390,23 +397,25 @@ static int ext2_rename (struct user_namespace * mnt_userns, old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); - ext2_delete_entry (old_de, old_page); + ext2_delete_entry(old_de, old_page); if (dir_de) { if (old_dir != new_dir) - ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); - else - ext2_put_page(dir_page); + ext2_set_link(old_inode, dir_de, dir_page, + dir_page_addr, new_dir, 0); + + ext2_put_page(dir_page, dir_page_addr); inode_dec_link_count(old_dir); } - return 0; + ext2_put_page(old_page, old_page_addr); + return 0; out_dir: if (dir_de) - ext2_put_page(dir_page); + ext2_put_page(dir_page, dir_page_addr); out_old: - ext2_put_page(old_page); + ext2_put_page(old_page, old_page_addr); out: return err; } @@ -427,6 +436,8 @@ const struct inode_operations ext2_dir_inode_operations = { .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, + .fileattr_get = ext2_fileattr_get, + .fileattr_set = ext2_fileattr_set, }; const struct inode_operations ext2_special_inode_operations = { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 6c4753277916..21e09fbaa46f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1104,7 +1104,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - /* per fileystem reservation list head & lock */ + /* per filesystem reservation list head & lock */ spin_lock_init(&sbi->s_rsv_window_lock); sbi->s_rsv_window_root = RB_ROOT; /* @@ -1399,7 +1399,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - u64 fsid; spin_lock(&sbi->s_lock); @@ -1453,9 +1452,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_ffree = ext2_count_free_inodes(sb); es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT2_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f45f9feebe59..9dc6e74b265c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -239,7 +239,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - return num_clusters_in_group(sb, block_group) - + return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } @@ -626,27 +626,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, /** * ext4_should_retry_alloc() - check if a block allocation should be retried - * @sb: super block - * @retries: number of attemps has been made + * @sb: superblock + * @retries: number of retry attempts made so far * - * ext4_should_retry_alloc() is called when ENOSPC is returned, and if - * it is profitable to retry the operation, this function will wait - * for the current or committing transaction to complete, and then - * return TRUE. We will only retry once. + * ext4_should_retry_alloc() is called when ENOSPC is returned while + * attempting to allocate blocks. If there's an indication that a pending + * journal transaction might free some space and allow another attempt to + * succeed, this function will wait for the current or committing transaction + * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || - (*retries)++ > 1 || - !EXT4_SB(sb)->s_journal) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!sbi->s_journal) return 0; - smp_mb(); - if (EXT4_SB(sb)->s_mb_free_pending == 0) + if (++(*retries) > 3) { + percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; + } + /* + * if there's no indication that blocks are about to be freed it's + * possible we just missed a transaction commit that did so + */ + smp_mb(); + if (sbi->s_mb_free_pending == 0) + return ext4_has_free_clusters(sbi, 1, 0); + + /* + * it's possible we've just missed a transaction commit here, + * so ignore the returned status + */ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5ed870614c8d..ffb295aa891c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode) return 0; } +static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) +{ + /* Check if . or .. , or skip if namelen is 0 */ + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && + (de->name[1] == '.' || de->name[1] == '\0')) + return true; + /* Check if this is a csum entry */ + if (de->file_type == EXT4_FT_DIR_CSUM) + return true; + return false; +} + /* * Return 0 if the directory entry is OK, and 1 if there is a problem * @@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; + bool fake = is_fake_dir_entry(de); + bool has_csum = ext4_has_metadata_csum(dir->i_sb); - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, + fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; - else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, + has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > @@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); return 1; } @@ -124,9 +140,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); - if (err != ERR_BAD_DX_DIR) { + if (err != ERR_BAD_DX_DIR) return err; - } + /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* @@ -224,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + sb->s_blocksize) < ext4_dir_rec_len(1, + inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); @@ -265,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, - 0, 0, &de_name, &fstr); + EXT4_DIRENT_HASH(de), + EXT4_DIRENT_MINOR_HASH(de), + &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 644fd69185d3..37002663d521 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -162,7 +162,12 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 - +/* Large fragment size list lookup succeeded at least once for cr = 0 */ +#define EXT4_MB_CR0_OPTIMIZED 0x8000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +/* Perform linear traversal for one group */ +#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -472,15 +477,6 @@ struct flex_groups { EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) -/* Flags we can manipulate with through FS_IOC_FSSETXATTR */ -#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ - EXT4_IMMUTABLE_FL | \ - EXT4_APPEND_FL | \ - EXT4_NODUMP_FL | \ - EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL | \ - EXT4_DAX_FL) - /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ @@ -1222,7 +1218,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ -#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1247,7 +1243,9 @@ struct ext4_inode_info { #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ - +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1484,6 +1482,7 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; @@ -1527,9 +1526,14 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct rb_root s_mb_avg_fragment_size_root; + rwlock_t s_mb_rb_lock; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; /* tunables */ unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; @@ -1549,12 +1553,17 @@ struct ext4_sb_info { atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; + atomic_t s_bal_cr0_bad_suggestions; + atomic_t s_bal_cr1_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[4]; + atomic64_t s_bal_cX_hits[4]; + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; @@ -2195,6 +2204,17 @@ struct ext4_dir_entry { char name[EXT4_NAME_LEN]; /* File name */ }; + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be @@ -2210,6 +2230,22 @@ struct ext4_dir_entry_2 { }; /* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + +/* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ @@ -2250,11 +2286,25 @@ struct ext4_dir_entry_tail { */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) -#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ - ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * The rec_len is dependent on the type of directory. Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + +/* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... */ @@ -2310,6 +2360,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) @@ -2364,6 +2415,7 @@ struct ext4_filename { }; #define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* @@ -2594,9 +2646,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #ifdef CONFIG_UNICODE -extern void ext4_fname_setup_ci_filename(struct inode *dir, +extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *fname); + struct ext4_filename *fname); #endif #ifdef CONFIG_FS_ENCRYPTION @@ -2627,9 +2679,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2646,9 +2698,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif - return 0; + return err; } static inline void ext4_fname_free_filename(struct ext4_filename *fname) @@ -2673,15 +2725,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { + int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2706,9 +2759,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (buf), (size), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, @@ -2719,7 +2772,7 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); @@ -2793,6 +2846,8 @@ void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); @@ -2808,8 +2863,10 @@ int __init ext4_fc_init_dentry_cache(void); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; +extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, @@ -2925,6 +2982,9 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +int ext4_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); /* migrate.c */ @@ -3309,11 +3369,14 @@ struct ext4_group_info { ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; + struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means @@ -3516,9 +3579,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int ext4_ci_compare(const struct inode *parent, - const struct qstr *fname, - const struct qstr *entry, bool quick); extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 77c7c8a54da7..77c84d6f1af6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4382,7 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, { struct inode *inode = file_inode(file); handle_t *handle; - int ret, ret2 = 0, ret3 = 0; + int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6c4f19b0a556..f98ca4f37ef6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -66,7 +66,7 @@ * Fast Commit Ineligibility * ------------------------- * Not all operations are supported by fast commits today (e.g extended - * attributes). Fast commit ineligiblity is marked by calling one of the + * attributes). Fast commit ineligibility is marked by calling one of the * two following functions: * * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall @@ -513,10 +513,10 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) __ext4_fc_track_link(handle, d_inode(dentry), dentry); } -void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry) { struct __track_dentry_update_args args; - struct inode *inode = d_inode(dentry); int ret; args.dentry = dentry; @@ -527,6 +527,11 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) trace_ext4_fc_track_create(inode, dentry, ret); } +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_create(handle, d_inode(dentry), dentry); +} + /* __track_fn for inode tracking */ static int __track_inode(struct inode *inode, void *arg, bool update) { @@ -1083,8 +1088,10 @@ static int ext4_fc_perform_commit(journal_t *journal) head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), - (u8 *)&head, &crc)) + (u8 *)&head, &crc)) { + ret = -ENOSPC; goto out; + } } spin_lock(&sbi->s_fc_lock); @@ -1729,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %d to %d %lld", + jbd_debug(1, "Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 194f5d00fa32..816dedcbd541 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -371,15 +371,32 @@ truncate: static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { - loff_t offset = iocb->ki_pos; + loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (error) return error; - if (size && flags & IOMAP_DIO_UNWRITTEN) - return ext4_convert_unwritten_extents(NULL, inode, - offset, size); + if (size && flags & IOMAP_DIO_UNWRITTEN) { + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); + if (error < 0) + return error; + } + /* + * If we are extending the file, we have to update i_size here before + * page cache gets invalidated in iomap_dio_rw(). Otherwise racing + * buffered reads could zero out too much from page cache pages. Update + * of on-disk size will happen later in ext4_dio_write_iter() where + * we have enough information to also perform orphan list handling etc. + * Note that we perform all extending writes synchronously under + * i_rwsem held exclusively so i_size update is safe here in that case. + * If the write was not extending, we cannot see pos > i_size here + * because operations reducing i_size like truncate wait for all + * outstanding DIO before updating i_size. + */ + pos += size; + if (pos > i_size_read(inode)) + i_size_write(inode, pos); return 0; } @@ -919,5 +936,7 @@ const struct inode_operations ext4_file_inode_operations = { .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, + .fileattr_get = ext4_fileattr_get, + .fileattr_set = ext4_fileattr_set, }; diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 4c2a9fe30067..4493ef0c715e 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -354,8 +354,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Compare two fsmap items. */ static int ext4_getfsmap_compare(void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct ext4_fsmap *fa; struct ext4_fsmap *fb; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index a92eb79de0cc..f34f4176c1e7 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) * represented, and whether or not the returned hash is 32 bits or 64 * bits. 32 bit hashes will return 0 for the minor hash. */ -static int __ext4fs_dirhash(const char *name, int len, +static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { __u32 hash; @@ -259,6 +259,22 @@ static int __ext4fs_dirhash(const char *name, int len, hash = buf[0]; minor_hash = buf[1]; break; + case DX_HASH_SIPHASH: + { + struct qstr qname = QSTR_INIT(name, len); + __u64 combined_hash; + + if (fscrypt_has_encryption_key(dir)) { + combined_hash = fscrypt_fname_siphash(dir, &qname); + } else { + ext4_warning_inode(dir, "Siphash requires key"); + return -1; + } + + hash = (__u32)(combined_hash >> 32); + minor_hash = (__u32)combined_hash; + break; + } default: hinfo->hash = 0; return -1; @@ -280,7 +296,8 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um) { + if (len && IS_CASEFOLDED(dir) && um && + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; @@ -291,12 +308,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, goto opaque_seq; } - r = __ext4fs_dirhash(buff, dlen, hinfo); + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); kfree(buff); return r; } opaque_seq: #endif - return __ext4fs_dirhash(name, len, hinfo); + return __ext4fs_dirhash(dir, name, len, hinfo); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 633ae7becd61..81a17a3cd80e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -970,7 +970,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode->i_uid = fsuid_into_mnt(mnt_userns); + inode_fsuid_set(inode, mnt_userns); inode->i_gid = dir->i_gid; } else inode_init_owner(mnt_userns, inode, dir, mode); @@ -1292,7 +1292,8 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; - if (ext4_has_feature_inline_data(sb)) + if (ext4_has_feature_inline_data(sb) && + (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1513,6 +1514,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; + unsigned long used_inos = 0; /* This should not happen, but just to be sure check this */ if (sb_rdonly(sb)) { @@ -1543,22 +1545,37 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, * used inodes so we need to skip blocks with used inodes in * inode table. */ - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)), - sbi->s_inodes_per_block); - - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || - ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)) < - EXT4_FIRST_INO(sb)))) { - ext4_error(sb, "Something is wrong with group %u: " - "used itable blocks: %d; " - "itable unused count: %u", - group, used_blks, - ext4_itable_unused_count(sb, gdp)); - ret = 1; - goto err_out; + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + used_inos = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); + + /* Bogus inode unused count? */ + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + used_inos += group * EXT4_INODES_PER_GROUP(sb); + /* + * Are there some uninitialized inodes in the inode table + * before the first normal inode? + */ + if ((used_blks != sbi->s_itb_per_group) && + (used_inos < EXT4_FIRST_INO(sb))) { + ext4_error(sb, "Something is wrong with group %u: " + "itable unused count: %u; " + "itables initialized count: %ld", + group, ext4_itable_unused_count(sb, gdp), + used_inos); + ret = 1; + goto err_out; + } } blk = ext4_inode_table(sb, gdp) + used_blks; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 1223a18c3ff9..a7bc6ad656a9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -705,7 +705,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, /* * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make + * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b41512d1badc..3cf01629010d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -795,7 +795,7 @@ ext4_journalled_write_inline_data(struct inode *inode, * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't - * need to start the journal since the file's metatdata isn't changed now. + * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, @@ -1031,7 +1031,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, fname); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1100,7 +1100,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, @@ -1380,8 +1380,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; @@ -1390,8 +1390,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; @@ -1406,7 +1406,12 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1488,8 +1493,8 @@ int ext4_read_inline_dir(struct file *file, * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ - dotdot_offset = EXT4_DIR_REC_LEN(1); - dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + dotdot_offset = ext4_dir_rec_len(1, NULL); + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; @@ -1524,7 +1529,7 @@ int ext4_read_inline_dir(struct file *file, * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) - < EXT4_DIR_REC_LEN(1)) + < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 650c5acd2f2d..fe6045a46599 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1066,8 +1066,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); } continue; } @@ -1092,8 +1091,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } } if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && @@ -1938,13 +1936,13 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, - NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: unlock_page(page); out_no_pagelock: + if (!inline_data && page_bufs) + ext4_walk_page_buffers(NULL, page_bufs, 0, len, + NULL, bput_one); brelse(inode_bh); return ret; } @@ -3824,7 +3822,7 @@ unlock: * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' + * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) @@ -5026,7 +5024,7 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; - int err = 0, rc, block; + int err = 0, block; int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; @@ -5138,9 +5136,9 @@ static int ext4_do_update_inode(handle_t *handle, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, NULL, bh); - if (!err) - err = rc; + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_brelse; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); @@ -5387,8 +5385,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_gid = attr->ia_gid; error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - if (unlikely(error)) + if (unlikely(error)) { + ext4_fc_stop_update(inode); return error; + } } if (attr->ia_valid & ATTR_SIZE) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2cf35066f46..31627f7dc5cd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -20,6 +20,7 @@ #include <linux/uaccess.h> #include <linux/delay.h> #include <linux/iversion.h> +#include <linux/fileattr.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> @@ -315,6 +316,12 @@ static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { + /* Allow the DAX flag to be changed on inline directories */ + if (S_ISDIR(inode->i_mode)) { + flags &= ~EXT4_INLINE_DATA_FL; + oldflags &= ~EXT4_INLINE_DATA_FL; + } + if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, @@ -344,11 +351,6 @@ static int ext4_ioctl_setflags(struct inode *inode, goto flags_out; oldflags = ei->i_flags; - - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (err) - goto flags_out; - /* * The JOURNAL_DATA flag can only be changed by * the relevant capability. @@ -459,9 +461,8 @@ flags_out: } #ifdef CONFIG_QUOTA -static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { - struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; @@ -545,7 +546,7 @@ out_stop: return err; } #else -static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; @@ -553,56 +554,6 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } #endif -/* Transfer internal flags to xflags */ -static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) -{ - __u32 xflags = 0; - - if (iflags & EXT4_SYNC_FL) - xflags |= FS_XFLAG_SYNC; - if (iflags & EXT4_IMMUTABLE_FL) - xflags |= FS_XFLAG_IMMUTABLE; - if (iflags & EXT4_APPEND_FL) - xflags |= FS_XFLAG_APPEND; - if (iflags & EXT4_NODUMP_FL) - xflags |= FS_XFLAG_NODUMP; - if (iflags & EXT4_NOATIME_FL) - xflags |= FS_XFLAG_NOATIME; - if (iflags & EXT4_PROJINHERIT_FL) - xflags |= FS_XFLAG_PROJINHERIT; - if (iflags & EXT4_DAX_FL) - xflags |= FS_XFLAG_DAX; - return xflags; -} - -#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ - FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \ - FS_XFLAG_DAX) - -/* Transfer xflags flags to internal */ -static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) -{ - unsigned long iflags = 0; - - if (xflags & FS_XFLAG_SYNC) - iflags |= EXT4_SYNC_FL; - if (xflags & FS_XFLAG_IMMUTABLE) - iflags |= EXT4_IMMUTABLE_FL; - if (xflags & FS_XFLAG_APPEND) - iflags |= EXT4_APPEND_FL; - if (xflags & FS_XFLAG_NODUMP) - iflags |= EXT4_NODUMP_FL; - if (xflags & FS_XFLAG_NOATIME) - iflags |= EXT4_NOATIME_FL; - if (xflags & FS_XFLAG_PROJINHERIT) - iflags |= EXT4_PROJINHERIT_FL; - if (xflags & FS_XFLAG_DAX) - iflags |= EXT4_DAX_FL; - - return iflags; -} - static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -770,15 +721,52 @@ group_add_out: return err; } -static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) +int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); + u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; - simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags & - EXT4_FL_USER_VISIBLE)); + if (S_ISREG(inode->i_mode)) + flags &= ~FS_PROJINHERIT_FL; + fileattr_fill_flags(fa, flags); if (ext4_has_feature_project(inode->i_sb)) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); + + return 0; +} + +int ext4_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 flags = fa->flags; + int err = -EOPNOTSUPP; + + ext4_fc_start_update(inode); + if (flags & ~EXT4_FL_USER_VISIBLE) + goto out; + + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + goto out; + err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); + if (err) + goto out; + err = ext4_ioctl_setflags(inode, flags); + if (err) + goto out; + err = ext4_ioctl_setproject(inode, fa->fsx_projid); +out: + ext4_fc_stop_update(inode); + return err; } /* So that the fiemap access checks can't overflow on 32 bit machines. */ @@ -816,55 +804,13 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct ext4_inode_info *ei = EXT4_I(inode); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - unsigned int flags; ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); - case FS_IOC_GETFLAGS: - flags = ei->i_flags & EXT4_FL_USER_VISIBLE; - if (S_ISREG(inode->i_mode)) - flags &= ~EXT4_PROJINHERIT_FL; - return put_user(flags, (int __user *) arg); - case FS_IOC_SETFLAGS: { - int err; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - if (flags & ~EXT4_FL_USER_VISIBLE) - return -EOPNOTSUPP; - /* - * chattr(1) grabs flags via GETFLAGS, modifies the result and - * passes that to SETFLAGS. So we cannot easily make SETFLAGS - * more restrictive than just silently masking off visible but - * not settable flags as we always did. - */ - flags &= EXT4_FL_USER_MODIFIABLE; - if (ext4_mask_flags(inode->i_mode, flags) != flags) - return -EOPNOTSUPP; - - err = mnt_want_write_file(filp); - if (err) - return err; - - inode_lock(inode); - err = ext4_ioctl_check_immutable(inode, - from_kprojid(&init_user_ns, ei->i_projid), - flags); - if (!err) - err = ext4_ioctl_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); - return err; - } case EXT4_IOC_GETVERSION: case EXT4_IOC_GETVERSION_OLD: return put_user(inode->i_generation, (int __user *) arg); @@ -1246,60 +1192,6 @@ resizefs_out: case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); - case FS_IOC_FSGETXATTR: - { - struct fsxattr fa; - - ext4_fill_fsxattr(inode, &fa); - - if (copy_to_user((struct fsxattr __user *)arg, - &fa, sizeof(fa))) - return -EFAULT; - return 0; - } - case FS_IOC_FSSETXATTR: - { - struct fsxattr fa, old_fa; - int err; - - if (copy_from_user(&fa, (struct fsxattr __user *)arg, - sizeof(fa))) - return -EFAULT; - - /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) - return -EOPNOTSUPP; - - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - if (ext4_mask_flags(inode->i_mode, flags) != flags) - return -EOPNOTSUPP; - - err = mnt_want_write_file(filp); - if (err) - return err; - - inode_lock(inode); - ext4_fill_fsxattr(inode, &old_fa); - err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (err) - goto out; - flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | - (flags & EXT4_FL_XFLAG_VISIBLE); - err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags); - if (err) - goto out; - err = ext4_ioctl_setflags(inode, flags); - if (err) - goto out; - err = ext4_ioctl_setproject(filp, fa.fsx_projid); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return err; - } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); @@ -1340,12 +1232,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; break; @@ -1405,8 +1291,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: - case FS_IOC_FSGETXATTR: - case FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 99bf091fee10..3239e6669e84 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -127,11 +127,50 @@ * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * + * If "mb_optimize_scan" mount option is set, we maintain in memory group info + * structures in two data structures: + * + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * + * This is an array of lists where the index in the array represents the + * largest free order in the buddy bitmap of the participating group infos of + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * + * Locking: sbi->s_mb_rb_lock (rwlock) + * + * This is a red black tree consisting of group infos and the tree is sorted + * by average fragment sizes (which is calculated as ext4_group_info->bb_free + * / ext4_group_info->bb_fragments). + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for + * fulfilling an allocation request. + * + * At CR = 0, we look for groups which have the largest_free_order >= the order + * of the request. We directly look at the largest free order list in the data + * structure (1) above where largest_free_order = order of the request. If that + * list is empty, we look at remaining list in the increasing order of + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or + * equal to request size using our rb tree (data structure 2) in O(log N) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req + * /sys/fs/ext4/<partition>/mb_linear_limit * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -149,6 +188,16 @@ * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not + * get traversed linearly. That may result in subsequent allocations being not + * close to each other. And so, the underlying device may get filled up in a + * non-linear fashion. While that may not matter on non-rotational devices, for + * rotational devices that may result in higher seek times. "mb_linear_limit" + * tells mballoc how many groups mballoc should search linearly before + * performing consulting above data structures for more efficient lookups. For + * non rotational devices, this value defaults to 0 and for rotational devices + * this is set to MB_DEFAULT_LINEAR_LIMIT. + * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the @@ -299,6 +348,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) + * - cr0 lists lock (cr0) + * - cr1 tree lock (cr1) * * Paths: * - new pa @@ -328,6 +379,9 @@ * group * object * + * - allocation path (ext4_mb_regular_allocator) + * group + * cr0/cr1 */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, + int (*cmp)(struct rb_node *, struct rb_node *)) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + + while (*iter) { + parent = *iter; + if (cmp(new, *iter) > 0) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); +} + +static int +ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +{ + struct ext4_group_info *grp1 = rb_entry(rb1, + struct ext4_group_info, + bb_avg_fragment_size_rb); + struct ext4_group_info *grp2 = rb_entry(rb2, + struct ext4_group_info, + bb_avg_fragment_size_rb); + int num_frags_1, num_frags_2; + + num_frags_1 = grp1->bb_fragments ? + grp1->bb_free / grp1->bb_fragments : 0; + num_frags_2 = grp2->bb_fragments ? + grp2->bb_free / grp2->bb_fragments : 0; + + return (num_frags_2 - num_frags_1); +} + +/* + * Reinsert grpinfo into the avg_fragment_size tree with new average + * fragment size. + */ +static void +mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + + write_lock(&sbi->s_mb_rb_lock); + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { + rb_erase(&grp->bb_avg_fragment_size_rb, + &sbi->s_mb_avg_fragment_size_root); + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); + } + + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, + &grp->bb_avg_fragment_size_rb, + ext4_mb_avg_fragment_size_cmp); + write_unlock(&sbi->s_mb_rb_lock); +} + +/* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. + */ +static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *iter, *grp; + int i; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + + grp = NULL; + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_largest_free_orders[i])) + continue; + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + continue; + } + grp = NULL; + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], + bb_largest_free_order_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + grp = iter; + break; + } + } + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + if (grp) + break; + } + + if (!grp) { + /* Increment cr and search again */ + *new_cr = 1; + } else { + *group = grp->bb_group; + ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } +} + +/* + * Choose next group by traversing average fragment size tree. Updates *new_cr + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that + * the linear search should continue for one iteration since there's lock + * contention on the rb tree lock. + */ +static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int avg_fragment_size, best_so_far; + struct rb_node *node, *found; + struct ext4_group_info *grp; + + /* + * If there is contention on the lock, instead of waiting for the lock + * to become available, just continue searching lineraly. We'll resume + * our rb tree search later starting at ac->ac_last_optimal_group. + */ + if (!read_trylock(&sbi->s_mb_rb_lock)) { + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; + return; + } + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + /* We have found something at CR 1 in the past */ + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; + found = rb_next(found)) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + break; + } + goto done; + } + + node = sbi->s_mb_avg_fragment_size_root.rb_node; + best_so_far = 0; + found = NULL; + + while (node) { + grp = rb_entry(node, struct ext4_group_info, + bb_avg_fragment_size_rb); + avg_fragment_size = 0; + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { + avg_fragment_size = grp->bb_fragments ? + grp->bb_free / grp->bb_fragments : 0; + if (!best_so_far || avg_fragment_size < best_so_far) { + best_so_far = avg_fragment_size; + found = node; + } + } + if (avg_fragment_size > ac->ac_g_ex.fe_len) + node = node->rb_right; + else + node = node->rb_left; + } + +done: + if (found) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } + + read_unlock(&sbi->s_mb_rb_lock); + ac->ac_last_optimal_group = *group; +} + +static inline int should_optimize_scan(struct ext4_allocation_context *ac) +{ + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) + return 0; + if (ac->ac_criteria >= 2) + return 0; + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; +} + +/* + * Return next linear group for allocation. If linear traversal should not be + * performed, this function just returns the same group + */ +static int +next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +{ + if (!should_optimize_scan(ac)) + goto inc_and_return; + + if (ac->ac_groups_linear_remaining) { + ac->ac_groups_linear_remaining--; + goto inc_and_return; + } + + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; + goto inc_and_return; + } + + return group; +inc_and_return: + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + return group + 1 >= ngroups ? 0 : group + 1; +} + +/* + * ext4_mb_choose_next_group: choose next group for allocation. + * + * @ac Allocation Context + * @new_cr This is an output parameter. If the there is no good group + * available at current CR level, this field is updated to indicate + * the new cr level that should be used. + * @group This is an input / output parameter. As an input it indicates the + * next group that the allocator intends to use for allocation. As + * output, this field indicates the next group that should be used as + * determined by the optimization functions. + * @ngroups Total number of groups + */ +static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + *new_cr = ac->ac_criteria; + + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + return; + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); + } else if (*new_cr == 1) { + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else { + /* + * TODO: For CR=2, we can arrange groups in an rb tree sorted by + * bb_free. But until that happens, we should never come here. + */ + WARN_ON(1); + } +} + /* * Cache the order of the largest free extent we have available in this block * group. @@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - int bits; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } grp->bb_largest_free_order = -1; /* uninit */ - bits = sb->s_blocksize_bits + 1; - for (i = bits; i >= 0; i--) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; break; } } + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && + grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } } static noinline_for_stack @@ -816,10 +1151,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; - spin_lock(&sbi->s_bal_lock); - sbi->s_mb_buddies_generated++; - sbi->s_mb_generation_time += period; - spin_unlock(&sbi->s_bal_lock); + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -959,7 +1293,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * - (sb->s_blocksize_bits+2)); + (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ @@ -1519,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, done: mb_set_largest_free_order(sb, e4b->bd_info); + mb_update_avg_fragment_size(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1655,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); @@ -1930,7 +2266,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, int max; BUG_ON(ac->ac_2order <= 0); - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; @@ -2109,7 +2445,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, if (free < ac->ac_g_ex.fe_len) return false; - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2148,6 +2484,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) ext4_lock_group(sb, group); free = grp->bb_free; @@ -2315,13 +2653,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { /* * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) ac->ac_2order = array_index_nospec(i - 1, - sb->s_blocksize_bits + 2); + MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ @@ -2347,17 +2685,21 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group++, i++) { - int ret = 0; + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), + i++) { + int ret = 0, new_cr; + cond_resched(); - /* - * Artificially restricted ngroups for non-extent - * files makes group > ngroups possible on first loop. - */ - if (group >= ngroups) - group = 0; + + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; + } /* * Batch reads of the block allocation bitmaps @@ -2422,6 +2764,9 @@ repeat: if (ac->ac_status != AC_STATUS_CONTINUE) break; } + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2451,6 +2796,9 @@ repeat: goto repeat; } } + + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; @@ -2550,6 +2898,157 @@ const struct seq_operations ext4_mb_seq_groups_ops = { .show = ext4_mb_seq_groups_show, }; +int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = (struct super_block *)seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + seq_puts(seq, "mballoc:\n"); + if (!sbi->s_mb_stats) { + seq_puts(seq, "\tmb stats collection turned off.\n"); + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + return 0; + } + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); + + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); + + seq_puts(seq, "\tcr0_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[0])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + + seq_puts(seq, "\tcr1_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[1])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + + seq_puts(seq, "\tcr2_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[2])); + + seq_puts(seq, "\tcr3_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[3])); + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); + + seq_printf(seq, "\tbuddies_generated: %u/%u\n", + atomic_read(&sbi->s_mb_buddies_generated), + ext4_get_groups_count(sb)); + seq_printf(seq, "\tbuddies_time_used: %llu\n", + atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", + atomic_read(&sbi->s_mb_discarded)); + return 0; +} + +static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); + + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + ++*pos; + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; + struct rb_node *n; + unsigned int count, min, max; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { + seq_puts(seq, "fragment_size_tree:\n"); + n = rb_first(&sbi->s_mb_avg_fragment_size_root); + if (!n) { + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); + return 0; + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + count = 1; + while (rb_next(n)) { + count++; + n = rb_next(n); + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", + min, max, count); + return 0; + } + + if (position == 0) { + seq_printf(seq, "optimize_scan: %d\n", + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + + return 0; +} + +static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); +} + +const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .start = ext4_mb_seq_structs_summary_start, + .next = ext4_mb_seq_structs_summary_next, + .stop = ext4_mb_seq_structs_summary_stop, + .show = ext4_mb_seq_structs_summary_show, +}; + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2590,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } @@ -2652,7 +3151,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; @@ -2709,8 +3211,15 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (ext4_has_feature_flex_bg(sb)) { - /* a single flex group is supposed to be read by a single IO */ - sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, + /* a single flex group is supposed to be read by a single IO. + * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is + * unsigned integer, so the maximum shift is 32. + */ + if (sbi->s_es->s_log_groups_per_flex >= 32) { + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); + goto err_freesgi; + } + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { @@ -2806,7 +3315,7 @@ int ext4_mb_init(struct super_block *sb) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { @@ -2814,7 +3323,7 @@ int ext4_mb_init(struct super_block *sb) goto out; } - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; @@ -2840,10 +3349,30 @@ int ext4_mb_init(struct super_block *sb) offset_incr = offset_incr >> 1; max = max >> 1; i++; - } while (i <= sb->s_blocksize_bits + 1); + } while (i < MB_NUM_ORDERS(sb)); + + sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_largest_free_orders_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } + rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); @@ -2894,6 +3423,10 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + sbi->s_mb_max_linear_groups = 0; + else + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) @@ -2905,6 +3438,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -2961,6 +3496,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -2971,17 +3508,18 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, - "mballoc: %u extents scanned, %u goal hits, " + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, - "mballoc: %lu generated and it took %Lu", - sbi->s_mb_buddies_generated, - sbi->s_mb_generation_time); + "mballoc: %u generated and it took %llu", + atomic_read(&sbi->s_mb_buddies_generated), + atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), @@ -3576,12 +4114,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index e75b4749aa1c..39da92ceabf8 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -59,7 +59,7 @@ * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req + * We can tune the same via /proc/fs/ext4/<partition>/stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ @@ -78,6 +78,23 @@ */ #define MB_DEFAULT_MAX_INODE_PREALLOC 512 +/* + * Number of groups to search linearly before performing group scanning + * optimization. + */ +#define MB_DEFAULT_LINEAR_LIMIT 4 + +/* + * Minimum number of groups that should be present in the file system to perform + * group scanning optimizations. + */ +#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 + +/* + * Number of valid buddy orders + */ +#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) + struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; @@ -161,11 +178,14 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; + __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_tail; __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index c5e3fc998211..7e0b4f81c6c0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -32,7 +32,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - /* Locking only for convinience since we are operating on temp inode */ + /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -43,8 +43,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Calculate the credit needed to inserting this extent - * Since we are doing this in loop we may accumalate extra - * credit. But below we try to not accumalate too much + * Since we are doing this in loop we may accumulate extra + * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 795c3ff2907c..68fbeedd627b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -56,7 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) - return 1; + return -EIO; return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 686bf982c84e..afb9d05a99ba 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -280,9 +280,11 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, + char *to, struct dx_map_entry *offsets, + int count, unsigned int blocksize); +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -574,8 +576,9 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - - EXT4_DIR_REC_LEN(2) - infosize; + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(1, NULL) - + ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -584,7 +587,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) static inline unsigned dx_node_limit(struct inode *dir) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -673,7 +677,10 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - ext4fs_dirhash(dir, de->name, + if (IS_CASEFOLDED(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de @@ -689,7 +696,7 @@ static struct stats dx_show_leaf(struct inode *dir, (unsigned) ((char *) de - base)); #endif } - space += EXT4_DIR_REC_LEN(de->name_len); + space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); @@ -784,18 +791,34 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { + root->info.hash_version != DX_HASH_LEGACY && + root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } + if (ext4_hash_in_dirent(dir)) { + if (root->info.hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash in dirent, but hash is not SIPHASH"); + goto fail; + } + } else { + if (root->info.hash_version == DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash code is SIPHASH, but hash not in dirent"); + goto fail; + } + } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (fname && fname_name(fname)) + /* hash is already computed for encrypted casefolded directory */ + if (fname && fname_name(fname) && + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; @@ -956,7 +979,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since + * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); @@ -997,6 +1020,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; + int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1005,9 +1029,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; + /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); + ext4_dir_rec_len(0, + csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); @@ -1031,7 +1057,17 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + if (de->name_len && de->inode) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hinfo->hash = 0; + hinfo->minor_hash = 0; + } + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1100,7 +1136,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + hinfo.hash_version = DX_HASH_SIPHASH; + else + hinfo.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; @@ -1218,7 +1258,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { - ext4fs_dirhash(dir, de->name, de->name_len, &h); + if (ext4_hash_in_dirent(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1282,47 +1325,65 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * Returns: 0 if the directory entry matches, more than 0 if it * doesn't match or less than zero on error. */ -int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry, bool quick) +static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, + u8 *de_name, size_t de_name_len, bool quick) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr entry = QSTR_INIT(de_name, de_name_len); int ret; + if (IS_ENCRYPTED(parent)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT(de_name, de_name_len); + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, + &decrypted_name); + if (ret < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + if (quick) - ret = utf8_strncasecmp_folded(um, name, entry); + ret = utf8_strncasecmp_folded(um, name, &entry); else - ret = utf8_strncasecmp(um, name, entry); - + ret = utf8_strncasecmp(um, name, &entry); if (ret < 0) { /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ if (sb_has_strict_encoding(sb)) - return -EINVAL; - - if (name->len != entry->len) - return 1; - - return !!memcmp(name->name, entry->name, name->len); + ret = -EINVAL; + else if (name->len != entry.len) + ret = 1; + else + ret = !!memcmp(name->name, entry.name, entry.len); } - +out: + kfree(decrypted_name.name); return ret; } -void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *cf_name) +int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + struct ext4_filename *name) { + struct fscrypt_str *cf_name = &name->cf_name; + struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { cf_name->name = NULL; - return; + return 0; } cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!cf_name->name) - return; + return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, @@ -1330,10 +1391,18 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, if (len <= 0) { kfree(cf_name->name); cf_name->name = NULL; - return; } cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) + return 0; + hinfo->hash_version = DX_HASH_SIPHASH; + hinfo->seed = NULL; + if (cf_name->name) + ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + else + ext4fs_dirhash(dir, iname->name, iname->len, hinfo); + return 0; } #endif @@ -1342,14 +1411,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, * * Return: %true if the directory entry matches, otherwise %false. */ -static inline bool ext4_match(const struct inode *parent, +static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, - const struct ext4_dir_entry_2 *de) + struct ext4_dir_entry_2 *de) { struct fscrypt_name f; -#ifdef CONFIG_UNICODE - const struct qstr entry = {.name = de->name, .len = de->name_len}; -#endif if (!de->inode) return false; @@ -1365,10 +1431,19 @@ static inline bool ext4_match(const struct inode *parent, if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; - return !ext4_ci_compare(parent, &cf, &entry, true); + if (IS_ENCRYPTED(parent)) { + if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != + EXT4_DIRENT_MINOR_HASH(de)) { + + return 0; + } + } + return !ext4_ci_compare(parent, &cf, de->name, + de->name_len, true); } - return !ext4_ci_compare(parent, fname->usr_fname, &entry, - false); + return !ext4_ci_compare(parent, fname->usr_fname, de->name, + de->name_len, false); } #endif @@ -1739,11 +1814,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - static const struct qstr dotdot = QSTR_INIT("..", 2); struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); + bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) @@ -1765,7 +1839,8 @@ struct dentry *ext4_get_parent(struct dentry *child) * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, +dx_move_dirents(struct inode *dir, char *from, char *to, + struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; @@ -1773,11 +1848,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); + memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + map++; to += rec_len; } @@ -1788,7 +1871,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; @@ -1797,7 +1881,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); @@ -1887,9 +1971,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); - de = dx_pack_dirents(data1, blocksize); + de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); @@ -1937,7 +2021,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); + unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; @@ -1950,7 +2034,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; @@ -1964,7 +2048,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return 0; } -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) @@ -1972,7 +2057,7 @@ void ext4_insert_dentry(struct inode *inode, int nlen, rlen; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = @@ -1986,6 +2071,13 @@ void ext4_insert_dentry(struct inode *inode, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); + if (ext4_hash_in_dirent(dir)) { + struct dx_hash_info *hinfo = &fname->hinfo; + + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo->minor_hash); + } } /* @@ -2022,7 +2114,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, } /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, fname); + ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful @@ -2102,6 +2194,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, data2 = bh2->b_data; memcpy(data2, de, len); + memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) @@ -2114,11 +2207,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); + de->rec_len = ext4_rec_len_to_disk( + blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + root->info.hash_version = DX_HASH_SIPHASH; + else + root->info.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); @@ -2129,7 +2227,11 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); + + /* casefolded encrypted hashes are computed on fname setup */ + if (!ext4_hash_in_dirent(dir)) + ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); memset(frames, 0, sizeof(frames)); frame = frames; @@ -2139,10 +2241,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) - goto out_frames; + goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) - goto out_frames; + goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { @@ -2482,15 +2584,27 @@ int ext4_generic_delete_entry(struct inode *dir, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { - if (pde) + if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); - else + + /* wipe entire dir_entry */ + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize)); + } else { + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, + ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + } + inode_inc_iversion(dir); return 0; } @@ -2722,7 +2836,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2732,11 +2846,12 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), + (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(de->name_len), blocksize); + ext4_dir_rec_len(de->name_len, NULL), + blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2869,7 +2984,8 @@ bool ext4_empty_dir(struct inode *inode) } sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + if (inode->i_size < ext4_dir_rec_len(1, NULL) + + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return true; } @@ -3372,7 +3488,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, * for transaction commit if we are running out of space * and thus we deadlock. So we have to stop transaction now * and restart it when symlink contents is written. - * + * * To keep fs consistent in case of crash, we have to put inode * to orphan list in the mean time. */ @@ -3613,6 +3729,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, return retval; } +static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + struct ext4_renament old = *ent; + int retval = 0; + + /* + * old->de could have moved from under us during make indexed dir, + * so the old->de may no longer valid and need to find it again + * before reset old inode info. + */ + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + retval = PTR_ERR(old.bh); + if (!old.bh) + retval = -ENOENT; + if (retval) { + ext4_std_error(old.dir->i_sb, retval); + return; + } + + ext4_setent(handle, &old, ino, file_type); + brelse(old.bh); +} + static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { @@ -3774,14 +3915,14 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) - goto end_rename; + goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; - goto end_rename; + goto release_bh; } if (new.bh) { if (!new.inode) { @@ -3798,15 +3939,13 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - handle = NULL; - goto end_rename; + goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); - whiteout = NULL; - goto end_rename; + goto release_bh; } } @@ -3850,6 +3989,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, retval = ext4_mark_inode_dirty(handle, whiteout); if (unlikely(retval)) goto end_rename; + } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); @@ -3923,6 +4063,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, ext4_fc_track_unlink(handle, new.dentry); __ext4_fc_track_link(handle, old.inode, new.dentry); __ext4_fc_track_unlink(handle, old.inode, old.dentry); + if (whiteout) + __ext4_fc_track_create(handle, whiteout, old.dentry); } if (new.inode) { @@ -3937,19 +4079,21 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, end_rename: if (whiteout) { if (retval) { - ext4_setent(handle, &old, - old.inode->i_ino, old_file_type); + ext4_resetent(handle, &old, + old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); + ext4_journal_stop(handle); iput(whiteout); - + } else { + ext4_journal_stop(handle); } +release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); - if (handle) - ext4_journal_stop(handle); return retval; } @@ -4144,6 +4288,8 @@ const struct inode_operations ext4_dir_inode_operations = { .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, + .fileattr_get = ext4_fileattr_get, + .fileattr_set = ext4_fileattr_set, }; const struct inode_operations ext4_special_inode_operations = { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad34a37278cd..7dc94f3e18e6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -667,9 +667,6 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, ext4_commit_super(sb); } - if (sb_rdonly(sb) || continue_fs) - return; - /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already @@ -679,6 +676,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } + + if (sb_rdonly(sb) || continue_fs) + return; + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before @@ -1210,6 +1211,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -1687,7 +1689,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif @@ -1787,7 +1789,9 @@ static const match_table_t tokens = { {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ - {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, + {Opt_removed, "prefetch_block_bitmaps"}, + {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1820,6 +1824,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_MB_OPTIMIZE_SCAN (-1) + static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; @@ -2006,8 +2012,9 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, - {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, @@ -2089,9 +2096,15 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, return 1; } +struct ext4_parsed_options { + unsigned long journal_devnum; + unsigned int journal_ioprio; + int mb_optimize_scan; +}; + static int handle_mount_opt(struct super_block *sb, char *opt, int token, - substring_t *args, unsigned long *journal_devnum, - unsigned int *journal_ioprio, int is_remount) + substring_t *args, struct ext4_parsed_options *parsed_opts, + int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; @@ -2248,7 +2261,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "Cannot specify journal on remount"); return -1; } - *journal_devnum = arg; + parsed_opts->journal_devnum = arg; } else if (token == Opt_journal_path) { char *journal_path; struct inode *journal_inode; @@ -2284,7 +2297,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - *journal_devnum = new_encode_dev(journal_inode->i_rdev); + parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); path_put(&path); kfree(journal_path); } else if (token == Opt_journal_ioprio) { @@ -2293,7 +2306,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, " (must be 0-7)"); return -1; } - *journal_ioprio = + parsed_opts->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { return ext4_set_test_dummy_encryption(sb, opt, &args[0], @@ -2383,6 +2396,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_opt |= m->mount_opt; } else if (token == Opt_data_err_ignore) { sbi->s_mount_opt &= ~m->mount_opt; + } else if (token == Opt_mb_optimize_scan) { + if (arg != 0 && arg != 1) { + ext4_msg(sb, KERN_WARNING, + "mb_optimize_scan should be set to 0 or 1."); + return -1; + } + parsed_opts->mb_optimize_scan = arg; } else { if (!args->from) arg = 1; @@ -2410,8 +2430,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, + struct ext4_parsed_options *ret_opts, int is_remount) { struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); @@ -2431,8 +2450,8 @@ static int parse_options(char *options, struct super_block *sb, */ args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); - if (handle_mount_opt(sb, p, token, args, journal_devnum, - journal_ioprio, is_remount) < 0) + if (handle_mount_opt(sb, p, token, args, ret_opts, + is_remount) < 0) return 0; } #ifdef CONFIG_QUOTA @@ -3022,9 +3041,6 @@ static void ext4_orphan_cleanup(struct super_block *sb, sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. @@ -3690,11 +3706,11 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, elr->lr_super = sb; elr->lr_first_not_zeroed = start; - if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) - elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; - else { + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; + } else { + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* @@ -3725,7 +3741,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))) goto out; @@ -4014,7 +4030,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; const char *descr; @@ -4025,8 +4040,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; + struct ext4_parsed_options parsed_opts; + + /* Set defaults for the variables that will be set during parsing */ + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; + parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; if ((data && !orig_data) || !sbi) goto out_free_base; @@ -4272,8 +4292,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (!s_mount_opts) goto failed_mount; - if (!parse_options(s_mount_opts, sb, &journal_devnum, - &journal_ioprio, 0)) { + if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); @@ -4281,8 +4300,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, 0)) + if (!parse_options((char *) data, sb, &parsed_opts, 0)) goto failed_mount; #ifdef CONFIG_UNICODE @@ -4291,12 +4309,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct unicode_map *encoding; __u16 encoding_flags; - if (ext4_has_feature_encrypt(sb)) { - ext4_msg(sb, KERN_ERR, - "Can't mount with encoding and encryption"); - goto failed_mount; - } - if (ext4_sb_read_encoding(es, &encoding_info, &encoding_flags)) { ext4_msg(sb, KERN_ERR, @@ -4773,7 +4785,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, journal_devnum); + err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && @@ -4873,7 +4885,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; @@ -4979,6 +4991,19 @@ no_journal: ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); + + /* + * Enable optimize_scan if number of groups is > threshold. This can be + * turned off by passing "mb_optimize_scan=0". This can also be + * turned on forcefully by passing "mb_optimize_scan=1". + */ + if (parsed_opts.mb_optimize_scan == 1) + set_opt2(sb, MB_OPTIMIZE_SCAN); + else if (parsed_opts.mb_optimize_scan == 0) + clear_opt2(sb, MB_OPTIMIZE_SCAN); + else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) + set_opt2(sb, MB_OPTIMIZE_SCAN); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", @@ -4995,7 +5020,7 @@ no_journal: ext4_journal_commit_callback; block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); @@ -5012,6 +5037,9 @@ no_journal: err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); + if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) { @@ -5124,6 +5152,7 @@ failed_mount6: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); failed_mount5: ext4_ext_release(sb); @@ -5149,8 +5178,8 @@ failed_mount_wq: failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - del_timer_sync(&sbi->s_err_report); flush_work(&sbi->s_error_work); + del_timer_sync(&sbi->s_err_report); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -5556,8 +5585,10 @@ static int ext4_commit_super(struct super_block *sb) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh || block_device_ejected(sb)) - return error; + if (!sbh) + return -EINVAL; + if (block_device_ejected(sb)) + return -ENODEV; ext4_update_super(sb); @@ -5808,13 +5839,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); + struct ext4_parsed_options parsed_opts; + + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; if (data && !orig_data) return -ENOMEM; @@ -5845,7 +5879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + parsed_opts.journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; /* * Some options can be enabled by ext4 and/or by VFS mount flag @@ -5855,7 +5890,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) vfs_flags = SB_LAZYTIME | SB_I_VERSION; sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { + if (!parse_options(data, sb, &parsed_opts, 1)) { err = -EINVAL; goto restore_opts; } @@ -5905,7 +5940,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); } /* Flush outstanding errors before changing fs state */ @@ -6148,7 +6183,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; - u64 fsid; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); @@ -6169,9 +6203,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 075aa3a19ff5..6f825dedc3d4 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -24,6 +24,7 @@ typedef enum { attr_session_write_kbytes, attr_lifetime_write_kbytes, attr_reserved_clusters, + attr_sra_exceeded_retry_limit, attr_inode_readahead, attr_trigger_test_error, attr_first_error_time, @@ -202,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); EXT4_ATTR_FUNC(session_write_kbytes, 0444); EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); EXT4_ATTR_FUNC(reserved_clusters, 0644); +EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); @@ -213,6 +215,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); +EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); @@ -251,6 +254,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(reserved_clusters), + ATTR_LIST(sra_exceeded_retry_limit), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -260,6 +264,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(mb_max_inode_prealloc), + ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), @@ -310,6 +315,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); +EXT4_ATTR_FEATURE(encrypted_casefold); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -327,6 +333,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), + ATTR_LIST(encrypted_casefold), NULL, }; ATTRIBUTE_GROUPS(ext4_feat); @@ -374,6 +381,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); + case attr_sra_exceeded_retry_limit: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) + percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_inode_readahead: case attr_pointer_ui: if (!ptr) @@ -521,6 +532,10 @@ int ext4_register_sysfs(struct super_block *sb) ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, + &ext4_mb_seq_structs_summary_ops, sb); } return 0; } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 5b7ba8f71153..eacbd489e3bf 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -80,7 +77,6 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, @@ -88,9 +84,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, page, fsdata); @@ -201,55 +195,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, struct inode *inode = file_inode(filp); const int credits = 2; /* superblock and inode for ext4_orphan_del() */ handle_t *handle; + struct ext4_iloc iloc; int err = 0; - int err2; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = ext4_write_verity_descriptor(inode, desc, desc_size, - merkle_tree_size); - - /* Write all pages before clearing VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - ext4_truncate(inode); + /* Append the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + if (err) + goto cleanup; /* - * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and - * deleting the inode from the orphan list, even if something failed. - * If everything succeeded, we'll also set the verity bit in the same - * transaction. + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages + * beyond i_size won't be written properly. For crash consistency, this + * also must happen before the verity inode flag gets persisted. */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; - ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + /* + * Finally, set the verity inode flag and remove the inode from the + * orphan list (in a single transaction). + */ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto cleanup; } - err2 = ext4_orphan_del(handle, inode); - if (err2) - goto out_stop; + err = ext4_orphan_del(handle, inode); + if (err) + goto stop_and_cleanup; - if (desc != NULL && !err) { - struct ext4_iloc iloc; + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_stop; - ext4_set_inode_flag(inode, EXT4_INODE_VERITY); - ext4_set_inode_flags(inode, false); - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - } -out_stop: + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode, false); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; + + ext4_journal_stop(handle); + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return 0; + +stop_and_cleanup: ext4_journal_stop(handle); - return err ?: err2; +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk), removing the inode from the orphan list (if it wasn't done + * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS. + */ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); + ext4_orphan_del(NULL, inode); + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return err; } static int ext4_get_verity_descriptor_location(struct inode *inode, @@ -349,7 +364,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 372208500f4e..10ba4b24a0aa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1462,6 +1462,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; + WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && + !(current->flags & PF_MEMALLOC_NOFS)); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); @@ -1614,7 +1617,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the - * whole space and prevent futher entries being added. + * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && @@ -2327,6 +2330,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, error = -ENOSPC; goto cleanup; } + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); @@ -2400,7 +2404,7 @@ retry_inode: * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && - !i.in_inode) { + i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e6270a867be1..e211a1b6b013 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -449,9 +449,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct qstr dotdot = QSTR_INIT("..", 2); - - return f2fs_find_entry(dir, &dotdot, p); + return f2fs_find_entry(dir, &dotdot_name, p); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e2d302ae3a46..11a20dc505aa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3194,6 +3194,9 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d26ff2ae3f5e..8a56acbcee4c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -22,6 +22,7 @@ #include <linux/file.h> #include <linux/nls.h> #include <linux/sched/signal.h> +#include <linux/fileattr.h> #include "f2fs.h" #include "node.h" @@ -990,6 +991,8 @@ const struct inode_operations f2fs_file_inode_operations = { .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; static int fill_zero(struct inode *inode, pgoff_t index, @@ -1871,13 +1874,16 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return 0; } -/* FS_IOC_GETFLAGS and FS_IOC_SETFLAGS support */ +/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */ /* * To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry * for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to * F2FS_GETTABLE_FS_FL. To also make it settable via FS_IOC_SETFLAGS, also add * its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL. + * + * Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and + * FS_IOC_FSSETXATTR is done by the VFS. */ static const struct { @@ -1952,67 +1958,6 @@ static inline u32 f2fs_fsflags_to_iflags(u32 fsflags) return iflags; } -static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); - u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); - - if (IS_ENCRYPTED(inode)) - fsflags |= FS_ENCRYPT_FL; - if (IS_VERITY(inode)) - fsflags |= FS_VERITY_FL; - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) - fsflags |= FS_INLINE_DATA_FL; - if (is_inode_flag_set(inode, FI_PIN_FILE)) - fsflags |= FS_NOCOW_FL; - - fsflags &= F2FS_GETTABLE_FS_FL; - - return put_user(fsflags, (int __user *)arg); -} - -static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); - u32 fsflags, old_fsflags; - u32 iflags; - int ret; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - if (get_user(fsflags, (int __user *)arg)) - return -EFAULT; - - if (fsflags & ~F2FS_GETTABLE_FS_FL) - return -EOPNOTSUPP; - fsflags &= F2FS_SETTABLE_FS_FL; - - iflags = f2fs_fsflags_to_iflags(fsflags); - if (f2fs_mask_flags(inode->i_mode, iflags) != iflags) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - old_fsflags = f2fs_iflags_to_fsflags(fi->i_flags); - ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); - if (ret) - goto out; - - ret = f2fs_setflags_common(inode, iflags, - f2fs_fsflags_to_iflags(F2FS_SETTABLE_FS_FL)); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3019,9 +2964,8 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) return err; } -static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { - struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *ipage; @@ -3082,7 +3026,7 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) return 0; } -static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { if (projid != F2FS_DEF_PROJID) return -EOPNOTSUPP; @@ -3090,123 +3034,55 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) } #endif -/* FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR support */ - -/* - * To make a new on-disk f2fs i_flag gettable via FS_IOC_FSGETXATTR and settable - * via FS_IOC_FSSETXATTR, add an entry for it to f2fs_xflags_map[], and add its - * FS_XFLAG_* equivalent to F2FS_SUPPORTED_XFLAGS. - */ - -static const struct { - u32 iflag; - u32 xflag; -} f2fs_xflags_map[] = { - { F2FS_SYNC_FL, FS_XFLAG_SYNC }, - { F2FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE }, - { F2FS_APPEND_FL, FS_XFLAG_APPEND }, - { F2FS_NODUMP_FL, FS_XFLAG_NODUMP }, - { F2FS_NOATIME_FL, FS_XFLAG_NOATIME }, - { F2FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT }, -}; - -#define F2FS_SUPPORTED_XFLAGS ( \ - FS_XFLAG_SYNC | \ - FS_XFLAG_IMMUTABLE | \ - FS_XFLAG_APPEND | \ - FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | \ - FS_XFLAG_PROJINHERIT) - -/* Convert f2fs on-disk i_flags to FS_IOC_FS{GET,SET}XATTR flags */ -static inline u32 f2fs_iflags_to_xflags(u32 iflags) -{ - u32 xflags = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++) - if (iflags & f2fs_xflags_map[i].iflag) - xflags |= f2fs_xflags_map[i].xflag; - - return xflags; -} - -/* Convert FS_IOC_FS{GET,SET}XATTR flags to f2fs on-disk i_flags */ -static inline u32 f2fs_xflags_to_iflags(u32 xflags) -{ - u32 iflags = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++) - if (xflags & f2fs_xflags_map[i].xflag) - iflags |= f2fs_xflags_map[i].iflag; - - return iflags; -} - -static void f2fs_fill_fsxattr(struct inode *inode, struct fsxattr *fa) +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); - simple_fill_fsxattr(fa, f2fs_iflags_to_xflags(fi->i_flags)); + if (IS_ENCRYPTED(inode)) + fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + fsflags |= FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + fsflags |= FS_NOCOW_FL; + + fileattr_fill_flags(fa, fsflags & F2FS_GETTABLE_FS_FL); if (f2fs_sb_has_project_quota(F2FS_I_SB(inode))) fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid); -} -static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct fsxattr fa; - - f2fs_fill_fsxattr(inode, &fa); - - if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) - return -EFAULT; return 0; } -static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); - struct fsxattr fa, old_fa; + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; u32 iflags; int err; - if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) - return -EFAULT; - - /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS) + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (fsflags & ~F2FS_GETTABLE_FS_FL) return -EOPNOTSUPP; + fsflags &= F2FS_SETTABLE_FS_FL; + if (!fa->flags_valid) + mask &= FS_COMMON_FL; - iflags = f2fs_xflags_to_iflags(fa.fsx_xflags); + iflags = f2fs_fsflags_to_iflags(fsflags); if (f2fs_mask_flags(inode->i_mode, iflags) != iflags) return -EOPNOTSUPP; - err = mnt_want_write_file(filp); - if (err) - return err; - - inode_lock(inode); - - f2fs_fill_fsxattr(inode, &old_fa); - err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (err) - goto out; - - err = f2fs_setflags_common(inode, iflags, - f2fs_xflags_to_iflags(F2FS_SUPPORTED_XFLAGS)); - if (err) - goto out; + err = f2fs_setflags_common(inode, iflags, f2fs_fsflags_to_iflags(mask)); + if (!err) + err = f2fs_ioc_setproject(inode, fa->fsx_projid); - err = f2fs_ioc_setproject(filp, fa.fsx_projid); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } @@ -4051,7 +3927,7 @@ out: static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) { - DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, page_idx); + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx); struct address_space *mapping = inode->i_mapping; struct page *page; pgoff_t redirty_idx = page_idx; @@ -4233,10 +4109,6 @@ out: static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { - case FS_IOC_GETFLAGS: - return f2fs_ioc_getflags(filp, arg); - case FS_IOC_SETFLAGS: - return f2fs_ioc_setflags(filp, arg); case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: @@ -4285,10 +4157,6 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_flush_device(filp, arg); case F2FS_IOC_GET_FEATURES: return f2fs_ioc_get_features(filp, arg); - case FS_IOC_FSGETXATTR: - return f2fs_ioc_fsgetxattr(filp, arg); - case FS_IOC_FSSETXATTR: - return f2fs_ioc_fssetxattr(filp, arg); case F2FS_IOC_GET_PIN_FILE: return f2fs_ioc_get_pin_file(filp, arg); case F2FS_IOC_SET_PIN_FILE: @@ -4518,12 +4386,6 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -ENOSPC; switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; @@ -4552,8 +4414,6 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: - case FS_IOC_FSGETXATTR: - case FS_IOC_FSSETXATTR: case F2FS_IOC_GET_PIN_FILE: case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 17bd072a5d39..377c6b161b23 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -416,9 +416,8 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page); if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -1327,6 +1326,8 @@ const struct inode_operations f2fs_dir_inode_operations = { .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; const struct inode_operations f2fs_symlink_inode_operations = { diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 054ec852b5ea..a7beff28a3c5 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,7 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; diff --git a/fs/file.c b/fs/file.c index f3a4bac2cbe9..86dc9956af32 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,17 +629,30 @@ int close_fd(unsigned fd) } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +/** + * last_fd - return last valid index into fd table + * @cur_fds: files struct + * + * Context: Either rcu read lock or files_lock must be held. + * + * Returns: Last valid index into fdtable. + */ +static inline unsigned last_fd(struct fdtable *fdt) +{ + return fdt->max_fds - 1; +} + static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; - if (fd > max_fd) - return; - + /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + max_fd = min(last_fd(fdt), max_fd); + if (fd <= max_fd) + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } @@ -1068,8 +1081,6 @@ out_unlock: /** * __receive_fd() - Install received file into file descriptor table - * - * @fd: fd to install into (if negative, a new fd will be allocated) * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1083,7 +1094,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) +int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1092,32 +1103,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag if (error) return error; - if (fd < 0) { - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; - } else { - new_fd = fd; - } + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { - if (fd < 0) - put_unused_fd(new_fd); + put_unused_fd(new_fd); return error; } } - if (fd < 0) { - fd_install(new_fd, get_file(file)); - } else { - error = replace_fd(new_fd, file, o_flags); - if (error) - return error; - } + fd_install(new_fd, get_file(file)); + __receive_sock(file); + return new_fd; +} - /* Bump the sock usage counts, if any. */ +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) +{ + int error; + + error = security_file_receive(file); + if (error) + return error; + error = replace_fd(new_fd, file, o_flags); + if (error) + return error; __receive_sock(file); return new_fd; } diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 68b0148f4bb8..980d44fd3a36 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -310,7 +310,6 @@ EXPORT_SYMBOL(fs_param_is_path); #ifdef CONFIG_VALIDATE_FS_PARSER /** * validate_constant_table - Validate a constant table - * @name: Name to use in reporting * @tbl: The constant table to validate. * @tbl_size: The size of the table. * @low: The lowest permissible value. @@ -360,6 +359,7 @@ bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, /** * fs_validate_description - Validate a parameter description + * @name: The parameter name to search for. * @desc: The parameter description to validate. */ bool fs_validate_description(const char *name, diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 5e796e6c38e5..427efa73b9bd 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -2,6 +2,7 @@ config FSCACHE tristate "General filesystem local caching manager" + select NETFS_SUPPORT help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 79e08e05ef84..3b2ffa93ac18 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -7,6 +7,7 @@ fscache-y := \ cache.o \ cookie.o \ fsdef.o \ + io.o \ main.o \ netfs.o \ object.o \ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 08e91efbce53..c483863b740a 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -142,6 +142,10 @@ extern int fscache_wait_for_operation_activation(struct fscache_object *, atomic_t *, atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); +struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie, + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context); /* * proc.c diff --git a/fs/fscache/io.c b/fs/fscache/io.c new file mode 100644 index 000000000000..8ecc1141802f --- /dev/null +++ b/fs/fscache/io.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache data I/O routines + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include <linux/module.h> +#define FSCACHE_USE_NEW_IO_API +#include <linux/fscache-cache.h> +#include <linux/slab.h> +#include <linux/netfs.h> +#include "internal.h" + +/* + * Start a cache read operation. + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + */ +int __fscache_begin_read_operation(struct netfs_read_request *rreq, + struct fscache_cookie *cookie) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("rr=%08x", rreq->debug_id); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL); + if (!op) + return -ENOMEM; + trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + atomic_inc(&object->n_reads); + __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + ret = object->cache->ops->begin_read_operation(rreq, op); + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock_dec: + atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + fscache_put_retrieval(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_begin_read_operation); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 26af6fdf1538..991b0a871744 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -299,7 +299,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) /* * allocate a retrieval op */ -static struct fscache_retrieval *fscache_alloc_retrieval( +struct fscache_retrieval *fscache_alloc_retrieval( struct fscache_cookie *cookie, struct address_space *mapping, fscache_rw_complete_t end_io_func, diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index a5aa93ece8c5..a7c3ed89a3e0 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -278,5 +278,6 @@ int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cache_stale_objects), atomic_read(&fscache_n_cache_retired_objects), atomic_read(&fscache_n_cache_culled_objects)); + netfs_stats_show(m); return 0; } diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 8c7021fb2cd4..0c48b35c058d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-$(CONFIG_FUSE_DAX) += dax.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index e9c0f916349d..52b165319be1 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -71,6 +71,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return -EINVAL; if (acl) { + unsigned int extra_flags = 0; /* * Fuse userspace is responsible for updating access * permissions in the inode, if needed. fuse_setxattr @@ -94,7 +95,11 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - ret = fuse_setxattr(inode, name, value, size, 0); + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; + + ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 45082269e698..c7d882a9fe33 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -511,20 +511,18 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + cc->fc.release = cuse_fc_release; fud = fuse_dev_alloc_install(&cc->fc); - if (!fud) { - kfree(cc); + fuse_conn_put(&cc->fc); + if (!fud) return -ENOMEM; - } INIT_LIST_HEAD(&cc->list); - cc->fc.release = cuse_fc_release; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); - fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; @@ -561,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) unregister_chrdev_region(cc->cdev->dev, 1); cdev_del(cc->cdev); } - /* Base reference is now owned by "fud" */ - fuse_conn_put(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */ @@ -627,6 +623,8 @@ static int __init cuse_init(void) cuse_channel_fops.owner = THIS_MODULE; cuse_channel_fops.open = cuse_channel_open; cuse_channel_fops.release = cuse_channel_release; + /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ + cuse_channel_fops.unlocked_ioctl = NULL; cuse_class = class_create(THIS_MODULE, "cuse"); if (IS_ERR(cuse_class)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c6636b4c4ccf..a5ceccc5ef00 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2229,19 +2229,18 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - int err = -ENOTTY; - - if (cmd == FUSE_DEV_IOC_CLONE) { - int oldfd; - - err = -EFAULT; - if (!get_user(oldfd, (__u32 __user *) arg)) { + int res; + int oldfd; + struct fuse_dev *fud = NULL; + + switch (cmd) { + case FUSE_DEV_IOC_CLONE: + res = -EFAULT; + if (!get_user(oldfd, (__u32 __user *)arg)) { struct file *old = fget(oldfd); - err = -EINVAL; + res = -EINVAL; if (old) { - struct fuse_dev *fud = NULL; - /* * Check against file->f_op because CUSE * uses the same ioctl handler. @@ -2252,14 +2251,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, if (fud) { mutex_lock(&fuse_mutex); - err = fuse_device_clone(fud->fc, file); + res = fuse_device_clone(fud->fc, file); mutex_unlock(&fuse_mutex); } fput(old); } } + break; + default: + res = -ENOTTY; + break; } - return err; + return res; } const struct file_operations fuse_dev_operations = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 06a18700a845..1b6c001a7dd1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + inode_wrong_type(inode, outarg.attr.mode)) goto invalid; forget_all_cached_acls(inode); @@ -508,7 +508,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, * 'mknod' + 'open' requests. */ static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned flags, + struct file *file, unsigned int flags, umode_t mode) { int err; @@ -1054,7 +1054,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; } else { @@ -1703,7 +1703,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; goto error; @@ -1866,6 +1866,8 @@ static const struct inode_operations fuse_dir_inode_operations = { .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, }; static const struct file_operations fuse_dir_operations = { @@ -1886,6 +1888,8 @@ static const struct inode_operations fuse_common_inode_operations = { .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8cccecb55fb8..09ef2a4d25ed 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -14,32 +14,20 @@ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/module.h> -#include <linux/compat.h> #include <linux/swap.h> #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> -static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); - - return pages; -} - -static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file, - int opcode, struct fuse_open_out *outargp) +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, int opcode, + struct fuse_open_out *outargp) { struct fuse_open_in inarg; FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); - inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; @@ -136,8 +124,8 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) } } -int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, - bool isdir) +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; @@ -145,7 +133,7 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, ff = fuse_file_alloc(fm); if (!ff) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ @@ -154,14 +142,14 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, struct fuse_open_out outarg; int err; - err = fuse_send_open(fm, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); - return err; + return ERR_PTR(err); } else { if (isdir) fc->no_opendir = 1; @@ -174,9 +162,19 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, ff->open_flags &= ~FOPEN_DIRECT_IO; ff->nodeid = nodeid; - file->private_data = ff; - return 0; + return ff; +} + +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + + if (!IS_ERR(ff)) + file->private_data = ff; + + return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); @@ -268,7 +266,7 @@ out: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - int flags, int opcode) + unsigned int flags, int opcode) { struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; @@ -297,22 +295,21 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nocreds = true; } -void fuse_release_common(struct file *file, bool isdir) +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir) { - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode); if (ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, - (fl_owner_t) file); + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } /* Hold inode until release is finished */ - ra->inode = igrab(file_inode(file)); + ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -326,6 +323,12 @@ void fuse_release_common(struct file *file, bool isdir) fuse_file_put(ff, ff->fm->fc->destroy, isdir); } +void fuse_release_common(struct file *file, bool isdir) +{ + fuse_file_release(file_inode(file), file->private_data, file->f_flags, + (fl_owner_t) file, isdir); +} + static int fuse_open(struct inode *inode, struct file *file) { return fuse_open_common(inode, file, false); @@ -345,7 +348,8 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); @@ -798,21 +802,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, { struct fuse_conn *fc = get_fuse_conn(inode); - if (fc->writeback_cache) { - /* - * A hole in a file. Some data after the hole are in page cache, - * but have not reached the client fs yet. So, the hole is not - * present there. - */ - int i; - int start_idx = num_read >> PAGE_SHIFT; - size_t off = num_read & (PAGE_SIZE - 1); - - for (i = start_idx; i < ap->num_pages; i++) { - zero_user_segment(ap->pages[i], off, PAGE_SIZE); - off = 0; - } - } else { + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. + */ + if (!fc->writeback_cache) { loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } @@ -1099,6 +1094,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; unsigned int offset, i; + bool short_write; int err; for (i = 0; i < ap->num_pages; i++) @@ -1113,32 +1109,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, if (!err && ia->write.out.size > count) err = -EIO; + short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err && !offset && count >= PAGE_SIZE) - SetPageUptodate(page); - - if (count > PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; - else - count = 0; - offset = 0; - - unlock_page(page); + if (err) { + ClearPageUptodate(page); + } else { + if (count >= PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; + else { + if (short_write) + ClearPageUptodate(page); + count = 0; + } + offset = 0; + } + if (ia->write.page_locked && (i == ap->num_pages - 1)) + unlock_page(page); put_page(page); } return err; } -static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, unsigned int max_pages) { + struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; @@ -1191,6 +1193,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, if (offset == PAGE_SIZE) offset = 0; + /* If we copied full page, mark it uptodate */ + if (tmp == PAGE_SIZE) + SetPageUptodate(page); + + if (PageUptodate(page)) { + unlock_page(page); + } else { + ia->write.page_locked = true; + break; + } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && @@ -1234,7 +1246,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, break; } - count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { @@ -1346,16 +1358,6 @@ out: return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) -{ - int i; - - for (i = index; i < index + nr_pages; i++) - descs[i].length = PAGE_SIZE - descs[i].offset; -} - static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { return (unsigned long)ii->iov->iov_base + ii->iov_offset; @@ -1759,8 +1761,17 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, container_of(args, typeof(*wpa), ia.ap.args); struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. + */ + if (!fc->writeback_cache) + fuse_invalidate_attr(inode); spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { @@ -2637,363 +2648,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) } /* - * CUSE servers compiled on 32bit broke on 64bit kernels because the - * ABI was defined to be 'struct iovec' which is different on 32bit - * and 64bit. Fortunately we can determine which structure the server - * used from the size of the reply. - */ -static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) -{ -#ifdef CONFIG_COMPAT - if (count * sizeof(struct compat_iovec) == transferred) { - struct compat_iovec *ciov = src; - unsigned i; - - /* - * With this interface a 32bit server cannot support - * non-compat (i.e. ones coming from 64bit apps) ioctl - * requests - */ - if (!is_compat) - return -EINVAL; - - for (i = 0; i < count; i++) { - dst[i].iov_base = compat_ptr(ciov[i].iov_base); - dst[i].iov_len = ciov[i].iov_len; - } - return 0; - } -#endif - - if (count * sizeof(struct iovec) != transferred) - return -EIO; - - memcpy(dst, src, transferred); - return 0; -} - -/* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, - size_t count) -{ - size_t n; - u32 max = fc->max_pages << PAGE_SHIFT; - - for (n = 0; n < count; n++, iov++) { - if (iov->iov_len > (size_t) max) - return -ENOMEM; - max -= iov->iov_len; - } - return 0; -} - -static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, - void *src, size_t transferred, unsigned count, - bool is_compat) -{ - unsigned i; - struct fuse_ioctl_iovec *fiov = src; - - if (fc->minor < 16) { - return fuse_copy_ioctl_iovec_old(dst, src, transferred, - count, is_compat); - } - - if (count * sizeof(struct fuse_ioctl_iovec) != transferred) - return -EIO; - - for (i = 0; i < count; i++) { - /* Did the server supply an inappropriate value? */ - if (fiov[i].base != (unsigned long) fiov[i].base || - fiov[i].len != (unsigned long) fiov[i].len) - return -EIO; - - dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; - dst[i].iov_len = (size_t) fiov[i].len; - -#ifdef CONFIG_COMPAT - if (is_compat && - (ptr_to_compat(dst[i].iov_base) != fiov[i].base || - (compat_size_t) dst[i].iov_len != fiov[i].len)) - return -EIO; -#endif - } - - return 0; -} - - -/* - * For ioctls, there is no generic way to determine how much memory - * needs to be read and/or written. Furthermore, ioctls are allowed - * to dereference the passed pointer, so the parameter requires deep - * copying but FUSE has no idea whatsoever about what to copy in or - * out. - * - * This is solved by allowing FUSE server to retry ioctl with - * necessary in/out iovecs. Let's assume the ioctl implementation - * needs to read in the following structure. - * - * struct a { - * char *buf; - * size_t buflen; - * } - * - * On the first callout to FUSE server, inarg->in_size and - * inarg->out_size will be NULL; then, the server completes the ioctl - * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and - * the actual iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } - * - * which tells FUSE to copy in the requested area and retry the ioctl. - * On the second round, the server has access to the structure and - * from that it can tell what to look for next, so on the invocation, - * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to - * - * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, - * { .iov_base = a.buf, .iov_len = a.buflen } } - * - * FUSE will copy both struct a and the pointed buffer from the - * process doing the ioctl and retry ioctl with both struct a and the - * buffer. - * - * This time, FUSE server has everything it needs and completes ioctl - * without FUSE_IOCTL_RETRY which finishes the ioctl call. - * - * Copying data out works the same way. - * - * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel - * automatically initializes in and out iovs by decoding @cmd with - * _IOC_* macros and the server is not allowed to request RETRY. This - * limits ioctl data transfers to well-formed ioctls and is the forced - * behavior for all FUSE servers. - */ -long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, - unsigned int flags) -{ - struct fuse_file *ff = file->private_data; - struct fuse_mount *fm = ff->fm; - struct fuse_ioctl_in inarg = { - .fh = ff->fh, - .cmd = cmd, - .arg = arg, - .flags = flags - }; - struct fuse_ioctl_out outarg; - struct iovec *iov_page = NULL; - struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, max_pages; - size_t in_size, out_size, c; - ssize_t transferred; - int err, i; - struct iov_iter ii; - struct fuse_args_pages ap = {}; - -#if BITS_PER_LONG == 32 - inarg.flags |= FUSE_IOCTL_32BIT; -#else - if (flags & FUSE_IOCTL_COMPAT) { - inarg.flags |= FUSE_IOCTL_32BIT; -#ifdef CONFIG_X86_X32 - if (in_x32_syscall()) - inarg.flags |= FUSE_IOCTL_COMPAT_X32; -#endif - } -#endif - - /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); - - err = -ENOMEM; - ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); - iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) - goto out; - - fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); - - /* - * If restricted, initialize IO parameters as encoded in @cmd. - * RETRY from server is not allowed. - */ - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { - struct iovec *iov = iov_page; - - iov->iov_base = (void __user *)arg; - - switch (cmd) { - case FS_IOC_GETFLAGS: - case FS_IOC_SETFLAGS: - iov->iov_len = sizeof(int); - break; - default: - iov->iov_len = _IOC_SIZE(cmd); - break; - } - - if (_IOC_DIR(cmd) & _IOC_WRITE) { - in_iov = iov; - in_iovs = 1; - } - - if (_IOC_DIR(cmd) & _IOC_READ) { - out_iov = iov; - out_iovs = 1; - } - } - - retry: - inarg.in_size = in_size = iov_length(in_iov, in_iovs); - inarg.out_size = out_size = iov_length(out_iov, out_iovs); - - /* - * Out data can be used either for actual out data or iovs, - * make sure there always is at least one page. - */ - out_size = max_t(size_t, out_size, PAGE_SIZE); - max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); - - /* make sure there are enough buffer pages and init request with them */ - err = -ENOMEM; - if (max_pages > fm->fc->max_pages) - goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) - goto out; - ap.num_pages++; - } - - - /* okay, let's send it to the client */ - ap.args.opcode = FUSE_IOCTL; - ap.args.nodeid = ff->nodeid; - ap.args.in_numargs = 1; - ap.args.in_args[0].size = sizeof(inarg); - ap.args.in_args[0].value = &inarg; - if (in_size) { - ap.args.in_numargs++; - ap.args.in_args[1].size = in_size; - ap.args.in_pages = true; - - err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - } - - ap.args.out_numargs = 2; - ap.args.out_args[0].size = sizeof(outarg); - ap.args.out_args[0].value = &outarg; - ap.args.out_args[1].size = out_size; - ap.args.out_pages = true; - ap.args.out_argvar = true; - - transferred = fuse_simple_request(fm, &ap.args); - err = transferred; - if (transferred < 0) - goto out; - - /* did it ask for retry? */ - if (outarg.flags & FUSE_IOCTL_RETRY) { - void *vaddr; - - /* no retry if in restricted mode */ - err = -EIO; - if (!(flags & FUSE_IOCTL_UNRESTRICTED)) - goto out; - - in_iovs = outarg.in_iovs; - out_iovs = outarg.out_iovs; - - /* - * Make sure things are in boundary, separate checks - * are to protect against overflow. - */ - err = -ENOMEM; - if (in_iovs > FUSE_IOCTL_MAX_IOV || - out_iovs > FUSE_IOCTL_MAX_IOV || - in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) - goto out; - - vaddr = kmap_atomic(ap.pages[0]); - err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, - transferred, in_iovs + out_iovs, - (flags & FUSE_IOCTL_COMPAT) != 0); - kunmap_atomic(vaddr); - if (err) - goto out; - - in_iov = iov_page; - out_iov = in_iov + in_iovs; - - err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); - if (err) - goto out; - - err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); - if (err) - goto out; - - goto retry; - } - - err = -EIO; - if (transferred > inarg.out_size) - goto out; - - err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); - if (c != PAGE_SIZE && iov_iter_count(&ii)) - goto out; - } - err = 0; - out: - free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); - - return err ? err : outarg.result; -} -EXPORT_SYMBOL_GPL(fuse_do_ioctl); - -long fuse_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - - if (!fuse_allow_current_process(fc)) - return -EACCES; - - if (fuse_is_bad(inode)) - return -EIO; - - return fuse_do_ioctl(file, cmd, arg, flags); -} - -static long fuse_file_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, 0); -} - -static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); -} - -/* * All files which have been polled are linked to RB tree * fuse_conn->polled_files which is indexed by kh. Walk the tree and * find the matching one. diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 68cca8d4db6e..7e463e220053 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -552,9 +552,12 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Maxmum number of pages that can be used in a single request */ + /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq; @@ -668,6 +671,9 @@ struct fuse_conn { /** Is setxattr not implemented by fs? */ unsigned no_setxattr:1; + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + /** Is getxattr not implemented by fs? */ unsigned no_getxattr:1; @@ -713,7 +719,7 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; - /** Filesystem is fully reponsible for page cache invalidation. */ + /** Filesystem is fully responsible for page cache invalidation. */ unsigned explicit_inval_data:1; /** Does the filesystem support readdirplus? */ @@ -863,6 +869,7 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) static inline void fuse_make_bad(struct inode *inode) { + remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } @@ -871,6 +878,28 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } +static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + descs[i].length = PAGE_SIZE - descs[i].offset; +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -911,6 +940,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; + bool page_locked; } write; }; struct fuse_args_pages ap; @@ -931,7 +961,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); -void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags); /** * Send RELEASE or RELEASEDIR request @@ -1169,7 +1200,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked); bool fuse_lock_inode(struct inode *inode); int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags); + size_t size, int flags, unsigned int extra_flags); ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); @@ -1213,4 +1244,19 @@ void fuse_dax_inode_cleanup(struct inode *inode); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); +/* ioctl.c */ +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); + +/* file.c */ + +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir); +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b0e18b470e91..393e36b74dc4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -350,7 +350,7 @@ retry: inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { + } else if (inode_wrong_type(inode, attr->mode)) { /* Inode has changed type, any I/O on the old should fail */ fuse_make_bad(inode); iput(inode); @@ -712,6 +712,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -872,14 +873,13 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; - const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &name, &outarg, &inode); + &dotdot_name, &outarg, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -1040,7 +1040,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && @@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } + if (arg->flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) ia->in.flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c new file mode 100644 index 000000000000..546ea3d58fb4 --- /dev/null +++ b/fs/fuse/ioctl.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/uio.h> +#include <linux/compat.h> +#include <linux/fileattr.h> + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) +{ + size_t n; + u32 max = fc->max_pages << PAGE_SHIFT; + + for (n = 0; n < count; n++, iov++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; + int err, i; + struct iov_iter ii; + struct fuse_args_pages ap = {}; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) { + inarg.flags |= FUSE_IOCTL_32BIT; +#ifdef CONFIG_X86_X32 + if (in_x32_syscall()) + inarg.flags |= FUSE_IOCTL_COMPAT_X32; +#endif + } +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!ap.pages || !iov_page) + goto out; + + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > fm->fc->max_pages) + goto out; + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) + goto out; + ap.num_pages++; + } + + + /* okay, let's send it to the client */ + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; + if (in_size) { + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; + + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + } + + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; + + transferred = fuse_simple_request(fm, &ap.args); + err = transferred; + if (transferred < 0) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_atomic(ap.pages[0]); + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_atomic(vaddr); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; + out: + free_page((unsigned long) iov_page); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); + + return err ? err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fuse_is_bad(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, 0); +} + +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, + unsigned int cmd, void *ptr, size_t size) +{ + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg; + struct fuse_ioctl_out outarg; + FUSE_ARGS(args); + int err; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.cmd = cmd; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + if (S_ISDIR(inode->i_mode)) + inarg.flags |= FUSE_IOCTL_DIR; + + if (_IOC_DIR(cmd) & _IOC_READ) + inarg.out_size = size; + if (_IOC_DIR(cmd) & _IOC_WRITE) + inarg.in_size = size; + + args.opcode = FUSE_IOCTL; + args.nodeid = ff->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = inarg.in_size; + args.in_args[1].value = ptr; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + args.out_args[1].size = inarg.out_size; + args.out_args[1].value = ptr; + + err = fuse_simple_request(fm, &args); + if (!err && outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + + return err; +} + +static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + bool isdir = S_ISDIR(inode->i_mode); + + if (!S_ISREG(inode->i_mode) && !isdir) + return ERR_PTR(-ENOTTY); + + return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); +} + +static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) +{ + fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); +} + +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS, + &flags, sizeof(flags)); + if (err) + goto cleanup; + + fileattr_fill_flags(fa, flags); + } else { + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR, + &xfa, sizeof(xfa)); + if (err) + goto cleanup; + + fileattr_fill_xflags(fa, xfa.fsx_xflags); + fa->fsx_extsize = xfa.fsx_extsize; + fa->fsx_nextents = xfa.fsx_nextents; + fa->fsx_projid = xfa.fsx_projid; + fa->fsx_cowextsize = xfa.fsx_cowextsize; + } +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} + +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags = fa->flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS, + &flags, sizeof(flags)); + if (err) + goto cleanup; + } else { + memset(&xfa, 0, sizeof(xfa)); + xfa.fsx_xflags = fa->fsx_xflags; + xfa.fsx_extsize = fa->fsx_extsize; + xfa.fsx_nextents = fa->fsx_nextents; + xfa.fsx_projid = fa->fsx_projid; + xfa.fsx_cowextsize = fa->fsx_cowextsize; + + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR, + &xfa, sizeof(xfa)); + } + +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 3441ffa740f3..277f7041d55a 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -202,7 +202,7 @@ retry: inode = d_inode(dentry); if (!inode || get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + inode_wrong_type(inode, o->attr.mode)) { d_invalidate(dentry); dput(dentry); goto retry; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8868ac31a3c0..bcb8a02e2d8b 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -18,6 +18,12 @@ #include <linux/uio.h> #include "fuse_i.h" +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -127,11 +133,6 @@ static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) return &fs->vqs[vq->index]; } -static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) -{ - return &vq_to_fsvq(vq)->fud->pq; -} - /* Should be called with fsvq->lock held. */ static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) { @@ -896,6 +897,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); + kfree(fs->vqs); out: vdev->priv = NULL; @@ -1324,8 +1326,15 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; - if (ctx->dax) + if (ctx->dax) { + if (!fs->dax_dev) { + err = -EINVAL; + pr_err("virtio-fs: dax can't be enabled as filesystem" + " device does not support it.\n"); + goto err_free_fuse_devs; + } ctx->dax_dev = fs->dax_dev; + } err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; @@ -1406,9 +1415,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; + struct fuse_conn *fc = NULL; struct fuse_mount *fm; - int err; + unsigned int virtqueue_size; + int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1420,6 +1430,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) @@ -1429,12 +1443,15 @@ static int virtio_fs_get_tree(struct fs_context *fsc) if (!fm) goto out_err; - fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), - &virtio_fs_fiq_ops, fs); + fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); if (fsc->s_fs_info) { diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 1a7d7ace54e1..61dfaf7b7d20 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -12,7 +12,7 @@ #include <linux/posix_acl_xattr.h> int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags) + size_t size, int flags, unsigned int extra_flags) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); @@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; - args.in_args[0].size = sizeof(inarg); + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; @@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler, if (!value) return fuse_removexattr(inode, name); - return fuse_setxattr(inode, name, value, size, flags); + return fuse_setxattr(inode, name, value, size, flags, 0); } static bool no_xattr_list(struct dentry *dentry) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index cc4f987687f3..23b5be3db044 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -540,10 +540,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, /** * gfs2_readahead - Read a bunch of pages at once - * @file: The file to read from - * @mapping: Address space info - * @pages: List of pages to read - * @nr_pages: Number of pages to read + * @rac: Read-ahead control structure * * Some notes: * 1. This is only for readahead, so we can simply ignore any things diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7a358ae05185..0bcf11a9987b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -331,7 +331,7 @@ static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, if (!dblock) break; - ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]); + ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]); if (ret) return ret; } @@ -632,7 +632,7 @@ enum alloc_state { }; /** - * gfs2_iomap_alloc - Build a metadata tree of the requested height + * __gfs2_iomap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode * @iomap: The iomap structure * @mp: The metapath, with proper height information calculated @@ -642,7 +642,7 @@ enum alloc_state { * ii) Indirect blocks to fill in lower part of the metadata tree * iii) Data blocks * - * This function is called after gfs2_iomap_get, which works out the + * This function is called after __gfs2_iomap_get, which works out the * total number of blocks which we need via gfs2_alloc_size. * * We then do the actual allocation asking for an extent at a time (if @@ -660,8 +660,8 @@ enum alloc_state { * Returns: errno on error */ -static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, - struct metapath *mp) +static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, + struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -802,10 +802,10 @@ static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) /* * For writes to stuffed files, this function is called twice via - * gfs2_iomap_get, before and after unstuffing. The size we return the + * __gfs2_iomap_get, before and after unstuffing. The size we return the * first time needs to be large enough to get the reservation and * allocation sizes right. The size we return the second time must - * be exact or else gfs2_iomap_alloc won't do the right thing. + * be exact or else __gfs2_iomap_alloc won't do the right thing. */ if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) { @@ -829,7 +829,7 @@ static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) } /** - * gfs2_iomap_get - Map blocks from an inode to disk blocks + * __gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode * @pos: Starting position in bytes * @length: Length to map, in bytes @@ -839,9 +839,9 @@ static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) * * Returns: errno */ -static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap, - struct metapath *mp) +static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -961,32 +961,6 @@ hole_found: goto out; } -/** - * gfs2_lblk_to_dblk - convert logical block to disk block - * @inode: the inode of the file we're mapping - * @lblock: the block relative to the start of the file - * @dblock: the returned dblock, if no error - * - * This function maps a single block from a file logical block (relative to - * the start of the file) to a file system absolute block using iomap. - * - * Returns: the absolute file system block, or an error - */ -int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock) -{ - struct iomap iomap = { }; - struct metapath mp = { .mp_aheight = 1, }; - loff_t pos = (loff_t)lblock << inode->i_blkbits; - int ret; - - ret = gfs2_iomap_get(inode, pos, i_blocksize(inode), 0, &iomap, &mp); - release_metapath(&mp); - if (ret == 0) - *dblock = iomap.addr >> inode->i_blkbits; - - return ret; -} - static int gfs2_write_lock(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1109,14 +1083,14 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, if (ret) goto out_trans_end; release_metapath(mp); - ret = gfs2_iomap_get(inode, iomap->offset, - iomap->length, flags, iomap, mp); + ret = __gfs2_iomap_get(inode, iomap->offset, + iomap->length, flags, iomap, mp); if (ret) goto out_trans_end; } if (iomap->type == IOMAP_HOLE) { - ret = gfs2_iomap_alloc(inode, iomap, mp); + ret = __gfs2_iomap_alloc(inode, iomap, mp); if (ret) { gfs2_trans_end(sdp); gfs2_inplace_release(ip); @@ -1168,7 +1142,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, goto out; } - ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); if (ret) goto out_unlock; @@ -1290,9 +1264,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct gfs2_inode *ip = GFS2_I(inode); loff_t pos = (loff_t)lblock << inode->i_blkbits; loff_t length = bh_map->b_size; - struct metapath mp = { .mp_aheight = 1, }; struct iomap iomap = { }; - int flags = create ? IOMAP_WRITE : 0; int ret; clear_buffer_mapped(bh_map); @@ -1300,10 +1272,10 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); - if (create && !ret && iomap.type == IOMAP_HOLE) - ret = gfs2_iomap_alloc(inode, &iomap, &mp); - release_metapath(&mp); + if (!create) + ret = gfs2_iomap_get(inode, pos, length, &iomap); + else + ret = gfs2_iomap_alloc(inode, pos, length, &iomap); if (ret) goto out; @@ -1324,28 +1296,47 @@ out: return ret; } -/* - * Deprecated: do not use in new code - */ -int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) +int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen) { - struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; + unsigned int blkbits = inode->i_blkbits; + struct iomap iomap = { }; + unsigned int len; int ret; - int create = *new; - - BUG_ON(!extlen); - BUG_ON(!dblock); - BUG_ON(!new); - - bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5)); - ret = gfs2_block_map(inode, lblock, &bh, create); - *extlen = bh.b_size >> inode->i_blkbits; - *dblock = bh.b_blocknr; - if (buffer_new(&bh)) - *new = 1; - else - *new = 0; - return ret; + + ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits, + &iomap); + if (ret) + return ret; + if (iomap.type != IOMAP_MAPPED) + return -EIO; + *dblock = iomap.addr >> blkbits; + len = iomap.length >> blkbits; + if (len < *extlen) + *extlen = len; + return 0; +} + +int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen, bool *new) +{ + unsigned int blkbits = inode->i_blkbits; + struct iomap iomap = { }; + unsigned int len; + int ret; + + ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits, + &iomap); + if (ret) + return ret; + if (iomap.type != IOMAP_MAPPED) + return -EIO; + *dblock = iomap.addr >> blkbits; + len = iomap.length >> blkbits; + if (len < *extlen) + *extlen = len; + *new = iomap.flags & IOMAP_F_NEW; + return 0; } /* @@ -1461,15 +1452,26 @@ out: return error; } -int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap) +int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) { struct metapath mp = { .mp_aheight = 1, }; int ret; - ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); + ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp); + release_metapath(&mp); + return ret; +} + +int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); if (!ret && iomap->type == IOMAP_HOLE) - ret = gfs2_iomap_alloc(inode, iomap, &mp); + ret = __gfs2_iomap_alloc(inode, iomap, &mp); release_metapath(&mp); return ret; } @@ -1477,7 +1479,7 @@ int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, /** * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode - * @rg_gh: holder of resource group glock + * @rd_gh: holder of resource group glock * @bh: buffer head to sweep * @start: starting point in bh * @end: end point in bh @@ -1658,8 +1660,11 @@ static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h) /** * find_nonnull_ptr - find a non-null pointer given a metapath and height + * @sdp: The superblock * @mp: starting metapath * @h: desired height to search + * @end_list: See punch_hole(). + * @end_aligned: See punch_hole(). * * Assumes the metapath is valid (with buffers) out to height h. * Returns: true if a non-null pointer was found in the metapath buffer @@ -2519,7 +2524,6 @@ out: static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { - struct metapath mp = { .mp_aheight = 1, }; int ret; if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) @@ -2530,8 +2534,7 @@ static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, return 0; memset(&wpc->iomap, 0, sizeof(wpc->iomap)); - ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp); - release_metapath(&mp); + ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap); return ret; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index aed4632d47d3..6676d863faef 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -49,10 +49,14 @@ extern const struct iomap_writeback_ops gfs2_writeback_ops; extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap); -extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, - u64 *dblock, unsigned *extlen); +extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen); +extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned *extlen, bool *new); extern int gfs2_setattr_size(struct inode *inode, u64 size); extern void gfs2_trim_blocks(struct inode *inode); extern int gfs2_truncatei_resume(struct gfs2_inode *ip); @@ -62,6 +66,5 @@ extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); -extern int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c0f2875c946c..18f67b37d6f8 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -159,7 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, unsigned int o; int copied = 0; int error = 0; - int new = 0; + bool new = false; if (!size) return 0; @@ -189,9 +189,9 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, amount = sdp->sd_sb.sb_bsize - o; if (!extlen) { - new = 1; - error = gfs2_extent_map(&ip->i_inode, lblock, &new, - &dblock, &extlen); + extlen = 1; + error = gfs2_alloc_extent(&ip->i_inode, lblock, &dblock, + &extlen, &new); if (error) goto fail; error = -EIO; @@ -286,15 +286,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, while (copied < size) { unsigned int amount; struct buffer_head *bh; - int new; amount = size - copied; if (amount > sdp->sd_sb.sb_bsize - o) amount = sdp->sd_sb.sb_bsize - o; if (!extlen) { - new = 0; - error = gfs2_extent_map(&ip->i_inode, lblock, &new, + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, lblock, &dblock, &extlen); if (error || !dblock) goto fail; @@ -770,14 +769,13 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, /** * get_leaf_nr - Get a leaf number associated with the index * @dip: The GFS2 inode - * @index: - * @leaf_out: + * @index: hash table index of the targeted leaf + * @leaf_out: Resulting leaf block number * * Returns: 0 on success, error code otherwise */ -static int get_leaf_nr(struct gfs2_inode *dip, u32 index, - u64 *leaf_out) +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { __be64 *hash; int error; @@ -898,7 +896,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, /** * dir_make_exhash - Convert a stuffed directory into an ExHash directory - * @dip: The GFS2 inode + * @inode: The directory inode to be converted to exhash * * Returns: 0 on success, error code otherwise */ @@ -991,9 +989,8 @@ static int dir_make_exhash(struct inode *inode) /** * dir_split_leaf - Split a leaf block into two - * @dip: The GFS2 inode - * @index: - * @leaf_no: + * @inode: The directory inode to be split + * @name: name of the dirent we're trying to insert * * Returns: 0 on success, error code on failure */ @@ -1252,6 +1249,7 @@ static int compare_dents(const void *a, const void *b) * @ctx: what to feed the entries to * @darr: an array of struct gfs2_dirent pointers to read * @entries: the number of entries in darr + * @sort_start: index of the directory array to start our sort * @copied: pointer to int that's non-zero if a entry has been copied out * * Jump through some hoops to make sure that if there are hash collsions, @@ -1468,6 +1466,10 @@ out: /** * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * @inode: the directory inode + * @hsize: hash table size + * @index: index into the hash table + * @f_ra: read-ahead parameters * * Note: we can't calculate each index like dir_e_read can because we don't * have the leaf, and therefore we don't have the depth, and therefore we @@ -1517,8 +1519,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, /** * dir_e_read - Reads the entries from a directory into a filldir buffer - * @dip: dinode pointer + * @inode: the directory inode * @ctx: actor to feed the entries to + * @f_ra: read-ahead parameters * * Returns: errno */ @@ -1627,7 +1630,7 @@ out: /** * gfs2_dir_search - Search a directory - * @dip: The GFS2 dir inode + * @dir: The GFS2 directory inode * @name: The name we are looking up * @fail_on_exist: Fail if the name exists rather than looking it up * @@ -1864,7 +1867,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, /** * gfs2_dir_del - Delete a directory entry * @dip: The GFS2 inode - * @filename: The filename + * @dentry: The directory entry we want to delete * * Returns: 0 on success, error code on failure */ @@ -1918,9 +1921,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) /** * gfs2_dir_mvino - Change inode number of directory entry - * @dip: The GFS2 inode - * @filename: - * @new_inode: + * @dip: The GFS2 directory inode + * @filename: the filename to be moved + * @nip: the new GFS2 inode + * @new_type: the de_type of the new dirent * * This routine changes the inode number of a directory entry. It's used * by rename to change ".." when a directory is moved. @@ -1960,7 +1964,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, * @len: the number of pointers to this leaf * @leaf_no: the leaf number * @leaf_bh: buffer_head for the starting leaf - * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 + * @last_dealloc: 1 if this is the final dealloc for the leaf, else 0 * * Returns: errno */ @@ -2142,8 +2146,8 @@ out: /** * gfs2_diradd_alloc_required - find if adding entry will require an allocation - * @ip: the file being written to - * @filname: the filename that's going to be added + * @inode: the directory inode being written to + * @name: the filename that's going to be added * @da: The structure to return dir alloc info * * Returns: 0 if ok, -ve on error diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 2d500f90cdac..a0b542d84cd9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <linux/dlm_plock.h> #include <linux/delay.h> #include <linux/backing-dev.h> +#include <linux/fileattr.h> #include "gfs2.h" #include "incore.h" @@ -118,8 +119,8 @@ static int gfs2_readdir(struct file *file, struct dir_context *ctx) return error; } -/** - * fsflag_gfs2flag +/* + * struct fsflag_gfs2flag * * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories, * and to GFS2_DIF_JDATA for non-directories. @@ -153,14 +154,17 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags) return fsflags; } -static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; u32 fsflags; + if (d_is_special(dentry)) + return -ENOTTY; + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) @@ -168,8 +172,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags); - if (put_user(fsflags, ptr)) - error = -EFAULT; + fileattr_fill_flags(fa, fsflags); gfs2_glock_dq(&gh); out_uninit: @@ -213,33 +216,19 @@ void gfs2_set_inode_flags(struct inode *inode) * @fsflags: The FS_* inode flags passed in * */ -static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, +static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask, const u32 fsflags) { - struct inode *inode = file_inode(filp); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; struct gfs2_holder gh; int error; - u32 new_flags, flags, oldflags; - - error = mnt_want_write_file(filp); - if (error) - return error; + u32 new_flags, flags; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) - goto out_drop_write; - - oldflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags); - error = vfs_ioc_setflags_prepare(inode, oldflags, fsflags); - if (error) - goto out; - - error = -EACCES; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto out; + return error; error = 0; flags = ip->i_diskflags; @@ -252,9 +241,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, goto out; if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) goto out; - if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && - !capable(CAP_LINUX_IMMUTABLE)) - goto out; if (!IS_IMMUTABLE(inode)) { error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); if (error) @@ -291,20 +277,22 @@ out_trans_end: gfs2_trans_end(sdp); out: gfs2_glock_dq_uninit(&gh); -out_drop_write: - mnt_drop_write_file(filp); return error; } -static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +int gfs2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); - u32 fsflags, gfsflags = 0; + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, gfsflags = 0; u32 mask; int i; - if (get_user(fsflags, ptr)) - return -EFAULT; + if (d_is_special(dentry)) + return -ENOTTY; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) { if (fsflags & fsflag_gfs2flag[i].fsflag) { @@ -325,7 +313,7 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); } - return do_gfs2_set_flags(filp, gfsflags, mask, fsflags); + return do_gfs2_set_flags(inode, gfsflags, mask, fsflags); } static int gfs2_getlabel(struct file *filp, char __user *label) @@ -342,10 +330,6 @@ static int gfs2_getlabel(struct file *filp, char __user *label) static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { - case FS_IOC_GETFLAGS: - return gfs2_get_flags(filp, (u32 __user *)arg); - case FS_IOC_SETFLAGS: - return gfs2_set_flags(filp, (u32 __user *)arg); case FITRIM: return gfs2_fitrim(filp, (void __user *)arg); case FS_IOC_GETFSLABEL: @@ -359,13 +343,6 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { - /* These are just misnamed, they actually get/put from/to user an int */ - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; /* Keep this list in sync with gfs2_ioctl */ case FITRIM: case FS_IOC_GETFSLABEL: @@ -421,7 +398,7 @@ static int gfs2_allocate_page_backing(struct page *page, unsigned int length) do { struct iomap iomap = { }; - if (gfs2_iomap_get_alloc(page->mapping->host, pos, length, &iomap)) + if (gfs2_iomap_alloc(page->mapping->host, pos, length, &iomap)) return -EIO; if (length < iomap.length) @@ -435,7 +412,6 @@ static int gfs2_allocate_page_backing(struct page *page, unsigned int length) /** * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable - * @vma: The virtual memory area * @vmf: The virtual memory fault containing the page to become writable * * When the page becomes writable, we need to ensure that we have @@ -586,7 +562,7 @@ static const struct vm_operations_struct gfs2_vm_ops = { }; /** - * gfs2_mmap - + * gfs2_mmap * @file: The file to map * @vma: The VMA which described the mapping * @@ -991,8 +967,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, while (offset < end) { struct iomap iomap = { }; - error = gfs2_iomap_get_alloc(inode, offset, end - offset, - &iomap); + error = gfs2_iomap_alloc(inode, offset, end - offset, &iomap); if (error) goto out; offset = iomap.offset + iomap.length; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9567520d79f7..84c38103aa06 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -359,7 +359,8 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) /** * do_error - Something unexpected has happened during a lock request - * + * @gl: The glock + * @ret: The status from the DLM */ static void do_error(struct gfs2_glock *gl, const int ret) @@ -454,8 +455,7 @@ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) /** * state_change - record that the glock is now in a different state * @gl: the glock - * @new_state the new state - * + * @new_state: the new state */ static void state_change(struct gfs2_glock *gl, unsigned int new_state) @@ -566,7 +566,7 @@ retry: if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { spin_unlock(&gl->gl_lockref.lock); - rv = glops->go_xmote_bh(gl, gh); + rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); if (rv) { do_error(gl, rv); @@ -1268,6 +1268,8 @@ wait_for_dlm: * handle_callback - process a demote request * @gl: the glock * @state: the state the caller wants us to change to + * @delay: zero to demote immediately; otherwise pending demote + * @remote: true if this came from a different cluster node * * There are only two requests that we are going to see in actual * practise: LM_ST_SHARED and LM_ST_UNLOCKED @@ -1569,6 +1571,7 @@ static int glock_compare(const void *arg_a, const void *arg_b) * nq_m_sync - synchonously acquire more than one glock in deadlock free order * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures + * @p: placeholder for the holder structure to pass back * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) @@ -1732,7 +1735,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_unlock(&gl->gl_lockref.lock); } -static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +static int glock_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct gfs2_glock *gla, *glb; @@ -1985,7 +1989,6 @@ static void dump_glock_func(struct gfs2_glock *gl) /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem - * @wait: wait until it's all gone * * Called when unmounting the filesystem. */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8e32d569c8bf..454095e9fedf 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -49,6 +49,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock * @fsync: set when called from fsync (not all buffers will be clean) + * @nr_revokes: Number of buffers to revoke * * None of the buffers should be dirty, locked, or pinned. */ @@ -394,18 +395,24 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) const struct gfs2_dinode *str = buf; struct timespec64 atime; u16 height, depth; + umode_t mode = be32_to_cpu(str->di_mode); + bool is_new = ip->i_inode.i_flags & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; + if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode))) + goto corrupt; ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = be32_to_cpu(str->di_mode); - ip->i_inode.i_rdev = 0; - switch (ip->i_inode.i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); - break; + ip->i_inode.i_mode = mode; + if (is_new) { + ip->i_inode.i_rdev = 0; + switch (mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + } } i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); @@ -474,8 +481,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) /** * inode_go_lock - operation done after an inode lock is locked by a process - * @gl: the glock - * @flags: + * @gh: The glock holder * * Returns: errno */ @@ -516,7 +522,7 @@ static int inode_go_lock(struct gfs2_holder *gh) /** * inode_go_dump - print information about an inode * @seq: The iterator - * @ip: the inode + * @gl: The glock * @fs_id_buf: file system id (may be empty) * */ @@ -547,9 +553,6 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl, /** * freeze_go_sync - promote/demote the freeze glock * @gl: the glock - * @state: the requested state - * @flags: - * */ static int freeze_go_sync(struct gfs2_glock *gl) @@ -594,10 +597,8 @@ static int freeze_go_sync(struct gfs2_glock *gl) /** * freeze_go_xmote_bh - After promoting/demoting the freeze glock * @gl: the glock - * */ - -static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +static int freeze_go_xmote_bh(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -624,7 +625,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) } /** - * trans_go_demote_ok + * freeze_go_demote_ok * @gl: the glock * * Always returns 0 @@ -638,6 +639,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) /** * iopen_go_callback - schedule the dcache entry for the inode to be deleted * @gl: the glock + * @remote: true if this came from a different cluster node * * gl_lockref.lock lock is held while calling this */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0957119f7744..e6f820f146cb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -217,7 +217,7 @@ struct lm_lockname { struct gfs2_glock_operations { int (*go_sync) (struct gfs2_glock *gl); - int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); + int (*go_xmote_bh)(struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); @@ -625,7 +625,6 @@ struct gfs2_inum_host { struct gfs2_sb_host { u32 sb_magic; u32 sb_type; - u32 sb_format; u32 sb_fs_format; u32 sb_multihost_format; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c9775d5c6594..6e15434b23ac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -285,10 +285,9 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) /** * gfs2_lookupi - Look up a filename in a directory and return its inode - * @d_gh: An initialized holder for the directory glock + * @dir: The inode of the directory containing the inode to look-up * @name: The name of the inode to look for * @is_root: If 1, ignore the caller's permissions - * @i_gh: An uninitialized holder for the new inode glock * * This can be called via the VFS filldir function when NFS is doing * a readdirplus and the inode which its intending to stat isn't @@ -476,7 +475,6 @@ static void gfs2_init_xattr(struct gfs2_inode *ip) * @dip: The directory this inode is being created in * @ip: The inode * @symname: The symlink destination (if a symlink) - * @bhp: The buffer head (returned to caller) * */ @@ -514,7 +512,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, } /** - * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * gfs2_trans_da_blks - Calculate number of blocks to link inode * @dip: The directory we are linking into * @da: The dir add information * @nr_inodes: The number of inodes involved @@ -595,6 +593,7 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, * @dev: For device nodes, this is the device number * @symname: For symlinks, this is the link destination * @size: The initial size of the inode (ignored for directories) + * @excl: Force fail if inode exists * * Returns: 0 on success, or error code */ @@ -837,9 +836,11 @@ fail: /** * gfs2_create - Create a file + * @mnt_userns: User namespace of the mount the inode was found from * @dir: The directory in which to create the file * @dentry: The dentry of the new file * @mode: The mode of the new file + * @excl: Force fail if inode exists * * Returns: errno */ @@ -962,6 +963,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, break; case 0: error = -EEXIST; + goto out_gunlock; default: goto out_gunlock; } @@ -1080,8 +1082,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, /** * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it * @dip: The parent directory - * @name: The name of the entry in the parent directory - * @inode: The inode to be removed + * @dentry: The dentry to unlink * * Called with all the locks and in a transaction. This will only be * called for a directory after it has been checked to ensure it is empty. @@ -1199,6 +1200,7 @@ out_inodes: /** * gfs2_symlink - Create a symlink + * @mnt_userns: User namespace of the mount the inode was found from * @dir: The directory to create the symlink in * @dentry: The dentry to put the symlink in * @symname: The thing which the link points to @@ -1220,6 +1222,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mkdir - Make a directory + * @mnt_userns: User namespace of the mount the inode was found from * @dir: The parent directory of the new one * @dentry: The dentry of the new directory * @mode: The mode of the new directory @@ -1236,6 +1239,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mknod - Make a special file + * @mnt_userns: User namespace of the mount the inode was found from * @dir: The directory in which the special file will reside * @dentry: The dentry of the special file * @mode: The mode of the special file @@ -1505,6 +1509,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, break; case 0: error = -EEXIST; + goto out_gunlock; default: goto out_gunlock; } @@ -1828,10 +1833,10 @@ out: } /** - * gfs2_permission - + * gfs2_permission + * @mnt_userns: User namespace of the mount the inode was found from * @inode: The inode * @mask: The mask to be tested - * @flags: Indicates whether this is an RCU path walk or not * * This may be called from the VFS directly, or from within GFS2 with the * inode locked, so we look to see if the glock is already locked and only @@ -1874,15 +1879,7 @@ static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) return 0; } -/** - * gfs2_setattr_simple - - * @ip: - * @attr: - * - * Returns: errno - */ - -int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +static int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { int error; @@ -1962,6 +1959,7 @@ out: /** * gfs2_setattr - Change attributes on an inode + * @mnt_userns: User namespace of the mount the inode was found from * @dentry: The dentry which is changing * @attr: The structure describing the change * @@ -2157,6 +2155,8 @@ static const struct inode_operations gfs2_file_iops = { .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, + .fileattr_get = gfs2_fileattr_get, + .fileattr_set = gfs2_fileattr_set, }; static const struct inode_operations gfs2_dir_iops = { @@ -2178,6 +2178,8 @@ static const struct inode_operations gfs2_dir_iops = { .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, + .fileattr_get = gfs2_fileattr_get, + .fileattr_set = gfs2_fileattr_set, }; static const struct inode_operations gfs2_symlink_iops = { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c447bd5b3017..7b2c1f390db7 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -101,7 +101,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); -extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); extern int gfs2_open_common(struct inode *inode, struct file *file); @@ -111,6 +110,9 @@ extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; +extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int gfs2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); extern void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 153272f82984..dac040162ecc 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -24,28 +24,31 @@ /** * gfs2_update_stats - Update time based stats - * @mv: Pointer to mean/variance structure to update + * @s: The stats to update (local or global) + * @index: The index inside @s * @sample: New data to include - * - * @delta is the difference between the current rtt sample and the - * running average srtt. We add 1/8 of that to the srtt in order to - * update the current srtt estimate. The variance estimate is a bit - * more complicated. We subtract the current variance estimate from - * the abs value of the @delta and add 1/4 of that to the running - * total. That's equivalent to 3/4 of the current variance - * estimate plus 1/4 of the abs of @delta. - * - * Note that the index points at the array entry containing the smoothed - * mean value, and the variance is always in the following entry - * - * Reference: TCP/IP Illustrated, vol 2, p. 831,832 - * All times are in units of integer nanoseconds. Unlike the TCP/IP case, - * they are not scaled fixed point. */ - static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, s64 sample) { + /* + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The variance estimate is a bit + * more complicated. We subtract the current variance estimate from + * the abs value of the @delta and add 1/4 of that to the running + * total. That's equivalent to 3/4 of the current variance + * estimate plus 1/4 of the abs of @delta. + * + * Note that the index points at the array entry containing the + * smoothed mean value, and the variance is always in the following + * entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP + * case, they are not scaled fixed point. + */ + s64 delta = sample - s->stats[index]; s->stats[index] += (delta >> 3); index++; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6410281546f9..97d54e581a7b 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -65,7 +65,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) /** * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters - * @mapping: The associated mapping (maybe NULL) * @bd: The gfs2_bufdata to remove * * The ail lock _must_ be held when calling this function @@ -82,11 +81,11 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) } /** - * gfs2_ail1_start_one - Start I/O on a part of the AIL - * @sdp: the filesystem + * gfs2_ail1_start_one - Start I/O on a transaction + * @sdp: The superblock * @wbc: The writeback control structure - * @ai: The ail structure - * + * @tr: The transaction to start I/O on + * @plug: The block plug currently active */ static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, @@ -269,7 +268,7 @@ static void gfs2_log_update_head(struct gfs2_sbd *sdp) sdp->sd_log_head = new_head; } -/** +/* * gfs2_ail_empty_tr - empty one of the ail lists of a transaction */ @@ -695,7 +694,7 @@ void log_flush_wait(struct gfs2_sbd *sdp) } } -static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_inode *ipa, *ipb; @@ -859,7 +858,11 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, if (!list_empty(&jd->extent_list)) dblock = gfs2_log_bmap(jd, lblock); else { - int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock); + unsigned int extlen; + int ret; + + extlen = 1; + ret = gfs2_get_extent(jd->jd_inode, lblock, &dblock, &extlen); if (gfs2_assert_withdraw(sdp, ret == 0)) return; } @@ -1014,7 +1017,7 @@ static void trans_drain(struct gfs2_trans *tr) /** * gfs2_log_flush - flush incore transaction(s) - * @sdp: the filesystem + * @sdp: The filesystem * @gl: The glock structure to flush. If NULL, flush the whole incore log * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags * @@ -1166,7 +1169,7 @@ out_withdraw: /** * gfs2_merge_trans - Merge a new transaction into a cached transaction - * @old: Original transaction to be expanded + * @sdp: the filesystem * @new: New transaction to be merged */ @@ -1283,7 +1286,7 @@ static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks - * @sdp: Pointer to GFS2 superblock + * @data: Pointer to GFS2 superblock * * Also, periodically check to make sure that we're using the most recent * journal index. diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a82f4747aa8d..221e7118cc3b 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -96,9 +96,7 @@ out: * gfs2_unpin - Unpin a buffer * @sdp: the filesystem the buffer belongs to * @bh: The buffer to unpin - * @ai: - * @flags: The inode dirty flags - * + * @tr: The system transaction being flushed */ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -281,7 +279,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, * gfs2_log_get_bio - Get cached log bio, or allocate a new one * @sdp: The super block * @blkno: The device block number we want to write to - * @bio: The bio to get or allocate + * @biop: The bio to get or allocate * @op: REQ_OP * @end_io: The bi_end_io callback * @flush: Always flush the current bio and allocate a new one? @@ -317,6 +315,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, /** * gfs2_log_write - write to log * @sdp: the filesystem + * @jd: The journal descriptor * @page: the page to write * @size: the size of the data to write * @offset: the offset within the page @@ -417,6 +416,7 @@ static void gfs2_end_log_read(struct bio *bio) /** * gfs2_jhead_pg_srch - Look for the journal head in a given page. * @jd: The journal descriptor + * @head: The journal head to start from * @page: The page to look in * * Returns: 1 if found, 0 otherwise. @@ -450,6 +450,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * gfs2_jhead_process_page - Search/cleanup a page * @jd: The journal descriptor * @index: Index of the page to look into + * @head: The journal head to start from * @done: If set, perform only cleanup, else search and set if found. * * Find the page with 'index' in the journal's mapping. Search the page for @@ -502,6 +503,7 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) * gfs2_find_jhead - find the head of a log * @jd: The journal descriptor * @head: The log descriptor for the head of the log is returned here + * @keep_cache: If set inode pages will not be truncated * * Do a search of a journal by reading it in large chunks using bios and find * the valid log entry with the highest sequence number. (i.e. the log head) @@ -634,7 +636,8 @@ static void gfs2_check_magic(struct buffer_head *bh) kunmap_atomic(kaddr); } -static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) +static int blocknr_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct gfs2_bufdata *bda, *bdb; @@ -976,7 +979,8 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) /** * databuf_lo_before_commit - Scan the data buffers, writing as we go - * + * @sdp: The filesystem + * @tr: The system transaction being flushed */ static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 2db573e31f78..d68184ebbfdd 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -131,16 +131,19 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) break; yield(); } + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); } else { page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); if (!page) return NULL; + if (!page_has_buffers(page)) { + bh = NULL; + goto out_unlock; + } } - if (!page_has_buffers(page)) - create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); - /* Locate header for our buffer within our page */ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) /* Do nothing */; @@ -149,6 +152,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) if (!buffer_mapped(bh)) map_bh(bh, sdp->sd_vfs, blkno); +out_unlock: unlock_page(page); put_page(page); @@ -239,6 +243,7 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], * @gl: The glock covering the block * @blkno: The block number * @flags: flags + * @rahead: Do read-ahead * @bhp: the place where the buffer is returned (NULL on failure) * * Returns: errno @@ -462,23 +467,22 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) } /** - * gfs2_meta_indirect_buffer - Get a metadata buffer + * gfs2_meta_buffer - Get a metadata buffer * @ip: The GFS2 inode - * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @mtype: The block type (GFS2_METATYPE_*) * @num: The block number (device relative) of the buffer * @bhp: the buffer is returned here * * Returns: errno */ -int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - struct buffer_head **bhp) +int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_glock *gl = ip->i_gl; struct buffer_head *bh; int ret = 0; - u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; int rahead = 0; if (num == ip->i_no_addr) diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 4a8c01929b79..21880d72081a 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -61,13 +61,13 @@ enum { extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - struct buffer_head **bhp); +extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) { - return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp); + return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, bhp); } struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aa4136055a83..826f77d9cff5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -150,7 +150,6 @@ fail: /** * gfs2_check_sb - Check superblock * @sdp: the filesystem - * @sb: The superblock * @silent: Don't print a message if the check fails * * Checks the version code of the FS is one that we understand how to @@ -204,7 +203,6 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); sb->sb_type = be32_to_cpu(str->sb_header.mh_type); - sb->sb_format = be32_to_cpu(str->sb_header.mh_format); sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); sb->sb_bsize = be32_to_cpu(str->sb_bsize); @@ -223,7 +221,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) * gfs2_read_super - Read the gfs2 super block from disk * @sdp: The GFS2 super block * @sector: The location of the super block - * @error: The error code to return + * @silent: Don't print a message if the check fails * * This uses the bio functions to read the super block from disk * because we want to be 100% sure that we never read cached data. @@ -983,7 +981,6 @@ static const struct lm_lockops nolock_ops = { /** * gfs2_lm_mount - mount a locking protocol * @sdp: the filesystem - * @args: mount arguments * @silent: if 1, don't complain if the FS isn't a GFS2 fs * * Returns: errno @@ -1093,8 +1090,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) /** * gfs2_fill_super - Read in superblock * @sb: The VFS superblock - * @args: Mount options - * @silent: Don't complain if it's not a GFS2 filesystem + * @fc: Mount options and flags * * Returns: -errno */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6e173ae378c4..94637c307cc8 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1375,8 +1375,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int y; if (!extlen) { - int new = 0; - error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, x, &dblock, &extlen); if (error) goto fail; } @@ -1534,7 +1534,7 @@ void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { /** * gfs2_quotad - Write cached quota changes into the quota file - * @sdp: Pointer to GFS2 superblock + * @data: Pointer to GFS2 superblock * */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 282173774005..016ed1b2ca1d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -34,12 +34,12 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_glock *gl = ip->i_gl; - int new = 0; u64 dblock; u32 extlen; int error; - error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, blk, &dblock, &extlen); if (error) return error; if (!dblock) { @@ -154,7 +154,7 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, * get_log_header - read the log header for a given segment * @jd: the journal * @blk: the block to look at - * @lh: the log header to return + * @head: the log header to return * * Read the log header for a given segement in a given journal. Do a few * sanity checks on it. @@ -187,6 +187,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, * @jd: the journal * @start: the first log header in the active region * @end: the last log header (don't process the contents of this entry)) + * @pass: iteration number (foreach_descriptor() is called in a for() loop) * * Call a given function once for every log descriptor in the active * portion of the log. @@ -437,6 +438,7 @@ void gfs2_recover_func(struct work_struct *work) case GLR_TRYFAILED: fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); error = 0; + goto fail; default: goto fail; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 89c37a845e64..c3b00ba92ed2 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -743,7 +743,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) } /** - * gfs2_compute_bitstructs - Compute the bitmap sizes + * compute_bitstructs - Compute the bitmap sizes * @rgd: The resource group descriptor * * Calculates bitmap descriptors, one for each block that contains bitmap data @@ -1534,8 +1534,9 @@ static void rs_insert(struct gfs2_inode *ip) } /** - * rgd_free - return the number of free blocks we can allocate. + * rgd_free - return the number of free blocks we can allocate * @rgd: the resource group + * @rs: The reservation to free * * This function returns the number of free blocks for an rgrp. * That's the clone-free blocks (blocks that are free, not including those @@ -1783,7 +1784,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_bitmap; } rbm->offset = offset; - if (!rs) + if (!rs || !minext) return 0; ret = gfs2_reservation_check_and_update(rbm, rs, *minext, @@ -2019,6 +2020,7 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b /** * fast_to_acquire - determine if a resource group will be fast to acquire + * @rgd: The rgrp * * If this is one of our preferred rgrps, it should be quicker to acquire, * because we tried to set ourselves up as dlm lock master. diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 97076d3f562f..4d4ceb0b6903 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -162,8 +162,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) int error; error = init_threads(sdp); - if (error) + if (error) { + gfs2_withdraw_delayed(sdp); return error; + } j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawn(sdp)) { @@ -385,8 +387,6 @@ struct lfcc { * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all * journals are clean * @sdp: the file system - * @state: the state to put the transaction lock into - * @t_gh: the hold on the transaction lock * * Returns: errno */ @@ -699,6 +699,7 @@ restart: /** * gfs2_sync_fs - sync the filesystem * @sb: the superblock + * @wait: true to wait for completion * * Flushes the log to disk. */ @@ -750,11 +751,13 @@ void gfs2_freeze_func(struct work_struct *work) static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - int error = 0; + int error; mutex_lock(&sdp->sd_freeze_mutex); - if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) + if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) { + error = -EBUSY; goto out; + } for (;;) { if (gfs2_withdrawn(sdp)) { @@ -795,10 +798,10 @@ static int gfs2_unfreeze(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; mutex_lock(&sdp->sd_freeze_mutex); - if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || + if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || !gfs2_holder_initialized(&sdp->sd_freeze_gh)) { mutex_unlock(&sdp->sd_freeze_mutex); - return 0; + return -EINVAL; } gfs2_freeze_unlock(&sdp->sd_freeze_gh); @@ -807,7 +810,7 @@ static int gfs2_unfreeze(struct super_block *sb) } /** - * statfs_fill - fill in the sg for a given RG + * statfs_slow_fill - fill in the sg for a given RG * @rgd: the RG * @sc: the sc structure * @@ -905,7 +908,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host /** * gfs2_statfs_i - Do a statfs * @sdp: the filesystem - * @sg: the sg structure + * @sc: the sc structure * * Returns: errno */ @@ -936,8 +939,8 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s /** * gfs2_statfs - Gather and return stats about the filesystem - * @sb: The superblock - * @statfsbuf: The buffer + * @dentry: The name of the link + * @buf: The buffer * * Returns: 0 on success or error code */ @@ -1268,6 +1271,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) /** * evict_should_delete - determine whether the inode is eligible for deletion * @inode: The inode to evict + * @gh: The glock holder structure * * This function determines whether the evicted inode is eligible to be deleted * and locks the inode glock. diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c3e72dba7418..c0a34d9ddee4 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -63,6 +63,71 @@ static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); } +static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned long f = sdp->sd_flags; + ssize_t s; + + s = snprintf(buf, PAGE_SIZE, + "Journal Checked: %d\n" + "Journal Live: %d\n" + "Journal ID: %d\n" + "Spectator: %d\n" + "Withdrawn: %d\n" + "No barriers: %d\n" + "No recovery: %d\n" + "Demote: %d\n" + "No Journal ID: %d\n" + "Mounted RO: %d\n" + "RO Recovery: %d\n" + "Skip DLM Unlock: %d\n" + "Force AIL Flush: %d\n" + "FS Frozen: %d\n" + "Withdrawing: %d\n" + "Withdraw In Prog: %d\n" + "Remote Withdraw: %d\n" + "Withdraw Recovery: %d\n" + "sd_log_error: %d\n" + "sd_log_flush_lock: %d\n" + "sd_log_num_revoke: %u\n" + "sd_log_in_flight: %d\n" + "sd_log_blks_needed: %d\n" + "sd_log_blks_free: %d\n" + "sd_log_flush_head: %d\n" + "sd_log_flush_tail: %d\n" + "sd_log_blks_reserved: %d\n" + "sd_log_revokes_available: %d\n", + test_bit(SDF_JOURNAL_CHECKED, &f), + test_bit(SDF_JOURNAL_LIVE, &f), + (sdp->sd_jdesc ? sdp->sd_jdesc->jd_jid : 0), + (sdp->sd_args.ar_spectator ? 1 : 0), + test_bit(SDF_WITHDRAWN, &f), + test_bit(SDF_NOBARRIERS, &f), + test_bit(SDF_NORECOVERY, &f), + test_bit(SDF_DEMOTE, &f), + test_bit(SDF_NOJOURNALID, &f), + (sb_rdonly(sdp->sd_vfs) ? 1 : 0), + test_bit(SDF_RORECOVERY, &f), + test_bit(SDF_SKIP_DLM_UNLOCK, &f), + test_bit(SDF_FORCE_AIL_FLUSH, &f), + test_bit(SDF_FS_FROZEN, &f), + test_bit(SDF_WITHDRAWING, &f), + test_bit(SDF_WITHDRAW_IN_PROG, &f), + test_bit(SDF_REMOTE_WITHDRAW, &f), + test_bit(SDF_WITHDRAW_RECOVERY, &f), + sdp->sd_log_error, + rwsem_is_locked(&sdp->sd_log_flush_lock), + sdp->sd_log_num_revoke, + atomic_read(&sdp->sd_log_in_flight), + atomic_read(&sdp->sd_log_blks_needed), + atomic_read(&sdp->sd_log_blks_free), + sdp->sd_log_flush_head, + sdp->sd_log_flush_tail, + sdp->sd_log_blks_reserved, + atomic_read(&sdp->sd_log_revokes_available)); + return s; +} + static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); @@ -283,6 +348,7 @@ GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); +GFS2_ATTR(status, 0400, status_show, NULL); static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, @@ -295,6 +361,7 @@ static struct attribute *gfs2_attrs[] = { &gfs2_attr_quota_refresh_user.attr, &gfs2_attr_quota_refresh_group.attr, &gfs2_attr_demote_rq.attr, + &gfs2_attr_status.attr, NULL, }; ATTRIBUTE_GROUPS(gfs2); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 4f034b87b427..3e08027a6c81 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -44,6 +44,7 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) * check_journal_clean - Make sure a journal is clean for a spectator mount * @sdp: The GFS2 superblock * @jd: The journal descriptor + * @verbose: Show more prints in the log * * Returns: 0 if the journal is clean or locked, else an error */ @@ -362,7 +363,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) return -1; } -/** +/* * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false */ @@ -392,7 +393,7 @@ void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, dump_stack(); } -/** +/* * gfs2_assert_warn_i - Print a message to the console if @assertion is false */ @@ -422,7 +423,7 @@ void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, sdp->sd_last_warning = jiffies; } -/** +/* * gfs2_consist_i - Flag a filesystem consistency error and withdraw */ @@ -435,7 +436,7 @@ void gfs2_consist_i(struct gfs2_sbd *sdp, const char *function, gfs2_withdraw(sdp); } -/** +/* * gfs2_consist_inode_i - Flag an inode consistency error and withdraw */ @@ -454,7 +455,7 @@ void gfs2_consist_inode_i(struct gfs2_inode *ip, gfs2_withdraw(sdp); } -/** +/* * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw */ @@ -475,7 +476,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, gfs2_withdraw(sdp); } -/** +/* * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw * Returns: -1 if this call withdrew the machine, * -2 if it was already withdrawn @@ -497,7 +498,7 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, return (me) ? -1 : -2; } -/** +/* * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw * Returns: -1 if this call withdrew the machine, * -2 if it was already withdrawn @@ -519,7 +520,7 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, return (me) ? -1 : -2; } -/** +/* * gfs2_io_error_i - Flag an I/O error and withdraw * Returns: -1 if this call withdrew the machine, * 0 if it was already withdrawn @@ -535,7 +536,7 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, return gfs2_withdraw(sdp); } -/** +/* * gfs2_io_error_bh_i - Flag a buffer I/O error * @withdraw: withdraw the filesystem */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 124b3d5a7266..0c5650fe1fd1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -26,12 +26,9 @@ #include "trans.h" #include "util.h" -/** - * ea_calc_size - returns the acutal number of bytes the request will take up +/* + * ea_calc_size - returns the actual number of bytes the request will take up * (not counting any unstuffed data blocks) - * @sdp: - * @er: - * @size: * * Returns: 1 if the EA should be stuffed */ @@ -219,13 +216,8 @@ static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, return error; } -/** - * ea_dealloc_unstuffed - - * @ip: - * @bh: - * @ea: - * @prev: - * @private: +/* + * ea_dealloc_unstuffed * * Take advantage of the fact that all unstuffed blocks are * allocated from the same RG. But watch, this may not always @@ -448,8 +440,8 @@ ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) } /** - * ea_iter_unstuffed - copies the unstuffed xattr data to/from the - * request buffer + * gfs2_iter_unstuffed - copies the unstuffed xattr data to/from the + * request buffer * @ip: The GFS2 inode * @ea: The extended attribute header structure * @din: The data to be copied in @@ -573,7 +565,7 @@ out: } /** - * gfs2_xattr_get - Get a GFS2 extended attribute + * __gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode * @name: The name of the extended attribute * @buffer: The buffer to write the result into @@ -801,14 +793,11 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, return error; } -/** +/* * ea_init - initializes a new eattr block - * @ip: - * @er: * * Returns: errno */ - static int ea_init(struct gfs2_inode *ip, int type, const char *name, const void *data, size_t size) { @@ -1164,7 +1153,7 @@ static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) /** * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute - * @ip: The inode + * @inode: The inode * @name: The name of the extended attribute * @value: The value of the extended attribute (NULL for remove) * @size: The size of the @value argument diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 03e6c046faf4..84714bbccc12 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -569,6 +569,8 @@ const struct inode_operations hfsplus_dir_inode_operations = { .rename = hfsplus_rename, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, + .fileattr_get = hfsplus_fileattr_get, + .fileattr_set = hfsplus_fileattr_set, }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 12b20479ed2b..1798949f269b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -345,17 +345,6 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) #define hfs_part_find hfsplus_part_find /* - * definitions for ext2 flag ioctls (linux really needs a generic - * interface for this). - */ - -/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support - * chattr/lsattr */ -#define HFSPLUS_IOC_EXT2_GETFLAGS FS_IOC_GETFLAGS -#define HFSPLUS_IOC_EXT2_SETFLAGS FS_IOC_SETFLAGS - - -/* * hfs+-specific ioctl for making the filesystem bootable */ #define HFSPLUS_IOC_BLESS _IO('h', 0x80) @@ -493,6 +482,9 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); +int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int hfsplus_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 078c5c8a5156..8ea447e5c470 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/cred.h> #include <linux/uio.h> +#include <linux/fileattr.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -353,6 +354,8 @@ static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, + .fileattr_get = hfsplus_fileattr_get, + .fileattr_set = hfsplus_fileattr_set, }; static const struct file_operations hfsplus_file_operations = { @@ -628,3 +631,54 @@ out: hfs_find_exit(&fd); return 0; } + +int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags & S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + fileattr_fill_flags(fa, flags); + + return 0; +} + +int hfsplus_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int new_fl = 0; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + /* don't silently ignore unsupported ext2 flags */ + if (fa->flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) + return -EOPNOTSUPP; + + if (fa->flags & FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + + if (fa->flags & FS_APPEND_FL) + new_fl |= S_APPEND; + + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); + + if (fa->flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + + return 0; +} diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 3edb1926d127..5661a2e24d03 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -57,95 +57,11 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) return 0; } -static inline unsigned int hfsplus_getflags(struct inode *inode) -{ - struct hfsplus_inode_info *hip = HFSPLUS_I(inode); - unsigned int flags = 0; - - if (inode->i_flags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - if (inode->i_flags & S_APPEND) - flags |= FS_APPEND_FL; - if (hip->userflags & HFSPLUS_FLG_NODUMP) - flags |= FS_NODUMP_FL; - return flags; -} - -static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) -{ - struct inode *inode = file_inode(file); - unsigned int flags = hfsplus_getflags(inode); - - return put_user(flags, user_flags); -} - -static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) -{ - struct inode *inode = file_inode(file); - struct hfsplus_inode_info *hip = HFSPLUS_I(inode); - unsigned int flags, new_fl = 0; - unsigned int oldflags = hfsplus_getflags(inode); - int err = 0; - - err = mnt_want_write_file(file); - if (err) - goto out; - - if (!inode_owner_or_capable(&init_user_ns, inode)) { - err = -EACCES; - goto out_drop_write; - } - - if (get_user(flags, user_flags)) { - err = -EFAULT; - goto out_drop_write; - } - - inode_lock(inode); - - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (err) - goto out_unlock_inode; - - /* don't silently ignore unsupported ext2 flags */ - if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { - err = -EOPNOTSUPP; - goto out_unlock_inode; - } - - if (flags & FS_IMMUTABLE_FL) - new_fl |= S_IMMUTABLE; - - if (flags & FS_APPEND_FL) - new_fl |= S_APPEND; - - inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); - - if (flags & FS_NODUMP_FL) - hip->userflags |= HFSPLUS_FLG_NODUMP; - else - hip->userflags &= ~HFSPLUS_FLG_NODUMP; - - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - -out_unlock_inode: - inode_unlock(inode); -out_drop_write: - mnt_drop_write_file(file); -out: - return err; -} - long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; switch (cmd) { - case HFSPLUS_IOC_EXT2_GETFLAGS: - return hfsplus_ioctl_getflags(file, argp); - case HFSPLUS_IOC_EXT2_SETFLAGS: - return hfsplus_ioctl_setflags(file, argp); case HFSPLUS_IOC_BLESS: return hfsplus_ioctl_bless(file, argp); default: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 29e407762626..7d0c3dbb2898 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -144,7 +144,7 @@ static char *follow_link(char *link) char *name, *resolved, *end; int n; - name = __getname(); + name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { n = -ENOMEM; goto out_free; @@ -173,12 +173,11 @@ static char *follow_link(char *link) goto out_free; } - __putname(name); - kfree(link); + kfree(name); return resolved; out_free: - __putname(name); + kfree(name); return ERR_PTR(n); } @@ -317,7 +316,7 @@ retry: if (mode & FMODE_WRITE) r = w = 1; - name = dentry_name(d_real(file->f_path.dentry, file->f_inode)); + name = dentry_name(file_dentry(file)); if (name == NULL) return -ENOMEM; @@ -712,7 +711,6 @@ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (name == NULL) goto out_put; - init_special_inode(inode, mode, dev); err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); if (err) goto out_free; diff --git a/fs/inode.c b/fs/inode.c index a047ab306f9a..9e192bea0630 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -12,7 +12,6 @@ #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> -#include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> @@ -2148,7 +2147,7 @@ EXPORT_SYMBOL(init_special_inode); void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, const struct inode *dir, umode_t mode) { - inode->i_uid = fsuid_into_mnt(mnt_userns); + inode_fsuid_set(inode, mnt_userns); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2160,7 +2159,7 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) mode &= ~S_ISGID; } else - inode->i_gid = fsgid_into_mnt(mnt_userns); + inode_fsgid_set(inode, mnt_userns); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); @@ -2314,89 +2313,3 @@ struct timespec64 current_time(struct inode *inode) return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); - -/* - * Generic function to check FS_IOC_SETFLAGS values and reject any invalid - * configurations. - * - * Note: the caller should be holding i_mutex, or else be sure that they have - * exclusive access to the inode structure. - */ -int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags, - unsigned int flags) -{ - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - return fscrypt_prepare_setflags(inode, oldflags, flags); -} -EXPORT_SYMBOL(vfs_ioc_setflags_prepare); - -/* - * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid - * configurations. - * - * Note: the caller should be holding i_mutex, or else be sure that they have - * exclusive access to the inode structure. - */ -int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, - struct fsxattr *fa) -{ - /* - * Can't modify an immutable/append-only file unless we have - * appropriate permission. - */ - if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & - (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - /* - * Project Quota ID state is only allowed to change from within the init - * namespace. Enforce that restriction only if we are trying to change - * the quota ID state. Everything else is allowed in user namespaces. - */ - if (current_user_ns() != &init_user_ns) { - if (old_fa->fsx_projid != fa->fsx_projid) - return -EINVAL; - if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & - FS_XFLAG_PROJINHERIT) - return -EINVAL; - } - - /* Check extent size hints. */ - if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) - return -EINVAL; - - if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && - !S_ISDIR(inode->i_mode)) - return -EINVAL; - - if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && - !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return -EINVAL; - - /* - * It is only valid to set the DAX flag on regular files and - * directories on filesystems. - */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; - - /* Extent size hints of zero turn off the flags. */ - if (fa->fsx_extsize == 0) - fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); - if (fa->fsx_cowextsize == 0) - fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; - - return 0; -} -EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); diff --git a/fs/io-wq.c b/fs/io-wq.c index 0ae9ecadf295..5361a9b4b47b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -16,9 +16,7 @@ #include <linux/rculist_nulls.h> #include <linux/cpu.h> #include <linux/tracehook.h> -#include <linux/freezer.h> -#include "../kernel/sched/sched.h" #include "io-wq.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -69,6 +67,7 @@ struct io_worker { struct io_wqe_acct { unsigned nr_workers; unsigned max_workers; + int index; atomic_t nr_running; }; @@ -109,19 +108,16 @@ struct io_wq { free_work_fn *free_work; io_wq_work_fn *do_work; - struct task_struct *manager; - struct io_wq_hash *hash; refcount_t refs; - struct completion exited; atomic_t worker_refs; struct completion worker_done; struct hlist_node cpuhp_node; - pid_t task_pid; + struct task_struct *task; }; static enum cpuhp_state io_wq_online; @@ -134,8 +130,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match); +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); static bool io_worker_get(struct io_worker *worker) { @@ -148,23 +143,26 @@ static void io_worker_release(struct io_worker *worker) complete(&worker->ref_done); } +static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) +{ + return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; +} + static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, struct io_wq_work *work) { - if (work->flags & IO_WQ_WORK_UNBOUND) - return &wqe->acct[IO_WQ_ACCT_UNBOUND]; - - return &wqe->acct[IO_WQ_ACCT_BOUND]; + return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); } static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) { - struct io_wqe *wqe = worker->wqe; - - if (worker->flags & IO_WORKER_F_BOUND) - return &wqe->acct[IO_WQ_ACCT_BOUND]; + return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); +} - return &wqe->acct[IO_WQ_ACCT_UNBOUND]; +static void io_worker_ref_put(struct io_wq *wq) +{ + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); } static void io_worker_exit(struct io_worker *worker) @@ -194,8 +192,7 @@ static void io_worker_exit(struct io_worker *worker) raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); - if (atomic_dec_and_test(&wqe->wq->worker_refs)) - complete(&wqe->wq->worker_done); + io_worker_ref_put(wqe->wq); do_exit(0); } @@ -210,7 +207,7 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe) /* * Check head of free list for an available worker. If one isn't available, - * caller must wake up the wq manager to create one. + * caller must create one. */ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) __must_hold(RCU) @@ -234,7 +231,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) /* * We need a worker. If we find a free one, we're good. If not, and we're - * below the max number of workers, wake up the manager to create one. + * below the max number of workers, create one. */ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) { @@ -250,8 +247,11 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) ret = io_wqe_activate_free_worker(wqe); rcu_read_unlock(); - if (!ret && acct->nr_workers < acct->max_workers) - wake_up_process(wqe->wq->manager); + if (!ret && acct->nr_workers < acct->max_workers) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + create_io_worker(wqe->wq, wqe, acct->index); + } } static void io_wqe_inc_running(struct io_worker *worker) @@ -261,14 +261,61 @@ static void io_wqe_inc_running(struct io_worker *worker) atomic_inc(&acct->nr_running); } +struct create_worker_data { + struct callback_head work; + struct io_wqe *wqe; + int index; +}; + +static void create_worker_cb(struct callback_head *cb) +{ + struct create_worker_data *cwd; + struct io_wq *wq; + + cwd = container_of(cb, struct create_worker_data, work); + wq = cwd->wqe->wq; + create_io_worker(wq, cwd->wqe, cwd->index); + kfree(cwd); +} + +static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) +{ + struct create_worker_data *cwd; + struct io_wq *wq = wqe->wq; + + /* raced with exit, just ignore create call */ + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) + goto fail; + + cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); + if (cwd) { + init_task_work(&cwd->work, create_worker_cb); + cwd->wqe = wqe; + cwd->index = acct->index; + if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL)) + return; + + kfree(cwd); + } +fail: + atomic_dec(&acct->nr_running); + io_worker_ref_put(wq); +} + static void io_wqe_dec_running(struct io_worker *worker) __must_hold(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; - if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) - io_wqe_wake_worker(wqe, acct); + if (!(worker->flags & IO_WORKER_F_UP)) + return; + + if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + io_queue_worker_create(wqe, acct); + } } /* @@ -281,6 +328,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, { bool worker_bound, work_bound; + BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1); + if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); @@ -293,16 +342,11 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; if (worker_bound != work_bound) { + int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND; io_wqe_dec_running(worker); - if (work_bound) { - worker->flags |= IO_WORKER_F_BOUND; - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; - wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; - } else { - worker->flags &= ~IO_WORKER_F_BOUND; - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; - wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; - } + worker->flags ^= IO_WORKER_F_BOUND; + wqe->acct[index].nr_workers--; + wqe->acct[index ^ 1].nr_workers++; io_wqe_inc_running(worker); } } @@ -386,13 +430,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) return NULL; } -static void io_flush_signals(void) +static bool io_flush_signals(void) { - if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { - if (current->task_works) - task_work_run(); - clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { + __set_current_state(TASK_RUNNING); + tracehook_notify_signal(); + return true; } + return false; } static void io_assign_current_work(struct io_worker *worker, @@ -415,6 +460,7 @@ static void io_worker_handle_work(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { struct io_wq_work *work; @@ -444,6 +490,9 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); + + if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) + work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); io_assign_current_work(worker, NULL); @@ -482,12 +531,13 @@ static int io_wqe_worker(void *data) char buf[TASK_COMM_LEN]; worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - io_wqe_inc_running(worker); - sprintf(buf, "iou-wrk-%d", wq->task_pid); + snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + long ret; + set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); @@ -497,11 +547,18 @@ loop: } __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); - io_flush_signals(); - if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + if (io_flush_signals()) continue; - if (fatal_signal_pending(current)) + ret = schedule_timeout(WORKER_IDLE_TIMEOUT); + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } + if (ret) + continue; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -539,8 +596,7 @@ void io_wq_worker_running(struct task_struct *tsk) /* * Called when worker is going to sleep. If there are no workers currently - * running and we have work pending, wake up a free one or have the manager - * set one up. + * running and we have work pending, wake up a free one or create a new one. */ void io_wq_worker_sleeping(struct task_struct *tsk) { @@ -560,7 +616,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) raw_spin_unlock_irq(&worker->wqe->lock); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; @@ -570,7 +626,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); if (!worker) - return false; + goto fail; refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; @@ -578,14 +634,13 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) spin_lock_init(&worker->lock); init_completion(&worker->ref_done); - atomic_inc(&wq->worker_refs); - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); if (IS_ERR(tsk)) { - if (atomic_dec_and_test(&wq->worker_refs)) - complete(&wq->worker_done); kfree(worker); - return false; +fail: + atomic_dec(&acct->nr_running); + io_worker_ref_put(wq); + return; } tsk->pf_io_worker = worker; @@ -604,20 +659,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) acct->nr_workers++; raw_spin_unlock_irq(&wqe->lock); wake_up_new_task(tsk); - return true; -} - -static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) - __must_hold(wqe->lock) -{ - struct io_wqe_acct *acct = &wqe->acct[index]; - - if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) - return false; - /* if we have available workers or no work, no need */ - if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) - return false; - return acct->nr_workers < acct->max_workers; } /* @@ -652,88 +693,11 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static void io_wq_check_workers(struct io_wq *wq) -{ - int node; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - bool fork_worker[2] = { false, false }; - - if (!node_online(node)) - continue; - - raw_spin_lock_irq(&wqe->lock); - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) - fork_worker[IO_WQ_ACCT_BOUND] = true; - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) - fork_worker[IO_WQ_ACCT_UNBOUND] = true; - raw_spin_unlock_irq(&wqe->lock); - if (fork_worker[IO_WQ_ACCT_BOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); - if (fork_worker[IO_WQ_ACCT_UNBOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); - } -} - static bool io_wq_work_match_all(struct io_wq_work *work, void *data) { return true; } -static void io_wq_cancel_pending(struct io_wq *wq) -{ - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - int node; - - for_each_node(node) - io_wqe_cancel_pending_work(wq->wqes[node], &match); -} - -/* - * Manager thread. Tasked with creating new workers, if we need them. - */ -static int io_wq_manager(void *data) -{ - struct io_wq *wq = data; - char buf[TASK_COMM_LEN]; - int node; - - sprintf(buf, "iou-mgr-%d", wq->task_pid); - set_task_comm(current, buf); - - do { - set_current_state(TASK_INTERRUPTIBLE); - io_wq_check_workers(wq); - schedule_timeout(HZ); - if (fatal_signal_pending(current)) - set_bit(IO_WQ_BIT_EXIT, &wq->state); - } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); - - io_wq_check_workers(wq); - - rcu_read_lock(); - for_each_node(node) - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); - rcu_read_unlock(); - - if (atomic_dec_and_test(&wq->worker_refs)) - complete(&wq->worker_done); - wait_for_completion(&wq->worker_done); - - spin_lock_irq(&wq->hash->wait.lock); - for_each_node(node) - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - - io_wq_cancel_pending(wq); - complete(&wq->exited); - do_exit(0); -} - static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) { struct io_wq *wq = wqe->wq; @@ -765,39 +729,13 @@ append: wq_list_add_after(&work->list, &tail->list, &wqe->work_list); } -static int io_wq_fork_manager(struct io_wq *wq) -{ - struct task_struct *tsk; - - if (wq->manager) - return 0; - - WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state)); - - init_completion(&wq->worker_done); - atomic_set(&wq->worker_refs, 1); - tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); - if (!IS_ERR(tsk)) { - wq->manager = get_task_struct(tsk); - wake_up_new_task(tsk); - return 0; - } - - if (atomic_dec_and_test(&wq->worker_refs)) - complete(&wq->worker_done); - - return PTR_ERR(tsk); -} - static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; unsigned long flags; - /* Can only happen if manager creation fails after exec */ - if (io_wq_fork_manager(wqe->wq) || - test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { + if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { io_run_cancel(work, wqe); return; } @@ -952,17 +890,12 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); - int ret; list_del_init(&wait->entry); rcu_read_lock(); - ret = io_wqe_activate_free_worker(wqe); + io_wqe_activate_free_worker(wqe); rcu_read_unlock(); - - if (!ret) - wake_up_process(wqe->wq->manager); - return 1; } @@ -1003,6 +936,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) goto err; wq->wqes[node] = wqe; wqe->node = alloc_node; + wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND; + wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = @@ -1017,13 +952,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_LIST_HEAD(&wqe->all_list); } - wq->task_pid = current->pid; - init_completion(&wq->exited); + wq->task = get_task_struct(data->task); refcount_set(&wq->refs, 1); - - ret = io_wq_fork_manager(wq); - if (!ret) - return wq; + atomic_set(&wq->worker_refs, 1); + init_completion(&wq->worker_done); + return wq; err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); @@ -1036,14 +969,49 @@ err_wq: return ERR_PTR(ret); } -static void io_wq_destroy_manager(struct io_wq *wq) +static bool io_task_work_match(struct callback_head *cb, void *data) { - if (wq->manager) { - wake_up_process(wq->manager); - wait_for_completion(&wq->exited); - put_task_struct(wq->manager); - wq->manager = NULL; + struct create_worker_data *cwd; + + if (cb->func != create_worker_cb) + return false; + cwd = container_of(cb, struct create_worker_data, work); + return cwd->wqe->wq == data; +} + +static void io_wq_exit_workers(struct io_wq *wq) +{ + struct callback_head *cb; + int node; + + set_bit(IO_WQ_BIT_EXIT, &wq->state); + + if (!wq->task) + return; + + while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { + struct create_worker_data *cwd; + + cwd = container_of(cb, struct create_worker_data, work); + atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); + io_worker_ref_put(wq); + kfree(cwd); } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wqes[node]->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + } + rcu_read_unlock(); + io_worker_ref_put(wq); + wait_for_completion(&wq->worker_done); + put_task_struct(wq->task); + wq->task = NULL; } static void io_wq_destroy(struct io_wq *wq) @@ -1052,12 +1020,15 @@ static void io_wq_destroy(struct io_wq *wq) cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - io_wq_destroy_manager(wq); + io_wq_exit_workers(wq); for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - WARN_ON_ONCE(!wq_list_empty(&wqe->work_list)); + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; + io_wqe_cancel_pending_work(wqe, &match); kfree(wqe); } io_wq_put_hash(wq->hash); @@ -1073,21 +1044,14 @@ void io_wq_put(struct io_wq *wq) void io_wq_put_and_exit(struct io_wq *wq) { - set_bit(IO_WQ_BIT_EXIT, &wq->state); - io_wq_destroy_manager(wq); + io_wq_exit_workers(wq); io_wq_put(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) { - struct task_struct *task = worker->task; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node)); - task->flags |= PF_NO_SETAFFINITY; - task_rq_unlock(rq, task, &rf); + set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node)); + return false; } diff --git a/fs/io-wq.h b/fs/io-wq.h index 1ac2f3248088..0e6d310999e8 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -2,7 +2,6 @@ #define INTERNAL_IO_WQ_H #include <linux/refcount.h> -#include <linux/io_uring.h> struct io_wq; @@ -21,6 +20,15 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) @@ -108,6 +116,7 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; + struct task_struct *task; io_wq_work_fn *do_work; free_work_fn *free_work; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index a4bce17af506..360f81395d81 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,7 +78,6 @@ #include <linux/task_work.h> #include <linux/pagemap.h> #include <linux/io_uring.h> -#include <linux/freezer.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -195,43 +194,56 @@ enum io_uring_cmd_flags { struct io_mapped_ubuf { u64 ubuf; - size_t len; - struct bio_vec *bvec; + u64 ubuf_end; unsigned int nr_bvecs; unsigned long acct_pages; + struct bio_vec bvec[]; }; struct io_ring_ctx; +struct io_overflow_cqe { + struct io_uring_cqe cqe; + struct list_head list; +}; + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + struct io_rsrc_put { struct list_head list; + u64 tag; union { void *rsrc; struct file *file; + struct io_mapped_ubuf *buf; }; }; -struct fixed_rsrc_table { - struct file **files; +struct io_file_table { + /* two level table */ + struct io_fixed_file **files; }; -struct fixed_rsrc_ref_node { +struct io_rsrc_node { struct percpu_ref refs; struct list_head node; struct list_head rsrc_list; - struct fixed_rsrc_data *rsrc_data; - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc); + struct io_rsrc_data *rsrc_data; struct llist_node llist; bool done; }; -struct fixed_rsrc_data { - struct fixed_rsrc_table *table; +typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); + +struct io_rsrc_data { struct io_ring_ctx *ctx; - struct fixed_rsrc_ref_node *node; - struct percpu_ref refs; + u64 *tags; + rsrc_put_fn *do_put; + atomic_t refs; struct completion done; bool quiesce; }; @@ -258,7 +270,8 @@ enum { struct io_sq_data { refcount_t refs; - struct rw_semaphore rw_lock; + atomic_t park_pending; + struct mutex lock; /* ctx's that are using this sqd */ struct list_head ctx_list; @@ -273,6 +286,7 @@ struct io_sq_data { unsigned long state; struct completion exited; + struct callback_head *park_task_work; }; #define IO_IOPOLL_BATCH 8 @@ -329,7 +343,6 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; @@ -387,12 +400,14 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_rsrc_data *file_data; + struct io_rsrc_data *file_data; + struct io_file_table file_table; unsigned nr_user_files; /* if used, fixed mapped user buffers */ + struct io_rsrc_data *buf_data; unsigned nr_user_bufs; - struct io_mapped_ubuf *user_bufs; + struct io_mapped_ubuf **user_bufs; struct user_struct *user; @@ -402,7 +417,7 @@ struct io_ring_ctx { struct socket *ring_sock; #endif - struct idr io_buffer_idr; + struct xarray io_buffers; struct xarray personalities; u32 pers_next; @@ -413,6 +428,7 @@ struct io_ring_ctx { unsigned cq_mask; atomic_t cq_timeouts; unsigned cq_last_tm_flush; + unsigned cq_extra; unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; @@ -432,28 +448,41 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; - - spinlock_t inflight_lock; - struct list_head inflight_list; } ____cacheline_aligned_in_smp; struct delayed_work rsrc_put_work; struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + struct io_rsrc_node *rsrc_node; + struct io_rsrc_node *rsrc_backup_node; struct io_restriction restrictions; /* exit task_work */ struct callback_head *exit_task_work; - struct wait_queue_head hash_wait; - /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; struct list_head tctx_list; }; +struct io_uring_task { + /* submission side */ + struct xarray xa; + struct wait_queue_head wait; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct percpu_counter inflight; + atomic_t inflight_tracked; + atomic_t in_idle; + + spinlock_t task_lock; + struct io_wq_work_list task_list; + unsigned long task_state; + struct callback_head task_work; +}; + /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> @@ -467,9 +496,13 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; -struct io_poll_remove { +struct io_poll_update { struct file *file; - u64 addr; + u64 old_user_data; + u64 new_user_data; + __poll_t events; + bool update_events; + bool update_user_data; }; struct io_close { @@ -539,8 +572,9 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *umsg; - void __user *buf; + struct compat_msghdr __user *umsg_compat; + struct user_msghdr __user *umsg; + void __user *buf; }; int msg_flags; int bgid; @@ -597,7 +631,7 @@ struct io_splice { struct io_provide_buf { struct file *file; __u64 addr; - __s32 len; + __u32 len; __u32 bgid; __u16 nbufs; __u16 bid; @@ -636,7 +670,7 @@ struct io_unlink { struct io_completion { struct file *file; struct list_head list; - int cflags; + u32 cflags; }; struct io_async_connect { @@ -673,13 +707,17 @@ enum { REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_ISREG_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_NO_FILE_TABLE_BIT, REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, + REQ_F_REISSUE_BIT, + REQ_F_DONT_REISSUE_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_ASYNC_READ_BIT, + REQ_F_ASYNC_WRITE_BIT, + REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -709,20 +747,26 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* doesn't need file table for this request */ - REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* linked timeout is active, i.e. prepared by link's head */ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), + /* caller should reissue async */ + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* don't attempt request reissue, see io_rw_reissue() */ + REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), + /* supports async reads */ + REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + /* supports async writes */ + REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), }; struct async_poll { @@ -746,7 +790,7 @@ struct io_kiocb { struct file *file; struct io_rw rw; struct io_poll_iocb poll; - struct io_poll_remove poll_remove; + struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -781,17 +825,14 @@ struct io_kiocb { struct io_ring_ctx *ctx; unsigned int flags; - refcount_t refs; + atomic_t refs; struct task_struct *task; u64 user_data; struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; - /* - * 1. used with ctx->iopoll_list with reads/writes - * 2. to track reqs with ->files (see io_op_def::file_table) - */ + /* used with ctx->iopoll_list with reads/writes */ struct list_head inflight_entry; union { struct io_task_work io_task_work; @@ -801,6 +842,8 @@ struct io_kiocb { struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; }; struct io_tctx_node { @@ -829,8 +872,8 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* must always have async data allocated */ - unsigned needs_async_data : 1; + /* do prep async if is going to be punted */ + unsigned needs_async_setup : 1; /* should block plug */ unsigned plug : 1; /* size of async data needed, if any */ @@ -844,7 +887,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), }, @@ -853,7 +896,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), }, @@ -887,7 +930,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { @@ -895,11 +938,10 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { - .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = { @@ -912,14 +954,13 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_connect), }, [IORING_OP_FALLOCATE] = { @@ -988,40 +1029,31 @@ static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); -static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); -static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); -static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( - struct io_ring_ctx *ctx); -static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -static bool io_rw_reissue(struct io_kiocb *req); -static void io_cqring_fill_event(struct io_kiocb *req, long res); +static void io_uring_cancel_sqpoll(struct io_sq_data *sqd); +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); + +static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); -static void io_double_put_req(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); -static void io_queue_next(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); -static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update *ip, - unsigned nr_args); -static void __io_clean_op(struct io_kiocb *req); +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, + struct io_uring_rsrc_update2 *up, + unsigned nr_args); +static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); -static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool needs_lock); -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, - struct iov_iter *iter, bool force); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_comp_state *cs, struct io_ring_ctx *ctx); +static bool io_poll_remove_waitqs(struct io_kiocb *req); +static int io_req_prep_async(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -1043,49 +1075,49 @@ EXPORT_SYMBOL(io_uring_get_socket); #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) - __io_clean_op(req); -} - -static inline void io_set_resource_node(struct io_kiocb *req) +static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->file_data->node->refs; + req->fixed_rsrc_refs = &ctx->rsrc_node->refs; percpu_ref_get(req->fixed_rsrc_refs); } } +static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) +{ + bool got = percpu_ref_tryget(ref); + + /* already at zero, wait for ->release() */ + if (!got) + wait_for_completion(compl); + percpu_ref_resurrect(ref); + if (got) + percpu_ref_put(ref); +} + static bool io_match_task(struct io_kiocb *head, struct task_struct *task, struct files_struct *files) { struct io_kiocb *req; - if (task && head->task != task) { - /* in terms of cancelation, always match if req task is dead */ - if (head->task->flags & PF_EXITING) - return true; + if (task && head->task != task) return false; - } if (!files) return true; io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; - if (req->task->files == files) - return true; } return false; } static inline void req_set_fail_links(struct io_kiocb *req) { - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; } @@ -1135,7 +1167,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); - idr_init(&ctx->io_buffer_idr); + xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); @@ -1143,8 +1175,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); - spin_lock_init(&ctx->inflight_lock); - INIT_LIST_HEAD(&ctx->inflight_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); @@ -1164,7 +1194,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return seq != ctx->cached_cq_tail + return seq + ctx->cq_extra != ctx->cached_cq_tail + READ_ONCE(ctx->cached_cq_overflow); } @@ -1173,14 +1203,9 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) static void io_req_track_inflight(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); + atomic_inc(¤t->io_uring->inflight_tracked); } } @@ -1192,16 +1217,26 @@ static void io_prep_async_work(struct io_kiocb *req) if (!req->work.creds) req->work.creds = get_current_cred(); + req->work.list.next = NULL; + req->work.flags = 0; if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); - } else { + } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + + switch (req->opcode) { + case IORING_OP_SPLICE: + case IORING_OP_TEE: + if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + break; + } } static void io_prep_async_link(struct io_kiocb *req) @@ -1221,50 +1256,29 @@ static void io_queue_async_work(struct io_kiocb *req) BUG_ON(!tctx); BUG_ON(!tctx->io_wq); - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); /* init ->work of the whole link before punting */ io_prep_async_link(req); + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req) +static void io_kill_timeout(struct io_kiocb *req, int status) + __must_hold(&req->ctx->completion_lock) { struct io_timeout_data *io = req->async_data; - int ret; - ret = hrtimer_try_to_cancel(&io->timer); - if (ret != -1) { + if (hrtimer_try_to_cancel(&io->timer) != -1) { atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, 0); + io_cqring_fill_event(req->ctx, req->user_data, status, 0); io_put_req_deferred(req, 1); } } -/* - * Returns true if we found and killed one or more timeouts - */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { - io_kill_timeout(req); - canceled++; - } - } - spin_unlock_irq(&ctx->completion_lock); - return canceled != 0; -} - static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { @@ -1309,7 +1323,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) break; list_del_init(&req->timeout.list); - io_kill_timeout(req); + io_kill_timeout(req, 0); } while (!list_empty(&ctx->timeout_list)); ctx->cq_last_tm_flush = seq; @@ -1338,7 +1352,7 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned tail; @@ -1357,13 +1371,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { - if (!ctx->cq_ev_fd) + if (likely(!ctx->cq_ev_fd)) return false; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) return false; - if (!ctx->eventfd_async) - return true; - return io_wq_current_is_worker(); + return !ctx->eventfd_async || io_wq_current_is_worker(); } static void io_cqring_ev_posted(struct io_ring_ctx *ctx) @@ -1401,41 +1413,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } /* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, - struct task_struct *tsk, - struct files_struct *files) +static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { struct io_rings *rings = ctx->rings; - struct io_kiocb *req, *tmp; - struct io_uring_cqe *cqe; unsigned long flags; bool all_flushed, posted; - LIST_HEAD(list); if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) return false; posted = false; spin_lock_irqsave(&ctx->completion_lock, flags); - list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { - if (!io_match_task(req, tsk, files)) - continue; + while (!list_empty(&ctx->cq_overflow_list)) { + struct io_uring_cqe *cqe = io_get_cqring(ctx); + struct io_overflow_cqe *ocqe; - cqe = io_get_cqring(ctx); if (!cqe && !force) break; - - list_move(&req->compl.list, &list); - if (cqe) { - WRITE_ONCE(cqe->user_data, req->user_data); - WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->compl.cflags); - } else { - ctx->cached_cq_overflow++; + ocqe = list_first_entry(&ctx->cq_overflow_list, + struct io_overflow_cqe, list); + if (cqe) + memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + else WRITE_ONCE(ctx->rings->cq_overflow, - ctx->cached_cq_overflow); - } + ++ctx->cached_cq_overflow); posted = true; + list_del(&ocqe->list); + kfree(ocqe); } all_flushed = list_empty(&ctx->cq_overflow_list); @@ -1450,19 +1454,10 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, spin_unlock_irqrestore(&ctx->completion_lock, flags); if (posted) io_cqring_ev_posted(ctx); - - while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, compl.list); - list_del(&req->compl.list); - io_put_req(req); - } - return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, - struct task_struct *tsk, - struct files_struct *files) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool ret = true; @@ -1470,7 +1465,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force, tsk, files); + ret = __io_cqring_overflow_flush(ctx, force); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -1478,12 +1473,74 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return ret; } -static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_sub_and_test(refs, &req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_put_and_test(req)); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) +{ + struct io_overflow_cqe *ocqe; + + ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + if (!ocqe) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * or cannot allocate an overflow entry, then we need to drop it + * on the floor. + */ + WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); + return false; + } + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; + } + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + list_add_tail(&ocqe->list, &ctx->cq_overflow_list); + return true; +} + +static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) { - struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, req->user_data, res); + trace_io_uring_complete(ctx, user_data, res, cflags); /* * If we can't get a cq entry, userspace overflowed the @@ -1492,35 +1549,19 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) */ cqe = io_get_cqring(ctx); if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || - atomic_read(&req->task->io_uring->in_idle)) { - /* - * If we're in ring overflow flush mode, or in task cancel mode, - * then we cannot store the request for later flushing, we need - * to drop it on the floor. - */ - ctx->cached_cq_overflow++; - WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow); - } else { - if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->sq_check_overflow); - set_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; - } - io_clean_op(req); - req->result = res; - req->compl.cflags = cflags; - refcount_inc(&req->refs); - list_add_tail(&req->compl.list, &ctx->cq_overflow_list); + return true; } + return io_cqring_event_overflow(ctx, user_data, res, cflags); } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +/* not as hot to bloat with inlining */ +static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) { - __io_cqring_fill_event(req, res, 0); + return __io_cqring_fill_event(ctx, user_data, res, cflags); } static void io_req_complete_post(struct io_kiocb *req, long res, @@ -1530,12 +1571,12 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - __io_cqring_fill_event(req, res, cflags); + __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. */ - if (refcount_dec_and_test(&req->refs)) { + if (req_ref_put_and_test(req)) { struct io_comp_state *cs = &ctx->submit_state.comp; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { @@ -1550,20 +1591,30 @@ static void io_req_complete_post(struct io_kiocb *req, long res, io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; - } else - req = NULL; + } else { + if (!percpu_ref_tryget(&ctx->refs)) + req = NULL; + } io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) + if (req) { + io_cqring_ev_posted(ctx); percpu_ref_put(&ctx->refs); + } +} + +static inline bool io_req_needs_clean(struct io_kiocb *req) +{ + return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | + REQ_F_POLLED | REQ_F_INFLIGHT); } static void io_req_complete_state(struct io_kiocb *req, long res, unsigned int cflags) { - io_clean_op(req); + if (io_req_needs_clean(req)) + io_clean_op(req); req->result = res; req->compl.cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; @@ -1583,34 +1634,50 @@ static inline void io_req_complete(struct io_kiocb *req, long res) __io_req_complete(req, 0, res, 0); } +static void io_req_complete_failed(struct io_kiocb *req, long res) +{ + req_set_fail_links(req); + io_put_req(req); + io_req_complete_post(req, res, 0); +} + +static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, + struct io_comp_state *cs) +{ + spin_lock_irq(&ctx->completion_lock); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; + spin_unlock_irq(&ctx->completion_lock); +} + +/* Returns true IFF there are requests in the cache */ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; struct io_comp_state *cs = &state->comp; - struct io_kiocb *req = NULL; + int nr; /* * If we have more than a batch's worth of requests in our IRQ side * locked cache, grab the lock and move them over to our submission * side cache. */ - if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) { - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); - } + if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) + io_flush_cached_locked_reqs(ctx, cs); + nr = state->free_reqs; while (!list_empty(&cs->free_list)) { - req = list_first_entry(&cs->free_list, struct io_kiocb, - compl.list); + struct io_kiocb *req = list_first_entry(&cs->free_list, + struct io_kiocb, compl.list); + list_del(&req->compl.list); - state->reqs[state->free_reqs++] = req; - if (state->free_reqs == ARRAY_SIZE(state->reqs)) + state->reqs[nr++] = req; + if (nr == ARRAY_SIZE(state->reqs)) break; } - return req != NULL; + state->free_reqs = nr; + return nr != 0; } static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) @@ -1646,37 +1713,28 @@ got_req: return state->reqs[state->free_reqs]; } -static inline void io_put_file(struct io_kiocb *req, struct file *file, - bool fixed) +static inline void io_put_file(struct file *file) { - if (!fixed) + if (file) fput(file); } static void io_dismantle_req(struct io_kiocb *req) { - io_clean_op(req); + unsigned int flags = req->flags; - if (req->async_data) - kfree(req->async_data); - if (req->file) - io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); + if (io_req_needs_clean(req)) + io_clean_op(req); + if (!(flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); + if (req->async_data) + kfree(req->async_data); if (req->work.creds) { put_cred(req->work.creds); req->work.creds = NULL; } - - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - } } /* must to be called somewhat shortly after putting a request */ @@ -1713,7 +1771,6 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = req->link; - bool cancelled = false; /* * Can happen if a linked timeout fired and link had been like @@ -1721,19 +1778,17 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) */ if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { struct io_timeout_data *io = link->async_data; - int ret; io_remove_next_linked(req); link->timeout.head = NULL; - ret = hrtimer_try_to_cancel(&io->timer); - if (ret != -1) { - io_cqring_fill_event(link, -ECANCELED); + if (hrtimer_try_to_cancel(&io->timer) != -1) { + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); io_put_req_deferred(link, 1); - cancelled = true; + return true; } } - req->flags &= ~REQ_F_LINK_TIMEOUT; - return cancelled; + return false; } static void io_fail_links(struct io_kiocb *req) @@ -1747,7 +1802,7 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link, -ECANCELED); + io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); io_put_req_deferred(link, 2); link = nxt; } @@ -1760,7 +1815,8 @@ static bool io_disarm_next(struct io_kiocb *req) if (likely(req->flags & REQ_F_LINK_TIMEOUT)) posted = io_kill_linked_timeout(req); - if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + if (unlikely((req->flags & REQ_F_FAIL_LINK) && + !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); io_fail_links(req); } @@ -1858,13 +1914,17 @@ static void tctx_task_work(struct callback_head *cb) cond_resched(); } -static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, - enum task_work_notify_mode notify) +static int io_req_task_work_add(struct io_kiocb *req) { + struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; + enum task_work_notify_mode notify; struct io_wq_work_node *node, *prev; unsigned long flags; - int ret; + int ret = 0; + + if (unlikely(tsk->flags & PF_EXITING)) + return -ESRCH; WARN_ON_ONCE(!tctx); @@ -1877,14 +1937,23 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, test_and_set_bit(0, &tctx->task_state)) return 0; - if (!task_work_add(tsk, &tctx->task_work, notify)) + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. + */ + notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; + + if (!task_work_add(tsk, &tctx->task_work, notify)) { + wake_up_process(tsk); return 0; + } /* * Slow path - we failed, find and delete work. if the work is not * in the list, it got run and we're fine. */ - ret = 0; spin_lock_irqsave(&tctx->task_lock, flags); wq_list_for_each(node, prev, &tctx->task_list) { if (&req->io_task_work.node == node) { @@ -1898,58 +1967,44 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, return ret; } -static int io_req_task_work_add(struct io_kiocb *req) +static bool io_run_task_work_head(struct callback_head **work_head) { - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - enum task_work_notify_mode notify; - int ret; - - if (tsk->flags & PF_EXITING) - return -ESRCH; + struct callback_head *work, *next; + bool executed = false; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = TWA_NONE; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - notify = TWA_SIGNAL; + do { + work = xchg(work_head, NULL); + if (!work) + break; - ret = io_task_work_add(tsk, req, notify); - if (!ret) - wake_up_process(tsk); + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + executed = true; + } while (1); - return ret; + return executed; } -static void io_req_task_work_add_fallback(struct io_kiocb *req, - task_work_func_t cb) +static void io_task_work_add_head(struct callback_head **work_head, + struct callback_head *task_work) { - struct io_ring_ctx *ctx = req->ctx; struct callback_head *head; - init_task_work(&req->task_work, cb); do { - head = READ_ONCE(ctx->exit_task_work); - req->task_work.next = head; - } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); + head = READ_ONCE(*work_head); + task_work->next = head; + } while (cmpxchg(work_head, head, task_work) != head); } -static void __io_req_task_cancel(struct io_kiocb *req, int error) +static void io_req_task_work_add_fallback(struct io_kiocb *req, + task_work_func_t cb) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - io_cqring_fill_event(req, error); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - req_set_fail_links(req); - io_double_put_req(req); + init_task_work(&req->task_work, cb); + io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); } static void io_req_task_cancel(struct callback_head *cb) @@ -1957,10 +2012,10 @@ static void io_req_task_cancel(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; + /* ctx is guaranteed to stay alive while we hold uring_lock */ mutex_lock(&ctx->uring_lock); - __io_req_task_cancel(req, req->result); + io_req_complete_failed(req, req->result); mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); } static void __io_req_task_submit(struct io_kiocb *req) @@ -1972,7 +2027,7 @@ static void __io_req_task_submit(struct io_kiocb *req) if (!(current->flags & PF_EXITING) && !current->in_execve) __io_queue_sqe(req); else - __io_req_task_cancel(req, -EFAULT); + io_req_complete_failed(req, -EFAULT); mutex_unlock(&ctx->uring_lock); } @@ -1983,27 +2038,21 @@ static void io_req_task_submit(struct callback_head *cb) __io_req_task_submit(req); } -static void io_req_task_queue(struct io_kiocb *req) +static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - int ret; + req->result = ret; + req->task_work.func = io_req_task_cancel; - req->task_work.func = io_req_task_submit; - ret = io_req_task_work_add(req); - if (unlikely(ret)) { - req->result = -ECANCELED; - percpu_ref_get(&req->ctx->refs); + if (unlikely(io_req_task_work_add(req))) io_req_task_work_add_fallback(req, io_req_task_cancel); - } } -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +static void io_req_task_queue(struct io_kiocb *req) { - percpu_ref_get(&req->ctx->refs); - req->result = ret; - req->task_work.func = io_req_task_cancel; + req->task_work.func = io_req_task_submit; if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_req_task_cancel); + io_req_task_queue_fail(req, -ECANCELED); } static inline void io_queue_next(struct io_kiocb *req) @@ -2046,6 +2095,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, struct io_submit_state *state) { io_queue_next(req); + io_dismantle_req(req); if (req->task != rb->task) { if (rb->task) @@ -2056,7 +2106,6 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, rb->task_refs++; rb->ctx_refs++; - io_dismantle_req(req); if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else @@ -2074,7 +2123,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs, spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { req = cs->reqs[i]; - __io_cqring_fill_event(req, req->result, req->compl.cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + req->compl.cflags); } io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); @@ -2084,7 +2134,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs, req = cs->reqs[i]; /* submission and completion refs */ - if (refcount_sub_and_test(2, &req->refs)) + if (req_ref_sub_and_test(req, 2)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2096,20 +2146,20 @@ static void io_submit_flush_completions(struct io_comp_state *cs, * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) +static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; - if (refcount_dec_and_test(&req->refs)) { + if (req_ref_put_and_test(req)) { nxt = io_req_find_next(req); __io_free_req(req); } return nxt; } -static void io_put_req(struct io_kiocb *req) +static inline void io_put_req(struct io_kiocb *req) { - if (refcount_dec_and_test(&req->refs)) + if (req_ref_put_and_test(req)) io_free_req(req); } @@ -2122,27 +2172,17 @@ static void io_put_req_deferred_cb(struct callback_head *cb) static void io_free_req_deferred(struct io_kiocb *req) { - int ret; - req->task_work.func = io_put_req_deferred_cb; - ret = io_req_task_work_add(req); - if (unlikely(ret)) + if (unlikely(io_req_task_work_add(req))) io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) { - if (refcount_sub_and_test(refs, &req->refs)) + if (req_ref_sub_and_test(req, refs)) io_free_req_deferred(req); } -static void io_double_put_req(struct io_kiocb *req) -{ - /* drop both submit and complete references */ - if (refcount_sub_and_test(2, &req->refs)) - io_free_req(req); -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -2213,19 +2253,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN) { + if (READ_ONCE(req->result) == -EAGAIN && + !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - if (io_rw_reissue(req)) - continue; + req_ref_get(req); + io_queue_async_work(req); + continue; } if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); - __io_cqring_fill_event(req, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2284,27 +2326,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a - * non-spinning poll check - we'll still enter the driver poll loop, but only - * as a non-spinning completion check. - */ -static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min) -{ - while (!list_empty(&ctx->iopoll_list) && !need_resched()) { - int ret; - - ret = io_do_iopoll(ctx, nr_events, min); - if (ret < 0) - return ret; - if (*nr_events >= min) - return 0; - } - - return 1; -} - -/* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ @@ -2339,7 +2360,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; - int iters = 0, ret = 0; + int ret = 0; /* * We disallow the app entering submit/complete with polling, but we @@ -2347,18 +2368,17 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * that got punted to a workqueue. */ mutex_lock(&ctx->uring_lock); + /* + * Don't enter poll loop if we already have events pending. + * If we do, we can potentially be spinning for commands that + * already triggered a CQE (eg in error). + */ + if (test_bit(0, &ctx->cq_check_overflow)) + __io_cqring_overflow_flush(ctx, false); + if (io_cqring_events(ctx)) + goto out; do { /* - * Don't enter poll loop if we already have events pending. - * If we do, we can potentially be spinning for commands that - * already triggered a CQE (eg in error). - */ - if (test_bit(0, &ctx->cq_check_overflow)) - __io_cqring_overflow_flush(ctx, false, NULL, NULL); - if (io_cqring_events(ctx)) - break; - - /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets * issued. That app will hold the uring_lock for the duration @@ -2368,18 +2388,17 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ - if (!(++iters & 7)) { + if (list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); - } - - ret = io_iopoll_getevents(ctx, &nr_events, min); - if (ret <= 0) - break; - ret = 0; - } while (min && !nr_events && !need_resched()); + if (list_empty(&ctx->iopoll_list)) + break; + } + ret = io_do_iopoll(ctx, &nr_events, min); + } while (!ret && nr_events < min && !need_resched()); +out: mutex_unlock(&ctx->uring_lock); return ret; } @@ -2391,45 +2410,23 @@ static void kiocb_end_write(struct io_kiocb *req) * thread. */ if (req->flags & REQ_F_ISREG) { - struct inode *inode = file_inode(req->file); + struct super_block *sb = file_inode(req->file)->i_sb; - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + __sb_writers_acquired(sb, SB_FREEZE_WRITE); + sb_end_write(sb); } - file_end_write(req->file); } #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - int rw, ret; - struct iov_iter iter; - - /* already prepared */ - if (req->async_data) - return true; - - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - rw = READ; - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - rw = WRITE; - break; - default: - printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", - req->opcode); - return false; - } + struct io_async_rw *rw = req->async_data; - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - return false; - return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + if (!rw) + return !io_req_prep_async(req); + /* may have left rw->iter inconsistent on -EIOCBQUEUED */ + iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); + return true; } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -2451,38 +2448,32 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return false; return true; } -#endif - -static bool io_rw_reissue(struct io_kiocb *req) +#else +static bool io_resubmit_prep(struct io_kiocb *req) { -#ifdef CONFIG_BLOCK - if (!io_rw_should_reissue(req)) - return false; - - lockdep_assert_held(&req->ctx->uring_lock); - - if (io_resubmit_prep(req)) { - refcount_inc(&req->refs); - io_queue_async_work(req); - return true; - } - req_set_fail_links(req); -#endif return false; } +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + return false; +} +#endif static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { int cflags = 0; - if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req)) - return; - if (res != req->result) - req_set_fail_links(req); - if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); + if (res != req->result) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE; + return; + } + req_set_fail_links(req); + } if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, res, cflags); @@ -2499,27 +2490,18 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); -#ifdef CONFIG_BLOCK - /* Rewind iter, if we have one. iopoll path resubmits as usual */ - if (res == -EAGAIN && io_rw_should_reissue(req)) { - struct io_async_rw *rw = req->async_data; - - if (rw) - iov_iter_revert(&rw->iter, - req->result - iov_iter_count(&rw->iter)); - else if (!io_resubmit_prep(req)) - res = -EIO; - } -#endif - if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - - if (res != -EAGAIN && res != req->result) - req_set_fail_links(req); + if (unlikely(res != req->result)) { + if (!(res == -EAGAIN && io_rw_should_reissue(req) && + io_resubmit_prep(req))) { + req_set_fail_links(req); + req->flags |= REQ_F_DONT_REISSUE; + } + } WRITE_ONCE(req->result, res); - /* order with io_poll_complete() checking ->result */ + /* order with io_iopoll_complete() checking ->result */ smp_wmb(); WRITE_ONCE(req->iopoll_completed, 1); } @@ -2527,7 +2509,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't - * find it from a io_iopoll_getevents() thread before the issuer is done + * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) @@ -2613,7 +2595,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2646,6 +2628,16 @@ static bool io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } +static bool io_file_supports_async(struct io_kiocb *req, int rw) +{ + if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + return true; + else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + return true; + + return __io_file_supports_async(req->file, rw); +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -2654,7 +2646,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (S_ISREG(file_inode(file)->i_mode)) + if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -2696,6 +2688,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) kiocb->ki_complete = io_complete_rw; } + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) { + req->imu = NULL; + io_req_set_rsrc_node(req); + } + req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->buf_index = READ_ONCE(sqe->buf_index); @@ -2728,6 +2726,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; + bool check_reissue = kiocb->ki_complete == io_complete_rw; /* add previously done IO, if any */ if (io && io->bytes_done > 0) { @@ -2743,28 +2742,34 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); + + if (check_reissue && req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (io_resubmit_prep(req)) { + req_ref_get(req); + io_queue_async_work(req); + } else { + int cflags = 0; + + req_set_fail_links(req); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, issue_flags, ret, cflags); + } + } } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) +static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + struct io_mapped_ubuf *imu) { - struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; - struct io_mapped_ubuf *imu; - u16 index, buf_index = req->buf_index; + u64 buf_end, buf_addr = req->rw.addr; size_t offset; - u64 buf_addr; - - if (unlikely(buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(buf_index, ctx->nr_user_bufs); - imu = &ctx->user_bufs[index]; - buf_addr = req->rw.addr; - /* overflow */ - if (buf_addr + len < buf_addr) + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) return -EFAULT; /* @@ -2812,6 +2817,22 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) return 0; } +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_mapped_ubuf *imu = req->imu; + u16 index, buf_index = req->buf_index; + + if (likely(!imu)) { + if (unlikely(buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(buf_index, ctx->nr_user_bufs); + imu = READ_ONCE(ctx->user_bufs[index]); + req->imu = imu; + } + return __io_import_fixed(req, rw, iter, imu); +} + static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) { if (needs_lock) @@ -2843,7 +2864,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, lockdep_assert_held(&req->ctx->uring_lock); - head = idr_find(&req->ctx->io_buffer_idr, bgid); + head = xa_load(&req->ctx->io_buffers, bgid); if (head) { if (!list_empty(&head->list)) { kbuf = list_last_entry(&head->list, struct io_buffer, @@ -2851,7 +2872,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, list_del(&kbuf->list); } else { kbuf = head; - idr_remove(&req->ctx->io_buffer_idr, bgid); + xa_erase(&req->ctx->io_buffers, bgid); } if (*len > kbuf->len) *len = kbuf->len; @@ -3079,29 +3100,21 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } -static inline int __io_alloc_async_data(struct io_kiocb *req) +static inline int io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); return req->async_data == NULL; } -static int io_alloc_async_data(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_async_data) - return 0; - - return __io_alloc_async_data(req); -} - static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) { - if (!force && !io_op_defs[req->opcode].needs_async_data) + if (!force && !io_op_defs[req->opcode].needs_async_setup) return 0; if (!req->async_data) { - if (__io_alloc_async_data(req)) { + if (io_alloc_async_data(req)) { kfree(iovec); return -ENOMEM; } @@ -3161,7 +3174,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); /* submit ref gets dropped, acquire a new one */ - refcount_inc(&req->refs); + req_ref_get(req); io_req_task_queue(req); return 1; } @@ -3246,7 +3259,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, READ)) { + if (force_nonblock && !io_file_supports_async(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3259,11 +3272,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); - if (ret == -EIOCBQUEUED) { - if (req->async_data) - iov_iter_revert(iter, io_size - iov_iter_count(iter)); - goto out_free; - } else if (ret == -EAGAIN) { + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3273,6 +3283,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; + } else if (ret == -EIOCBQUEUED) { + goto out_free; } else if (ret <= 0 || ret == io_size || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) { /* read all, failed, already did sync or don't want to retry */ @@ -3352,7 +3364,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, WRITE)) + if (force_nonblock && !io_file_supports_async(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -3385,6 +3397,11 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } + /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -3394,8 +3411,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) /* no retry on NONBLOCK nor RWF_NOWAIT */ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; - if (ret2 == -EIOCBQUEUED && req->async_data) - iov_iter_revert(iter, io_size - iov_iter_count(iter)); if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) @@ -3568,15 +3583,6 @@ static int __io_splice_prep(struct io_kiocb *req, if (!sp->file_in) return -EBADF; req->flags |= REQ_F_NEED_CLEANUP; - - if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { - /* - * Splice operation will be punted aync, and here need to - * modify io_wq_work.flags, so initialize io_wq_work firstly. - */ - req->work.flags |= IO_WQ_WORK_UNBOUND; - } - return 0; } @@ -3601,7 +3607,8 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) if (sp->len) ret = do_tee(in, out, sp->len, flags); - io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) @@ -3637,7 +3644,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) if (sp->len) ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) @@ -3843,7 +3851,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -3892,7 +3900,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, } i++; kfree(buf); - idr_remove(&ctx->io_buffer_idr, bgid); + xa_erase(&ctx->io_buffers, bgid); return i; } @@ -3910,26 +3918,22 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = idr_find(&ctx->io_buffer_idr, p->bgid); + head = xa_load(&ctx->io_buffers, p->bgid); if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); if (ret < 0) req_set_fail_links(req); - /* need to hold the lock to complete IOPOLL requests */ - if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); - } else { - io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, issue_flags, ret, 0); - } + /* complete before unlock, IOPOLL may need the lock */ + __io_req_complete(req, issue_flags, ret, 0); + io_ring_submit_unlock(ctx, !force_nonblock); return 0; } static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + unsigned long size, tmp_check; struct io_provide_buf *p = &req->pbuf; u64 tmp; @@ -3943,7 +3947,14 @@ static int io_provide_buffers_prep(struct io_kiocb *req, p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); - if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs))) + if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, + &size)) + return -EOVERFLOW; + if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) + return -EOVERFLOW; + + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; p->bgid = READ_ONCE(sqe->buf_group); @@ -3993,32 +4004,19 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + list = head = xa_load(&ctx->io_buffers, p->bgid); ret = io_add_buffers(p, &head); - if (ret < 0) - goto out; - - if (!list) { - ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, - GFP_KERNEL); - if (ret < 0) { + if (ret >= 0 && !list) { + ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); + if (ret < 0) __io_remove_buffers(ctx, head, p->bgid, -1U); - goto out; - } } -out: if (ret < 0) req_set_fail_links(req); - - /* need to hold the lock to complete IOPOLL requests */ - if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); - } else { - io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, issue_flags, ret, 0); - } + /* complete before unlock, IOPOLL may need the lock */ + __io_req_complete(req, issue_flags, ret, 0); + io_ring_submit_unlock(ctx, !force_nonblock); return 0; } @@ -4137,7 +4135,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4164,12 +4162,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) struct io_statx *ctx = &req->statx; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* only need file table for an actual valid fd */ - if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) - req->flags |= REQ_F_NO_FILE_TABLE; + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - } ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, ctx->buffer); @@ -4199,11 +4193,9 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct files_struct *files = current->files; struct io_close *close = &req->close; struct fdtable *fdt; - struct file *file; - int ret; + struct file *file = NULL; + int ret = -EBADF; - file = NULL; - ret = -EBADF; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (close->fd >= fdt->max_fds) { @@ -4211,12 +4203,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) goto err; } file = fdt->fd[close->fd]; - if (!file) { - spin_unlock(&files->file_lock); - goto err; - } - - if (file->f_op == &io_uring_fops) { + if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; goto err; @@ -4314,8 +4301,6 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) { int ret; - if (!io_op_defs[req->opcode].needs_async_data) - return 0; ret = io_sendmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -4329,9 +4314,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -4345,6 +4332,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; + int min_ret = 0; int ret; sock = sock_from_file(req->file); @@ -4360,10 +4348,10 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) } flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (issue_flags & IO_URING_F_NONBLOCK) + if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) @@ -4375,7 +4363,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) + if (ret < min_ret) req_set_fail_links(req); __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4388,6 +4376,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) struct iovec iov; struct socket *sock; unsigned flags; + int min_ret = 0; int ret; sock = sock_from_file(req->file); @@ -4404,10 +4393,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_namelen = 0; flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (issue_flags & IO_URING_F_NONBLOCK) + if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); @@ -4416,7 +4405,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0) + if (ret < min_ret) req_set_fail_links(req); __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4458,16 +4447,14 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; struct compat_iovec __user *uiov; compat_uptr_t ptr; compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->umsg; - ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, - &ptr, &len); + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, + &ptr, &len); if (ret) return ret; @@ -4535,8 +4522,6 @@ static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; - if (!io_op_defs[req->opcode].needs_async_data) - return 0; ret = io_recvmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -4550,10 +4535,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -4568,6 +4555,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct io_buffer *kbuf; unsigned flags; + int min_ret = 0; int ret, cflags = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -4594,10 +4582,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) } flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + if (force_nonblock) flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); @@ -4612,7 +4600,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) + if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) req_set_fail_links(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; @@ -4627,6 +4615,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct iovec iov; unsigned flags; + int min_ret = 0; int ret, cflags = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -4653,10 +4642,10 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = 0; flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + if (force_nonblock) flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); ret = sock_recvmsg(sock, &msg, flags); if (force_nonblock && ret == -EAGAIN) @@ -4666,7 +4655,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) out_free: if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_recv_kbuf(req); - if (ret < 0) + if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) req_set_fail_links(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; @@ -4763,7 +4752,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto out; } - io = req->async_data; memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } @@ -4825,7 +4813,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; req->task_work.func = func; - percpu_ref_get(&req->ctx->refs); /* * If this fails, then the task is exiting. When a task exits, the @@ -4877,6 +4864,7 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) } static void io_poll_remove_double(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { struct io_poll_iocb *poll = io_poll_get_double(req); @@ -4888,20 +4876,37 @@ static void io_poll_remove_double(struct io_kiocb *req) spin_lock(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) - refcount_dec(&req->refs); + req_ref_put(req); poll->head = NULL; spin_unlock(&head->lock); } } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) +static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) + __must_hold(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; + unsigned flags = IORING_CQE_F_MORE; + int error; + + if (READ_ONCE(req->poll.canceled)) { + error = -ECANCELED; + req->poll.events |= EPOLLONESHOT; + } else { + error = mangle_poll(mask); + } + if (req->poll.events & EPOLLONESHOT) + flags = 0; + if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { + io_poll_remove_waitqs(req); + req->poll.done = true; + flags = 0; + } + if (flags & IORING_CQE_F_MORE) + ctx->cq_extra++; - io_poll_remove_double(req); - req->poll.done = true; - io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); + return !(flags & IORING_CQE_F_MORE); } static void io_poll_task_func(struct callback_head *cb) @@ -4913,17 +4918,24 @@ static void io_poll_task_func(struct callback_head *cb) if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); } else { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); - spin_unlock_irq(&ctx->completion_lock); + bool done; - nxt = io_put_req_find_next(req); + done = io_poll_complete(req, req->result); + if (done) { + hash_del(&req->hash_node); + } else { + req->result = 0; + add_wait_queue(req->poll.head, &req->poll.wait); + } + spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (nxt) - __io_req_task_submit(nxt); - } - percpu_ref_put(&ctx->refs); + if (done) { + nxt = io_put_req_find_next(req); + if (nxt) + __io_req_task_submit(nxt); + } + } } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4936,6 +4948,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; + if (!(poll->events & EPOLLONESHOT)) + return poll->wait.func(&poll->wait, mode, sync, key); list_del_init(&wait->entry); @@ -4954,7 +4968,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, poll->wait.func(&poll->wait, mode, sync, key); } } - refcount_dec(&req->refs); + req_ref_put(req); return 1; } @@ -4964,7 +4978,9 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, poll->head = NULL; poll->done = false; poll->canceled = false; - poll->events = events; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, wake_func); } @@ -4988,6 +5004,12 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } + /* + * Can't handle multishot for double wait for now, turn it + * into one-shot mode. + */ + if (!(req->poll.events & EPOLLONESHOT)) + req->poll.events |= EPOLLONESHOT; /* double add on the same waitqueue head, ignore */ if (poll->head == head) return; @@ -4997,7 +5019,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, return; } io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); - refcount_inc(&req->refs); + req_ref_get(req); poll->wait.private = req; *poll_ptr = poll; } @@ -5030,25 +5052,17 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); - percpu_ref_put(&ctx->refs); return; } - /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) - hash_del(&req->hash_node); - + hash_del(&req->hash_node); io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else - __io_req_task_cancel(req, -ECANCELED); - - percpu_ref_put(&ctx->refs); - kfree(apoll->double_poll); - kfree(apoll); + io_req_complete_failed(req, -ECANCELED); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -5101,7 +5115,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, ipt->error = 0; mask = 0; } - if (mask || ipt->error) + if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); @@ -5133,7 +5147,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) else return false; /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req->file, rw)) + if (!io_file_supports_async(req, rw)) return false; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -5144,7 +5158,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; req->apoll = apoll; - mask = 0; + mask = EPOLLONESHOT; if (def->pollin) mask |= POLLIN | POLLRDNORM; if (def->pollout) @@ -5164,8 +5178,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret || ipt.error) { io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); - kfree(apoll->double_poll); - kfree(apoll); return false; } spin_unlock_irq(&ctx->completion_lock); @@ -5175,12 +5187,16 @@ static bool io_arm_poll_handler(struct io_kiocb *req) } static bool __io_poll_remove_one(struct io_kiocb *req, - struct io_poll_iocb *poll) + struct io_poll_iocb *poll, bool do_cancel) + __must_hold(&req->ctx->completion_lock) { bool do_complete = false; + if (!poll->head) + return false; spin_lock(&poll->head->lock); - WRITE_ONCE(poll->canceled, true); + if (do_cancel) + WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; @@ -5190,28 +5206,29 @@ static bool __io_poll_remove_one(struct io_kiocb *req, return do_complete; } -static bool io_poll_remove_one(struct io_kiocb *req) +static bool io_poll_remove_waitqs(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { bool do_complete; io_poll_remove_double(req); + do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - if (req->opcode == IORING_OP_POLL_ADD) { - do_complete = __io_poll_remove_one(req, &req->poll); - } else { - struct async_poll *apoll = req->apoll; - + if (req->opcode != IORING_OP_POLL_ADD && do_complete) { /* non-poll requests have submit ref still */ - do_complete = __io_poll_remove_one(req, &apoll->poll); - if (do_complete) { - io_put_req(req); - kfree(apoll->double_poll); - kfree(apoll); - } + req_ref_put(req); } + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + bool do_complete; + do_complete = io_poll_remove_waitqs(req); if (do_complete) { - io_cqring_fill_event(req, -ECANCELED); + io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail_links(req); io_put_req_deferred(req, 1); @@ -5248,7 +5265,9 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return posted != 0; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, + bool poll_only) + __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; @@ -5257,43 +5276,72 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) hlist_for_each_entry(req, list, hash_node) { if (sqe_addr != req->user_data) continue; - if (io_poll_remove_one(req)) - return 0; - return -EALREADY; + if (poll_only && req->opcode != IORING_OP_POLL_ADD) + continue; + return req; } + return NULL; +} + +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, + bool poll_only) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + req = io_poll_find(ctx, sqe_addr, poll_only); + if (!req) + return -ENOENT; + if (io_poll_remove_one(req)) + return 0; + + return -EALREADY; +} + +static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, + unsigned int flags) +{ + u32 events; - return -ENOENT; + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + if (!(flags & IORING_POLL_ADD_MULTI)) + events |= EPOLLONESHOT; + return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); } -static int io_poll_remove_prep(struct io_kiocb *req, +static int io_poll_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_poll_update *upd = &req->poll_update; + u32 flags; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | + IORING_POLL_ADD_MULTI)) + return -EINVAL; + /* meaningless without update */ + if (flags == IORING_POLL_ADD_MULTI) return -EINVAL; - req->poll_remove.addr = READ_ONCE(sqe->addr); - return 0; -} - -/* - * Find a running poll command that matches one specified in sqe->addr, - * and remove it if found. - */ -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; + upd->old_user_data = READ_ONCE(sqe->addr); + upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, req->poll_remove.addr); - spin_unlock_irq(&ctx->completion_lock); + upd->new_user_data = READ_ONCE(sqe->off); + if (!upd->update_user_data && upd->new_user_data) + return -EINVAL; + if (upd->update_events) + upd->events = io_poll_parse_events(sqe, flags); + else if (sqe->poll32_events) + return -EINVAL; - if (ret < 0) - req_set_fail_links(req); - io_req_complete(req, ret); return 0; } @@ -5317,19 +5365,17 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u32 events; + u32 flags; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | - (events & EPOLLEXCLUSIVE); + poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -5347,17 +5393,80 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask, 0); + io_poll_complete(req, mask); } spin_unlock_irq(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req); + if (poll->events & EPOLLONESHOT) + io_put_req(req); } return ipt.error; } +static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *preq; + bool completing; + int ret; + + spin_lock_irq(&ctx->completion_lock); + preq = io_poll_find(ctx, req->poll_update.old_user_data, true); + if (!preq) { + ret = -ENOENT; + goto err; + } + + if (!req->poll_update.update_events && !req->poll_update.update_user_data) { + completing = true; + ret = io_poll_remove_one(preq) ? 0 : -EALREADY; + goto err; + } + + /* + * Don't allow racy completion with singleshot, as we cannot safely + * update those. For multishot, if we're racing with completion, just + * let completion re-add it. + */ + completing = !__io_poll_remove_one(preq, &preq->poll, false); + if (completing && (preq->poll.events & EPOLLONESHOT)) { + ret = -EALREADY; + goto err; + } + /* we now have a detached poll request. reissue. */ + ret = 0; +err: + if (ret < 0) { + spin_unlock_irq(&ctx->completion_lock); + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; + } + /* only mask one event flags, keep behavior flags */ + if (req->poll_update.update_events) { + preq->poll.events &= ~0xffff; + preq->poll.events |= req->poll_update.events & 0xffff; + preq->poll.events |= IO_POLL_UNMASK; + } + if (req->poll_update.update_user_data) + preq->user_data = req->poll_update.new_user_data; + spin_unlock_irq(&ctx->completion_lock); + + /* complete update request, we're done with it */ + io_req_complete(req, ret); + + if (!completing) { + ret = io_poll_add(preq, issue_flags); + if (ret < 0) { + req_set_fail_links(preq); + io_req_complete(preq, ret); + } + } + return 0; +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5371,7 +5480,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - io_cqring_fill_event(req, -ETIME); + io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5383,30 +5492,29 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) { struct io_timeout_data *io; struct io_kiocb *req; - int ret = -ENOENT; + bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (user_data == req->user_data) { - ret = 0; + found = user_data == req->user_data; + if (found) break; - } } - - if (ret == -ENOENT) - return ERR_PTR(ret); + if (!found) + return ERR_PTR(-ENOENT); io = req->async_data; - ret = hrtimer_try_to_cancel(&io->timer); - if (ret == -1) + if (hrtimer_try_to_cancel(&io->timer) == -1) return ERR_PTR(-EALREADY); list_del_init(&req->timeout.list); return req; } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5414,13 +5522,14 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req); req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED); + io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); io_put_req_deferred(req, 1); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->completion_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; @@ -5486,7 +5595,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); - io_cqring_fill_event(req, ret); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5526,7 +5635,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - io_req_track_inflight(req); + if (is_timeout_link) + io_req_track_inflight(req); return 0; } @@ -5627,27 +5737,23 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, int ret; ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - if (ret != -ENOENT) { - spin_lock_irqsave(&ctx->completion_lock, flags); - goto done; - } - spin_lock_irqsave(&ctx->completion_lock, flags); + if (ret != -ENOENT) + goto done; ret = io_timeout_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_poll_cancel(ctx, sqe_addr); + ret = io_poll_cancel(ctx, sqe_addr, false); done: if (!ret) ret = success_ret; - io_cqring_fill_event(req, ret); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail_links(req); - io_put_req(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5679,7 +5785,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) ret = io_timeout_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_poll_cancel(ctx, sqe_addr); + ret = io_poll_cancel(ctx, sqe_addr, false); if (ret != -ENOENT) goto done; spin_unlock_irq(&ctx->completion_lock); @@ -5690,8 +5796,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - if (!tctx || !tctx->io_wq) - continue; ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); if (ret != -ENOENT) break; @@ -5700,7 +5804,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) spin_lock_irq(&ctx->completion_lock); done: - io_cqring_fill_event(req, ret); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5732,7 +5836,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update up; + struct io_uring_rsrc_update2 up; int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5740,9 +5844,13 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.offset = req->rsrc_update.offset; up.data = req->rsrc_update.arg; + up.nr = 0; + up.tags = 0; + up.resv = 0; mutex_lock(&ctx->uring_lock); - ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up, req->rsrc_update.nr_args); mutex_unlock(&ctx->uring_lock); if (ret < 0) @@ -5767,7 +5875,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); + return io_poll_update_prep(req, sqe); case IORING_OP_FSYNC: return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: @@ -5826,42 +5934,33 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); - return-EINVAL; + return -EINVAL; } static int io_req_prep_async(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].needs_async_setup) + return 0; + if (WARN_ON_ONCE(req->async_data)) + return -EFAULT; + if (io_alloc_async_data(req)) + return -EAGAIN; + switch (req->opcode) { case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: return io_rw_prep_async(req, READ); case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: return io_rw_prep_async(req, WRITE); case IORING_OP_SENDMSG: - case IORING_OP_SEND: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: - case IORING_OP_RECV: return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); } - return 0; -} - -static int io_req_defer_prep(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_async_data) - return 0; - /* some opcodes init it during the inital prep */ - if (req->async_data) - return 0; - if (__io_alloc_async_data(req)) - return -EAGAIN; - return io_req_prep_async(req); + printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", + req->opcode); + return -EFAULT; } static u32 io_get_sequence(struct io_kiocb *req) @@ -5894,7 +5993,7 @@ static int io_req_defer(struct io_kiocb *req) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (ret) return ret; io_prep_async_link(req); @@ -5918,7 +6017,7 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -static void __io_clean_op(struct io_kiocb *req) +static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { switch (req->opcode) { @@ -5957,8 +6056,8 @@ static void __io_clean_op(struct io_kiocb *req) } case IORING_OP_SPLICE: case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(req->splice.file_in); break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: @@ -5975,6 +6074,17 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + req->flags &= ~REQ_F_INFLIGHT; + } } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -6007,7 +6117,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = io_poll_add(req, issue_flags); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); + ret = io_poll_update(req, issue_flags); break; case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, issue_flags); @@ -6145,18 +6255,48 @@ static void io_wq_submit_work(struct io_wq_work *work) /* avoid locking problems by failing it from a clean context */ if (ret) { /* io-wq is going to take one down */ - refcount_inc(&req->refs); + req_ref_get(req); io_req_task_queue_fail(req, ret); } } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + +static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, + unsigned i) +{ + struct io_fixed_file *table_l2; + + table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; + return &table_l2[i & IORING_FILE_TABLE_MASK]; +} + static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { - struct fixed_rsrc_table *table; + struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + return (struct file *) (slot->file_ptr & FFS_MASK); +} + +static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) +{ + unsigned long file_ptr = (unsigned long) file; + + if (__io_file_supports_async(file, READ)) + file_ptr |= FFS_ASYNC_READ; + if (__io_file_supports_async(file, WRITE)) + file_ptr |= FFS_ASYNC_WRITE; + if (S_ISREG(file_inode(file)->i_mode)) + file_ptr |= FFS_ISREG; + file_slot->file_ptr = file_ptr; } static struct file *io_file_get(struct io_submit_state *state, @@ -6166,18 +6306,26 @@ static struct file *io_file_get(struct io_submit_state *state, struct file *file; if (fixed) { + unsigned long file_ptr; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); - file = io_file_from_index(ctx, fd); - io_set_resource_node(req); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); + io_req_set_rsrc_node(req); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); + + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); } - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); return file; } @@ -6197,25 +6345,27 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (prev && refcount_inc_not_zero(&prev->refs)) + if (prev && req_ref_inc_not_zero(prev)) io_remove_next_linked(prev); else prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req_deferred(prev, 1); } else { io_req_complete_post(req, -ETIME, 0); - io_put_req_deferred(req, 1); } + io_put_req_deferred(req, 1); return HRTIMER_NORESTART; } -static void __io_queue_linked_timeout(struct io_kiocb *req) +static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6227,16 +6377,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); - /* drop submission reference */ io_put_req(req); } @@ -6266,15 +6407,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (!io_arm_poll_handler(req)) { - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); - } - } else if (likely(!ret)) { + if (likely(!ret)) { /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; @@ -6286,10 +6419,16 @@ static void __io_queue_sqe(struct io_kiocb *req) } else { io_put_req(req); } + } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + if (!io_arm_poll_handler(req)) { + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + } } else { - req_set_fail_links(req); - io_put_req(req); - io_req_complete(req, ret); + io_req_complete_failed(req, ret); } if (linked_timeout) io_queue_linked_timeout(linked_timeout); @@ -6303,12 +6442,10 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - req_set_fail_links(req); - io_put_req(req); - io_req_complete(req, ret); + io_req_complete_failed(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; io_queue_async_work(req); @@ -6360,12 +6497,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->link = NULL; req->fixed_rsrc_refs = NULL; /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); + atomic_set(&req->refs, 2); req->task = current; req->result = 0; - req->work.list.next = NULL; req->work.creds = NULL; - req->work.flags = 0; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { @@ -6423,15 +6558,13 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: - io_put_req(req); - io_req_complete(req, ret); if (link->head) { /* fail even hard links since we don't submit */ link->head->flags |= REQ_F_FAIL_LINK; - io_put_req(link->head); - io_req_complete(link->head, -ECANCELED); + io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } + io_req_complete_failed(req, ret); return ret; } ret = io_req_prep(req, sqe); @@ -6463,7 +6596,7 @@ fail_req: head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; trace_io_uring_link(ctx, req, head); @@ -6565,12 +6698,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { int submitted = 0; - /* if we have a backlog and couldn't flush it all, return BUSY */ - if (test_bit(0, &ctx->sq_check_overflow)) { - if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL)) - return -EBUSY; - } - /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); @@ -6651,6 +6778,10 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); + /* + * Don't submit if refs are dying, good for io_uring_register(), + * but also it is relied upon by io_ring_exit_work() + */ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); @@ -6668,11 +6799,8 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) struct io_ring_ctx *ctx; unsigned sq_thread_idle = 0; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if (sq_thread_idle < ctx->sq_thread_idle) - sq_thread_idle = ctx->sq_thread_idle; - } - + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); sqd->sq_thread_idle = sq_thread_idle; } @@ -6684,9 +6812,8 @@ static int io_sq_thread(void *data) char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); - sprintf(buf, "iou-sqp-%d", sqd->task_pid); + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); - current->pf_io_worker = NULL; if (sqd->sq_cpu != -1) set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); @@ -6694,22 +6821,33 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); + /* a user may had exited before the thread started */ + io_run_task_work_head(&sqd->park_task_work); while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { - up_read(&sqd->rw_lock); + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + bool did_sig = false; + + mutex_unlock(&sqd->lock); + if (signal_pending(current)) { + struct ksignal ksig; + + did_sig = get_signal(&ksig); + } cond_resched(); - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); + if (did_sig) + break; timeout = jiffies + sqd->sq_thread_idle; continue; } - if (fatal_signal_pending(current)) - break; sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -6732,50 +6870,46 @@ static int io_sq_thread(void *data) continue; } - needs_sched = true; prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_read(&sqd->rw_lock); - schedule(); - down_read(&sqd->rw_lock); + needs_sched = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { + mutex_unlock(&sqd->lock); + schedule(); + mutex_lock(&sqd->lock); + } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } finish_wait(&sqd->wait, &wait); + io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; } - up_read(&sqd->rw_lock); - down_write(&sqd->rw_lock); - /* - * someone may have parked and added a cancellation task_work, run - * it first because we don't want it in io_uring_cancel_sqpoll() - */ - io_run_task_work(); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_cancel_sqpoll(ctx); + io_uring_cancel_sqpoll(sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_write(&sqd->rw_lock); - io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); + mutex_unlock(&sqd->lock); + complete(&sqd->exited); do_exit(0); } @@ -6821,7 +6955,7 @@ static int io_run_task_work_sig(void) return 1; if (!signal_pending(current)) return 0; - if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL)) + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) return -ERESTARTSYS; return -EINTR; } @@ -6867,7 +7001,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret; do { - io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -6899,7 +7033,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) { + if (!io_cqring_overflow_flush(ctx, false)) { ret = -EBUSY; break; } @@ -6915,35 +7049,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) { -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; + unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } -#endif -} - -static void io_rsrc_data_ref_zero(struct percpu_ref *ref) -{ - struct fixed_rsrc_data *data; - - data = container_of(ref, struct fixed_rsrc_data, refs); - complete(&data->done); + for (i = 0; i < nr_tables; i++) + kfree(table->files[i]); + kfree(table->files); + table->files = NULL; } static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) @@ -6956,142 +7069,173 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) spin_unlock_bh(&ctx->rsrc_ref_lock); } -static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_data *rsrc_data, - struct fixed_rsrc_ref_node *ref_node) +static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { - io_rsrc_ref_lock(ctx); - rsrc_data->node = ref_node; - list_add_tail(&ref_node->node, &ctx->rsrc_ref_list); - io_rsrc_ref_unlock(ctx); - percpu_ref_get(&rsrc_data->refs); + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); } -static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) +static void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) { - struct fixed_rsrc_ref_node *ref_node = NULL; + WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - io_rsrc_ref_lock(ctx); - ref_node = data->node; - data->node = NULL; - io_rsrc_ref_unlock(ctx); - if (ref_node) - percpu_ref_kill(&ref_node->refs); + if (data_to_kill) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_node; + + rsrc_node->rsrc_data = data_to_kill; + io_rsrc_ref_lock(ctx); + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); + io_rsrc_ref_unlock(ctx); + + atomic_inc(&data_to_kill->refs); + percpu_ref_kill(&rsrc_node->refs); + ctx->rsrc_node = NULL; + } + + if (!ctx->rsrc_node) { + ctx->rsrc_node = ctx->rsrc_backup_node; + ctx->rsrc_backup_node = NULL; + } +} + +static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (ctx->rsrc_backup_node) + return 0; + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); + return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, - struct io_ring_ctx *ctx, - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc)) +static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { - struct fixed_rsrc_ref_node *backup_node; int ret; + /* As we may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; data->quiesce = true; do { - ret = -ENOMEM; - backup_node = alloc_fixed_rsrc_ref_node(ctx); - if (!backup_node) + ret = io_rsrc_node_switch_start(ctx); + if (ret) break; - backup_node->rsrc_data = data; - backup_node->rsrc_put = rsrc_put; + io_rsrc_node_switch(ctx, data); - io_sqe_rsrc_kill_node(ctx, data); - percpu_ref_kill(&data->refs); + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + break; flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); if (!ret) break; - percpu_ref_resurrect(&data->refs); - io_sqe_rsrc_set_node(ctx, data, backup_node); - backup_node = NULL; + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; - if (backup_node) - destroy_fixed_rsrc_ref_node(backup_node); return ret; } -static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) +static void io_rsrc_data_free(struct io_rsrc_data *data) +{ + kvfree(data->tags); + kfree(data); +} + +static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, + rsrc_put_fn *do_put, + unsigned nr) { - struct fixed_rsrc_data *data; + struct io_rsrc_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; - if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL); + if (!data->tags) { kfree(data); return NULL; } + + atomic_set(&data->refs, 1); data->ctx = ctx; + data->do_put = do_put; init_completion(&data->done); return data; } -static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { - percpu_ref_exit(&data->refs); - kfree(data->table); - kfree(data); +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; + + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } +#endif + io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; } static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - struct fixed_rsrc_data *data = ctx->file_data; - unsigned nr_tables, i; int ret; - /* - * percpu_ref_is_dying() is to stop parallel files unregister - * Since we possibly drop uring lock later in this function to - * run task work. - */ - if (!data || percpu_ref_is_dying(&data->refs)) + if (!ctx->file_data) return -ENXIO; - ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); - if (ret) - return ret; - - __io_sqe_files_unregister(ctx); - nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); - for (i = 0; i < nr_tables; i++) - kfree(data->table[i].files); - free_fixed_rsrc_data(data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; - return 0; + ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + if (!ret) + __io_sqe_files_unregister(ctx); + return ret; } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->rw_lock) + __releases(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); + /* + * Do the dance but not conditional clear_bit() because it'd race with + * other threads incrementing park_pending and setting the bit. + */ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - up_write(&sqd->rw_lock); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_unlock(&sqd->lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->rw_lock) + __acquires(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); + atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - down_write(&sqd->rw_lock); - /* set again for consistency, in case concurrent parks are happening */ - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_lock(&sqd->lock); if (sqd->thread) wake_up_process(sqd->thread); } @@ -7099,18 +7243,21 @@ static void io_sq_thread_park(struct io_sq_data *sqd) static void io_sq_thread_stop(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - down_write(&sqd->rw_lock); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + mutex_lock(&sqd->lock); if (sqd->thread) wake_up_process(sqd->thread); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + io_sq_thread_stop(sqd); kfree(sqd); } @@ -7128,8 +7275,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) io_put_sq_data(sqd); ctx->sq_data = NULL; - if (ctx->sq_creds) - put_cred(ctx->sq_creds); } } @@ -7184,9 +7329,10 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, if (!sqd) return ERR_PTR(-ENOMEM); + atomic_set(&sqd->park_pending, 0); refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); - init_rwsem(&sqd->rw_lock); + mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->exited); return sqd; @@ -7289,34 +7435,32 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data, - unsigned nr_tables, unsigned nr_files) +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - int i; + unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); + + table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL); + if (!table->files) + return false; for (i = 0; i < nr_tables; i++) { - struct fixed_rsrc_table *table = &file_data->table[i]; - unsigned this_files; + unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); - this_files = min(nr_files, IORING_MAX_FILES_TABLE); - table->files = kcalloc(this_files, sizeof(struct file *), + table->files[i] = kcalloc(this_files, sizeof(*table->files[i]), GFP_KERNEL); - if (!table->files) + if (!table->files[i]) break; nr_files -= this_files; } if (i == nr_tables) - return 0; + return true; - for (i = 0; i < nr_tables; i++) { - struct fixed_rsrc_table *table = &file_data->table[i]; - kfree(table->files); - } - return 1; + io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE); + return false; } -static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { struct file *file = prsrc->file; #if defined(CONFIG_UNIX) @@ -7379,21 +7523,35 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) #endif } -static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node) +static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { - struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { list_del(&prsrc->list); - ref_node->rsrc_put(ctx, prsrc); + + if (prsrc->tag) { + bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + unsigned long flags; + + io_ring_submit_lock(ctx, lock_ring); + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, prsrc->tag, 0, 0); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + io_ring_submit_unlock(ctx, lock_ring); + } + + rsrc_data->do_put(ctx, prsrc); kfree(prsrc); } - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); - percpu_ref_put(&rsrc_data->refs); + io_rsrc_node_destroy(ref_node); + if (atomic_dec_and_test(&rsrc_data->refs)) + complete(&rsrc_data->done); } static void io_rsrc_put_work(struct work_struct *work) @@ -7405,63 +7563,42 @@ static void io_rsrc_put_work(struct work_struct *work) node = llist_del_all(&ctx->rsrc_put_llist); while (node) { - struct fixed_rsrc_ref_node *ref_node; + struct io_rsrc_node *ref_node; struct llist_node *next = node->next; - ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist); + ref_node = llist_entry(node, struct io_rsrc_node, llist); __io_rsrc_put_work(ref_node); node = next; } } -static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, - unsigned i) -{ - struct fixed_rsrc_table *table; - - table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - return &table->files[i & IORING_FILE_TABLE_MASK]; -} - static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { - struct fixed_rsrc_ref_node *ref_node; - struct fixed_rsrc_data *data; - struct io_ring_ctx *ctx; + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; bool first_add = false; - int delay = HZ; - - ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs); - data = ref_node->rsrc_data; - ctx = data->ctx; io_rsrc_ref_lock(ctx); - ref_node->done = true; + node->done = true; while (!list_empty(&ctx->rsrc_ref_list)) { - ref_node = list_first_entry(&ctx->rsrc_ref_list, - struct fixed_rsrc_ref_node, node); + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!ref_node->done) + if (!node->done) break; - list_del(&ref_node->node); - first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist); + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } io_rsrc_ref_unlock(ctx); - if (percpu_ref_is_dying(&data->refs)) - delay = 0; - - if (!delay) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0); - else if (first_add) - queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); } -static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( - struct io_ring_ctx *ctx) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { - struct fixed_rsrc_ref_node *ref_node; + struct io_rsrc_node *ref_node; ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) @@ -7478,29 +7615,14 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( return ref_node; } -static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *ref_node) -{ - ref_node->rsrc_data = ctx->file_data; - ref_node->rsrc_put = io_ring_file_put; -} - -static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) + unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables, i; struct file *file; - int fd, ret = -ENOMEM; - struct fixed_rsrc_ref_node *ref_node; - struct fixed_rsrc_data *file_data; + int fd, ret; + unsigned i; + struct io_rsrc_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7508,33 +7630,37 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; - file_data = alloc_fixed_rsrc_data(ctx); + file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args); if (!file_data) return -ENOMEM; ctx->file_data = file_data; - - nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), - GFP_KERNEL); - if (!file_data->table) - goto out_free; - - if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) + ret = -ENOMEM; + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) || + copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; } /* allow sparse sets */ - if (fd == -1) + if (fd == -1) { + ret = -EINVAL; + if (unlikely(tag)) + goto out_fput; continue; + } file = fget(fd); ret = -EBADF; - if (!file) + if (unlikely(!file)) goto out_fput; /* @@ -7548,23 +7674,17 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - *io_fixed_file_slot(file_data, i) = file; + ctx->file_data->tags[i] = tag; + io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); } ret = io_sqe_files_scm(ctx); if (ret) { - io_sqe_files_unregister(ctx); + __io_sqe_files_unregister(ctx); return ret; } - ref_node = alloc_fixed_rsrc_ref_node(ctx); - if (!ref_node) { - io_sqe_files_unregister(ctx); - return -ENOMEM; - } - init_fixed_file_ref_node(ctx, ref_node); - - io_sqe_rsrc_set_node(ctx, file_data, ref_node); + io_rsrc_node_switch(ctx, NULL); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7572,11 +7692,10 @@ out_fput: if (file) fput(file); } - for (i = 0; i < nr_tables; i++) - kfree(file_data->table[i].files); + io_free_file_tables(&ctx->file_table, nr_args); ctx->nr_user_files = 0; out_free: - free_fixed_rsrc_data(ctx->file_data); + io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return ret; } @@ -7624,67 +7743,64 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc) +static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) { struct io_rsrc_put *prsrc; - struct fixed_rsrc_ref_node *ref_node = data->node; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; + prsrc->tag = data->tags[idx]; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &ref_node->rsrc_list); - + list_add(&prsrc->list, &node->rsrc_list); return 0; } -static inline int io_queue_file_removal(struct fixed_rsrc_data *data, - struct file *file) -{ - return io_queue_rsrc_removal(data, (void *)file); -} - static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update *up, + struct io_uring_rsrc_update2 *up, unsigned nr_args) { - struct fixed_rsrc_data *data = ctx->file_data; - struct fixed_rsrc_ref_node *ref_node; - struct file *file, **file_slot; - __s32 __user *fds; - int fd, i, err; - __u32 done; + u64 __user *tags = u64_to_user_ptr(up->tags); + __s32 __user *fds = u64_to_user_ptr(up->data); + struct io_rsrc_data *data = ctx->file_data; + struct io_fixed_file *file_slot; + struct file *file; + int fd, i, err = 0; + unsigned int done; bool needs_switch = false; - if (check_add_overflow(up->offset, nr_args, &done)) - return -EOVERFLOW; - if (done > ctx->nr_user_files) + if (!ctx->file_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_files) return -EINVAL; - ref_node = alloc_fixed_rsrc_ref_node(ctx); - if (!ref_node) - return -ENOMEM; - init_fixed_file_ref_node(ctx, ref_node); - - fds = u64_to_user_ptr(up->data); for (done = 0; done < nr_args; done++) { - err = 0; - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || + copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } + if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { + err = -EINVAL; + break; + } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(ctx->file_data, i); + file_slot = io_fixed_file_slot(&ctx->file_table, i); - if (*file_slot) { - err = io_queue_file_removal(data, *file_slot); + if (file_slot->file_ptr) { + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + err = io_queue_rsrc_removal(data, up->offset + done, + ctx->rsrc_node, file); if (err) break; - *file_slot = NULL; + file_slot->file_ptr = 0; needs_switch = true; } if (fd != -1) { @@ -7706,42 +7822,22 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *file_slot = file; + data->tags[up->offset + done] = tag; + io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { - *file_slot = NULL; + file_slot->file_ptr = 0; fput(file); break; } } } - if (needs_switch) { - percpu_ref_kill(&data->node->refs); - io_sqe_rsrc_set_node(ctx, data, ref_node); - } else - destroy_fixed_rsrc_ref_node(ref_node); - + if (needs_switch) + io_rsrc_node_switch(ctx, data); return done ? done : err; } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update up; - - if (!ctx->file_data) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - - return __io_sqe_files_update(ctx, &up, nr_args); -} - static struct io_wq_work *io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -7750,7 +7846,8 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, + struct task_struct *task) { struct io_wq_hash *hash; struct io_wq_data data; @@ -7767,6 +7864,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) } data.hash = hash; + data.task = task; data.free_work = io_free_work; data.do_work = io_wq_submit_work; @@ -7792,7 +7890,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, return ret; } - tctx->io_wq = io_init_wq_offload(ctx); + tctx->io_wq = io_init_wq_offload(ctx, task); if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); @@ -7804,6 +7902,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -7837,21 +7936,15 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, f = fdget(p->wq_fd); if (!f.file) return -ENXIO; - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return -EINVAL; - } fdput(f); + if (f.file->f_op != &io_uring_fops) + return -EINVAL; } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; struct io_sq_data *sqd; bool attached; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) - goto err; - sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); @@ -7864,34 +7957,24 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; - ret = 0; io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); /* don't attach to a dying SQPOLL thread, would be racy */ - if (attached && !sqd->thread) { - ret = -ENXIO; - } else { - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - } + ret = (attached && !sqd->thread) ? -ENXIO : 0; io_sq_thread_unpark(sqd); - if (ret < 0) { - io_put_sq_data(sqd); - ctx->sq_data = NULL; - return ret; - } else if (attached) { + if (ret < 0) + goto err; + if (attached) return 0; - } if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; ret = -EINVAL; - if (cpu >= nr_cpu_ids) + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) goto err_sqpoll; - if (!cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; @@ -7917,12 +8000,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } return 0; +err_sqpoll: + complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); return ret; -err_sqpoll: - complete(&ctx->sq_data->exited); - goto err; } static inline void __io_unaccount_mem(struct user_struct *user, @@ -8024,29 +8106,49 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return off; } -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) { - int i, j; + struct io_mapped_ubuf *imu = *slot; + unsigned int i; - if (!ctx->user_bufs) - return -ENXIO; - - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + *slot = NULL; +} - for (j = 0; j < imu->nr_bvecs; j++) - unpin_user_page(imu->bvec[j].bv_page); +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + io_buffer_unmap(ctx, &prsrc->buf); + prsrc->buf = NULL; +} - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu->bvec); - imu->nr_bvecs = 0; - } +static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned int i; + for (i = 0; i < ctx->nr_user_bufs; i++) + io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); + kfree(ctx->buf_data); ctx->user_bufs = NULL; + ctx->buf_data = NULL; ctx->nr_user_bufs = 0; - return 0; +} + +static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + int ret; + + if (!ctx->buf_data) + return -ENXIO; + + ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); + if (!ret) + __io_sqe_buffers_unregister(ctx); + return ret; } static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, @@ -8098,7 +8200,7 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, /* check previously registered pages */ for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + struct io_mapped_ubuf *imu = ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) @@ -8143,9 +8245,10 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf *imu, + struct io_mapped_ubuf **pimu, struct page **last_hpage) { + struct io_mapped_ubuf *imu = NULL; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; unsigned long off, start, end, ubuf; @@ -8157,6 +8260,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; + *pimu = NULL; ret = -ENOMEM; pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); @@ -8168,9 +8272,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - if (!imu->bvec) + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) goto done; ret = 0; @@ -8199,14 +8302,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, */ if (pret > 0) unpin_user_pages(pages, pret); - kvfree(imu->bvec); goto done; } ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); if (ret) { unpin_user_pages(pages, pret); - kvfree(imu->bvec); goto done; } @@ -8224,10 +8325,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, } /* store original address for later verification */ imu->ubuf = ubuf; - imu->len = iov->iov_len; + imu->ubuf_end = ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; + *pimu = imu; ret = 0; done: + if (ret) + kvfree(imu); kvfree(pages); kvfree(vmas); return ret; @@ -8235,21 +8339,14 @@ done: static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) { - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > UIO_MAXIOV) - return -EINVAL; - - ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf), - GFP_KERNEL); - if (!ctx->user_bufs) - return -ENOMEM; - - return 0; + ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); + return ctx->user_bufs ? 0 : -ENOMEM; } static int io_buffer_validate(struct iovec *iov) { + unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -8262,44 +8359,123 @@ static int io_buffer_validate(struct iovec *iov) if (iov->iov_len > SZ_1G) return -EFAULT; + if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; } static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args) + unsigned int nr_args, u64 __user *tags) { + struct page *last_hpage = NULL; + struct io_rsrc_data *data; int i, ret; struct iovec iov; - struct page *last_hpage = NULL; - ret = io_buffers_map_alloc(ctx, nr_args); + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > UIO_MAXIOV) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; + data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args); + if (!data) + return -ENOMEM; + ret = io_buffers_map_alloc(ctx, nr_args); + if (ret) { + kfree(data); + return ret; + } - for (i = 0; i < nr_args; i++) { - struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + u64 tag = 0; + if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) { + ret = -EFAULT; + break; + } ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; - ret = io_buffer_validate(&iov); if (ret) break; - ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage); + ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + &last_hpage); if (ret) break; - - ctx->nr_user_bufs++; + data->tags[i] = tag; } - if (ret) - io_sqe_buffers_unregister(ctx); + WARN_ON_ONCE(ctx->buf_data); + ctx->buf_data = data; + if (ret) + __io_sqe_buffers_unregister(ctx); + else + io_rsrc_node_switch(ctx, NULL); return ret; } +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned int nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct page *last_hpage = NULL; + bool needs_switch = false; + __u32 done; + int i, err; + + if (!ctx->buf_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_bufs) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + struct io_mapped_ubuf *imu; + int offset = up->offset + done; + u64 tag = 0; + + err = io_copy_iov(ctx, &iov, iovs, done); + if (err) + break; + if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { + err = -EFAULT; + break; + } + err = io_buffer_validate(&iov); + if (err) + break; + err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + if (err) + break; + + i = array_index_nospec(offset, ctx->nr_user_bufs); + if (ctx->user_bufs[i]) { + err = io_queue_rsrc_removal(ctx->buf_data, offset, + ctx->rsrc_node, ctx->user_bufs[i]); + if (unlikely(err)) { + io_buffer_unmap(ctx, &imu); + break; + } + ctx->user_bufs[i] = NULL; + needs_switch = true; + } + + ctx->user_bufs[i] = imu; + ctx->buf_data->tags[offset] = tag; + } + + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->buf_data); + return done ? done : err; +} + static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) { __s32 __user *fds = arg; @@ -8332,19 +8508,13 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static int __io_destroy_buffers(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - struct io_buffer *buf = p; - - __io_remove_buffers(ctx, buf, id, -1U); - return 0; -} - static void io_destroy_buffers(struct io_ring_ctx *ctx) { - idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); - idr_destroy(&ctx->io_buffer_idr); + struct io_buffer *buf; + unsigned long index; + + xa_for_each(&ctx->io_buffers, index, buf) + __io_remove_buffers(ctx, buf, index, -1U); } static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) @@ -8372,28 +8542,23 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) submit_state->free_reqs = 0; } - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); - + io_flush_cached_locked_reqs(ctx, cs); io_req_cache_free(&cs->free_list, NULL); - mutex_unlock(&ctx->uring_lock); } -static void io_ring_ctx_free(struct io_ring_ctx *ctx) +static bool io_wait_rsrc_data(struct io_rsrc_data *data) { - /* - * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock, see - * __io_req_task_submit(). Wait for them to finish. - */ - mutex_lock(&ctx->uring_lock); - mutex_unlock(&ctx->uring_lock); + if (!data) + return false; + if (!atomic_dec_and_test(&data->refs)) + wait_for_completion(&data->done); + return true; +} +static void io_ring_ctx_free(struct io_ring_ctx *ctx) +{ io_sq_thread_finish(ctx); - io_sqe_buffers_unregister(ctx); if (ctx->mm_account) { mmdrop(ctx->mm_account); @@ -8401,10 +8566,27 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } mutex_lock(&ctx->uring_lock); - io_sqe_files_unregister(ctx); + if (io_wait_rsrc_data(ctx->buf_data)) + __io_sqe_buffers_unregister(ctx); + if (io_wait_rsrc_data(ctx->file_data)) + __io_sqe_files_unregister(ctx); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); + if (ctx->sq_creds) + put_cred(ctx->sq_creds); + + /* there are no registered resources left, nobody uses it */ + if (ctx->rsrc_node) + io_rsrc_node_destroy(ctx->rsrc_node); + if (ctx->rsrc_backup_node) + io_rsrc_node_destroy(ctx->rsrc_backup_node); + flush_delayed_work(&ctx->rsrc_put_work); + + WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); + WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -8478,26 +8660,9 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) +static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { - struct callback_head *work, *next; - bool executed = false; - - do { - work = xchg(&ctx->exit_task_work, NULL); - if (!work) - break; - - do { - next = work->next; - work->func(work); - work = next; - cond_resched(); - } while (work); - executed = true; - } while (1); - - return executed; + return io_run_task_work_head(&ctx->exit_task_work); } struct io_tctx_exit { @@ -8521,6 +8686,13 @@ static void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); @@ -8537,19 +8709,38 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + if (ctx->sq_data) { + struct io_sq_data *sqd = ctx->sq_data; + struct task_struct *tsk; + + io_sq_thread_park(sqd); + tsk = sqd->thread; + if (tsk && tsk->io_uring && tsk->io_uring->io_wq) + io_wq_cancel_cb(tsk->io_uring->io_wq, + io_cancel_ctx_cb, ctx, true); + io_sq_thread_unpark(sqd); + } WARN_ON_ONCE(time_after(jiffies, timeout)); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + init_completion(&exit.completion); + init_task_work(&exit.task_work, io_tctx_exit_cb); + exit.ctx = ctx; + /* + * Some may use context even when all refs and requests have been put, + * and they are free to do so while still holding uring_lock or + * completion_lock, see __io_req_task_submit(). Apart from other work, + * this lock/unlock section also waits them to finish. + */ mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); - exit.ctx = ctx; - init_completion(&exit.completion); - init_task_work(&exit.task_work, io_tctx_exit_cb); + /* don't spin on a single task if cancellation failed */ + list_rotate_left(&ctx->tctx_list); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; @@ -8557,14 +8748,37 @@ static void io_ring_exit_work(struct work_struct *work) mutex_unlock(&ctx->uring_lock); wait_for_completion(&exit.completion); - cond_resched(); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); + spin_lock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->completion_lock); io_ring_ctx_free(ctx); } +/* Returns true if we found and killed one or more timeouts */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) +{ + struct io_kiocb *req, *tmp; + int canceled = 0; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_match_task(req, tsk, files)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + if (canceled != 0) + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; @@ -8572,10 +8786,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); - /* if force is set, the ring is going away. always drop after that */ - ctx->cq_overflow_flushed = 1; if (ctx->rings) - __io_cqring_overflow_flush(ctx, true, NULL, NULL); + __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); @@ -8651,21 +8863,12 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); - req_set_fail_links(de->req); - io_put_req(de->req); - io_req_complete(de->req, -ECANCELED); + io_req_complete_failed(de->req, -ECANCELED); kfree(de); } return true; } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; @@ -8727,53 +8930,13 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); ret |= io_run_ctx_fallback(ctx); - io_cqring_overflow_flush(ctx, true, task, files); if (!ret) break; cond_resched(); } } -static int io_uring_count_inflight(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) -{ - struct io_kiocb *req; - int cnt = 0; - - spin_lock_irq(&ctx->inflight_lock); - list_for_each_entry(req, &ctx->inflight_list, inflight_entry) - cnt += io_match_task(req, task, files); - spin_unlock_irq(&ctx->inflight_lock); - return cnt; -} - -static void io_uring_cancel_files(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) -{ - while (!list_empty_careful(&ctx->inflight_list)) { - DEFINE_WAIT(wait); - int inflight; - - inflight = io_uring_count_inflight(ctx, task, files); - if (!inflight) - break; - - io_uring_try_cancel_requests(ctx, task, files); - - prepare_to_wait(&task->io_uring->wait, &wait, - TASK_UNINTERRUPTIBLE); - if (inflight == io_uring_count_inflight(ctx, task, files)) - schedule(); - finish_wait(&task->io_uring->wait, &wait); - } -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. - */ -static int io_uring_add_task_file(struct io_ring_ctx *ctx) +static int __io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -8785,33 +8948,41 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx) return ret; tctx = current->io_uring; } - if (tctx->last != ctx) { - void *old = xa_load(&tctx->xa, (unsigned long)ctx); - - if (!old) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } + if (!xa_load(&tctx->xa, (unsigned long)ctx)) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; } - tctx->last = ctx; + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); } + tctx->last = ctx; return 0; } /* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static inline int io_uring_add_task_file(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (likely(tctx && tctx->last == ctx)) + return 0; + return __io_uring_add_task_file(ctx); +} + +/* * Remove this io_uring_file -> task mapping. */ static void io_uring_del_task_file(unsigned long index) @@ -8850,86 +9021,48 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } -static s64 tctx_inflight(struct io_uring_task *tctx) +static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { + if (tracked) + return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } -static void io_sqpoll_cancel_cb(struct callback_head *cb) -{ - struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work); - struct io_ring_ctx *ctx = work->ctx; - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd->thread) - io_uring_cancel_sqpoll(ctx); - complete(&work->completion); -} - -static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - struct io_tctx_exit work = { .ctx = ctx, }; - struct task_struct *task; - - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - task = sqd->thread; - if (task) { - init_completion(&work.completion); - init_task_work(&work.task_work, io_sqpoll_cancel_cb); - WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); - wake_up_process(task); - } - io_sq_thread_unpark(sqd); - - if (task) - wait_for_completion(&work.completion); -} - -void __io_uring_files_cancel(struct files_struct *files) +static void io_uring_try_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; unsigned long index; - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, node) { struct io_ring_ctx *ctx = node->ctx; - if (ctx->sq_data) { - io_sqpoll_cancel_sync(ctx); - continue; - } - io_uring_cancel_files(ctx, current, files); - if (!files) - io_uring_try_cancel_requests(ctx, current, NULL); + /* sqpoll task will cancel all its requests */ + if (!ctx->sq_data) + io_uring_try_cancel_requests(ctx, current, files); } - atomic_dec(&tctx->in_idle); - - if (files) - io_uring_clean_tctx(tctx); } /* should only be called by SQPOLL task */ -static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) +static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) { - struct io_sq_data *sqd = ctx->sq_data; struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx; s64 inflight; DEFINE_WAIT(wait); - WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current); + if (!current->io_uring) + return; + WARN_ON_ONCE(!sqd || sqd->thread != current); atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx); + inflight = tctx_inflight(tctx, false); if (!inflight) break; - io_uring_try_cancel_requests(ctx, current, NULL); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_try_cancel_requests(ctx, current, NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -8937,7 +9070,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) * avoids a race where a completion comes in before we did * prepare_to_wait(). */ - if (inflight == tctx_inflight(tctx)) + if (inflight == tctx_inflight(tctx, false)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); @@ -8948,7 +9081,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) * Find any io_uring fd that this task has registered or done IO on, and cancel * requests. */ -void __io_uring_task_cancel(void) +void __io_uring_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; DEFINE_WAIT(wait); @@ -8958,11 +9091,10 @@ void __io_uring_task_cancel(void) atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx); + inflight = tctx_inflight(tctx, !!files); if (!inflight) break; - __io_uring_files_cancel(NULL); - + io_uring_try_cancel(files); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -8970,16 +9102,17 @@ void __io_uring_task_cancel(void) * avoids a race where a completion comes in before we did * prepare_to_wait(). */ - if (inflight == tctx_inflight(tctx)) + if (inflight == tctx_inflight(tctx, !!files)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); - atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); - /* all current's requests should be gone, we can kill tctx */ - __io_uring_free(current); + if (!files) { + /* for exec all current's requests should be gone, kill tctx */ + __io_uring_free(current); + } } static void *io_uring_validate_mmap_request(struct file *file, @@ -9105,31 +9238,31 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - long ret = -EBADF; int submitted = 0; struct fd f; + long ret; io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) + if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) return -EINVAL; f = fdget(fd); - if (!f.file) + if (unlikely(!f.file)) return -EBADF; ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) + if (unlikely(f.file->f_op != &io_uring_fops)) goto out_fput; ret = -ENXIO; ctx = f.file->private_data; - if (!percpu_ref_tryget(&ctx->refs)) + if (unlikely(!percpu_ref_tryget(&ctx->refs))) goto out_fput; ret = -EBADFD; - if (ctx->flags & IORING_SETUP_R_DISABLED) + if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; /* @@ -9139,7 +9272,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_cqring_overflow_flush(ctx, false); ret = -EOWNERDEAD; if (unlikely(ctx->sq_data->thread == NULL)) { @@ -9252,7 +9385,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = *io_fixed_file_slot(ctx->file_data, i); + struct file *f = io_file_from_index(ctx, i); if (f) seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); @@ -9261,10 +9394,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) } seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; + struct io_mapped_ubuf *buf = ctx->user_bufs[i]; + unsigned int len = buf->ubuf_end - buf->ubuf; - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, - (unsigned int) buf->len); + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); } if (has_lock && !xa_empty(&ctx->personalities)) { unsigned long index; @@ -9473,6 +9606,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ret = io_sq_offload_create(ctx, p); if (ret) goto err; + /* always set a rsrc node */ + io_rsrc_node_switch_start(ctx); + io_rsrc_node_switch(ctx, NULL); memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); @@ -9698,14 +9834,96 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + __u32 tmp; + int err; + + if (up->resv) + return -EINVAL; + if (check_add_overflow(up->offset, nr_args, &tmp)) + return -EOVERFLOW; + err = io_rsrc_node_switch_start(ctx); + if (err) + return err; + + switch (type) { + case IORING_RSRC_FILE: + return __io_sqe_files_update(ctx, up, nr_args); + case IORING_RSRC_BUFFER: + return __io_sqe_buffers_update(ctx, up, nr_args); + } + return -EINVAL; +} + +static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update2 up; + + if (!nr_args) + return -EINVAL; + memset(&up, 0, sizeof(up)); + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT; + return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); +} + +static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size) +{ + struct io_uring_rsrc_update2 up; + + if (size != sizeof(up)) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (!up.nr) + return -EINVAL; + return __io_register_rsrc_update(ctx, up.type, &up, up.nr); +} + +static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size) +{ + struct io_uring_rsrc_register rr; + + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + + memset(&rr, 0, sizeof(rr)); + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + if (!rr.nr) + return -EINVAL; + + switch (rr.type) { + case IORING_RSRC_FILE: + return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + case IORING_RSRC_BUFFER: + return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + } + return -EINVAL; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { + case IORING_REGISTER_BUFFERS: + case IORING_UNREGISTER_BUFFERS: + case IORING_REGISTER_FILES: case IORING_UNREGISTER_FILES: case IORING_REGISTER_FILES_UPDATE: case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: + case IORING_REGISTER_RSRC: + case IORING_REGISTER_RSRC_UPDATE: return false; default: return true; @@ -9727,6 +9945,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); + if (!test_bit(opcode, ctx->restrictions.register_op)) + return -EACCES; + } + if (io_register_op_must_quiesce(opcode)) { percpu_ref_kill(&ctx->refs); @@ -9747,30 +9973,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ret < 0) break; } while (1); - mutex_lock(&ctx->uring_lock); if (ret) { - percpu_ref_resurrect(&ctx->refs); - goto out_quiesce; - } - } - - if (ctx->restricted) { - if (opcode >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - if (!test_bit(opcode, ctx->restrictions.register_op)) { - ret = -EACCES; - goto out; + io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + return ret; } } switch (opcode) { case IORING_REGISTER_BUFFERS: - ret = io_sqe_buffers_register(ctx, arg, nr_args); + ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: ret = -EINVAL; @@ -9779,7 +9992,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: - ret = io_sqe_files_register(ctx, arg, nr_args); + ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: ret = -EINVAL; @@ -9788,7 +10001,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: - ret = io_sqe_files_update(ctx, arg, nr_args); + ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: case IORING_REGISTER_EVENTFD_ASYNC: @@ -9836,16 +10049,20 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; + case IORING_REGISTER_RSRC: + ret = io_register_rsrc(ctx, arg, nr_args); + break; + case IORING_REGISTER_RSRC_UPDATE: + ret = io_register_rsrc_update(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } -out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -out_quiesce: reinit_completion(&ctx->ref_comp); } return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 4e6cc0a7d69c..1e2204fa9963 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -19,6 +19,9 @@ #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> +#include <linux/mount.h> +#include <linux/fscrypt.h> +#include <linux/fileattr.h> #include "internal.h" @@ -657,6 +660,307 @@ out: return ret; } +/** + * fileattr_fill_xflags - initialize fileattr with xflags + * @fa: fileattr pointer + * @xflags: FS_XFLAG_* flags + * + * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All + * other fields are zeroed. + */ +void fileattr_fill_xflags(struct fileattr *fa, u32 xflags) +{ + memset(fa, 0, sizeof(*fa)); + fa->fsx_valid = true; + fa->fsx_xflags = xflags; + if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) + fa->flags |= FS_IMMUTABLE_FL; + if (fa->fsx_xflags & FS_XFLAG_APPEND) + fa->flags |= FS_APPEND_FL; + if (fa->fsx_xflags & FS_XFLAG_SYNC) + fa->flags |= FS_SYNC_FL; + if (fa->fsx_xflags & FS_XFLAG_NOATIME) + fa->flags |= FS_NOATIME_FL; + if (fa->fsx_xflags & FS_XFLAG_NODUMP) + fa->flags |= FS_NODUMP_FL; + if (fa->fsx_xflags & FS_XFLAG_DAX) + fa->flags |= FS_DAX_FL; + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + fa->flags |= FS_PROJINHERIT_FL; +} +EXPORT_SYMBOL(fileattr_fill_xflags); + +/** + * fileattr_fill_flags - initialize fileattr with flags + * @fa: fileattr pointer + * @flags: FS_*_FL flags + * + * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). + * All other fields are zeroed. + */ +void fileattr_fill_flags(struct fileattr *fa, u32 flags) +{ + memset(fa, 0, sizeof(*fa)); + fa->flags_valid = true; + fa->flags = flags; + if (fa->flags & FS_SYNC_FL) + fa->fsx_xflags |= FS_XFLAG_SYNC; + if (fa->flags & FS_IMMUTABLE_FL) + fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; + if (fa->flags & FS_APPEND_FL) + fa->fsx_xflags |= FS_XFLAG_APPEND; + if (fa->flags & FS_NODUMP_FL) + fa->fsx_xflags |= FS_XFLAG_NODUMP; + if (fa->flags & FS_NOATIME_FL) + fa->fsx_xflags |= FS_XFLAG_NOATIME; + if (fa->flags & FS_DAX_FL) + fa->fsx_xflags |= FS_XFLAG_DAX; + if (fa->flags & FS_PROJINHERIT_FL) + fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; +} +EXPORT_SYMBOL(fileattr_fill_flags); + +/** + * vfs_fileattr_get - retrieve miscellaneous file attributes + * @dentry: the object to retrieve from + * @fa: fileattr pointer + * + * Call i_op->fileattr_get() callback, if exists. + * + * Return: 0 on success, or a negative error on failure. + */ +int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + + if (!inode->i_op->fileattr_get) + return -ENOIOCTLCMD; + + return inode->i_op->fileattr_get(dentry, fa); +} +EXPORT_SYMBOL(vfs_fileattr_get); + +/** + * copy_fsxattr_to_user - copy fsxattr to userspace. + * @fa: fileattr pointer + * @ufa: fsxattr user pointer + * + * Return: 0 on success, or -EFAULT on failure. + */ +int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) +{ + struct fsxattr xfa; + + memset(&xfa, 0, sizeof(xfa)); + xfa.fsx_xflags = fa->fsx_xflags; + xfa.fsx_extsize = fa->fsx_extsize; + xfa.fsx_nextents = fa->fsx_nextents; + xfa.fsx_projid = fa->fsx_projid; + xfa.fsx_cowextsize = fa->fsx_cowextsize; + + if (copy_to_user(ufa, &xfa, sizeof(xfa))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(copy_fsxattr_to_user); + +static int copy_fsxattr_from_user(struct fileattr *fa, + struct fsxattr __user *ufa) +{ + struct fsxattr xfa; + + if (copy_from_user(&xfa, ufa, sizeof(xfa))) + return -EFAULT; + + fileattr_fill_xflags(fa, xfa.fsx_xflags); + fa->fsx_extsize = xfa.fsx_extsize; + fa->fsx_nextents = xfa.fsx_nextents; + fa->fsx_projid = xfa.fsx_projid; + fa->fsx_cowextsize = xfa.fsx_cowextsize; + + return 0; +} + +/* + * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject + * any invalid configurations. + * + * Note: must be called with inode lock held. + */ +static int fileattr_set_prepare(struct inode *inode, + const struct fileattr *old_ma, + struct fileattr *fa) +{ + int err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); + if (err) + return err; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() != &init_user_ns) { + if (old_ma->fsx_projid != fa->fsx_projid) + return -EINVAL; + if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & + FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + /* Check extent size hints. */ + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && + !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EINVAL; + + /* + * It is only valid to set the DAX flag on regular files and + * directories on filesystems. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + + /* Extent size hints of zero turn off the flags. */ + if (fa->fsx_extsize == 0) + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); + if (fa->fsx_cowextsize == 0) + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + +/** + * vfs_fileattr_set - change miscellaneous file attributes + * @mnt_userns: user namespace of the mount + * @dentry: the object to change + * @fa: fileattr pointer + * + * After verifying permissions, call i_op->fileattr_set() callback, if + * exists. + * + * Verifying attributes involves retrieving current attributes with + * i_op->fileattr_get(), this also allows initializing attributes that have + * not been set by the caller to current values. Inode lock is held + * thoughout to prevent racing with another instance. + * + * Return: 0 on success, or a negative error on failure. + */ +int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, + struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fileattr old_ma = {}; + int err; + + if (!inode->i_op->fileattr_set) + return -ENOIOCTLCMD; + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + + inode_lock(inode); + err = vfs_fileattr_get(dentry, &old_ma); + if (!err) { + /* initialize missing bits from old_ma */ + if (fa->flags_valid) { + fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; + fa->fsx_extsize = old_ma.fsx_extsize; + fa->fsx_nextents = old_ma.fsx_nextents; + fa->fsx_projid = old_ma.fsx_projid; + fa->fsx_cowextsize = old_ma.fsx_cowextsize; + } else { + fa->flags |= old_ma.flags & ~FS_COMMON_FL; + } + err = fileattr_set_prepare(inode, &old_ma, fa); + if (!err) + err = inode->i_op->fileattr_set(mnt_userns, dentry, fa); + } + inode_unlock(inode); + + return err; +} +EXPORT_SYMBOL(vfs_fileattr_set); + +static int ioctl_getflags(struct file *file, unsigned int __user *argp) +{ + struct fileattr fa = { .flags_valid = true }; /* hint only */ + int err; + + err = vfs_fileattr_get(file->f_path.dentry, &fa); + if (!err) + err = put_user(fa.flags, argp); + return err; +} + +static int ioctl_setflags(struct file *file, unsigned int __user *argp) +{ + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct dentry *dentry = file->f_path.dentry; + struct fileattr fa; + unsigned int flags; + int err; + + err = get_user(flags, argp); + if (!err) { + err = mnt_want_write_file(file); + if (!err) { + fileattr_fill_flags(&fa, flags); + err = vfs_fileattr_set(mnt_userns, dentry, &fa); + mnt_drop_write_file(file); + } + } + return err; +} + +static int ioctl_fsgetxattr(struct file *file, void __user *argp) +{ + struct fileattr fa = { .fsx_valid = true }; /* hint only */ + int err; + + err = vfs_fileattr_get(file->f_path.dentry, &fa); + if (!err) + err = copy_fsxattr_to_user(&fa, argp); + + return err; +} + +static int ioctl_fssetxattr(struct file *file, void __user *argp) +{ + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct dentry *dentry = file->f_path.dentry; + struct fileattr fa; + int err; + + err = copy_fsxattr_from_user(&fa, argp); + if (!err) { + err = mnt_want_write_file(file); + if (!err) { + err = vfs_fileattr_set(mnt_userns, dentry, &fa); + mnt_drop_write_file(file); + } + } + return err; +} + /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. @@ -727,6 +1031,18 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); + case FS_IOC_GETFLAGS: + return ioctl_getflags(filp, argp); + + case FS_IOC_SETFLAGS: + return ioctl_setflags(filp, argp); + + case FS_IOC_FSGETXATTR: + return ioctl_fsgetxattr(filp, argp); + + case FS_IOC_FSSETXATTR: + return ioctl_fssetxattr(filp, argp); + default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); @@ -828,6 +1144,15 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, #endif /* + * These access 32-bit values anyway so no further handling is + * necessary. + */ + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + cmd = (cmd == FS_IOC32_GETFLAGS) ? + FS_IOC_GETFLAGS : FS_IOC_SETFLAGS; + fallthrough; + /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 414769a6ad11..0129e6bab985 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1155,7 +1155,8 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); static int -iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b) +iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) { struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bdd0d89bbf0a..9398b8c31323 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -487,12 +487,28 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { iomap_flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + /* for data sync or sync, we need sync completion processing */ if (iocb->ki_flags & IOCB_DSYNC) dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -507,14 +523,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_WRITE_FUA; } - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { - ret = -EAGAIN; - goto out_free_dio; - } - iomap_flags |= IOMAP_NOWAIT; - } - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (pos >= dio->i_size || pos + count > dio->i_size) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index a648dbf6991e..6250ca6a1f85 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -18,6 +18,7 @@ struct iomap_swapfile_info { uint64_t highest_ppage; /* highest physical addr seen (pages) */ unsigned long nr_pages; /* number of pages collected */ int nr_extents; /* extent count */ + struct file *file; }; /* @@ -70,6 +71,18 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) return 0; } +static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) +{ + char *buf, *p = ERR_PTR(-ENOMEM); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (buf) + p = file_path(isi->file, buf, PATH_MAX); + pr_err("swapon: file %s %s\n", IS_ERR(p) ? "<unknown>" : p, str); + kfree(buf); + return -EINVAL; +} + /* * Accumulate iomaps for this swap file. We have to accumulate iomaps because * swap only cares about contiguous page-aligned physical extents and makes no @@ -89,28 +102,20 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, break; case IOMAP_INLINE: /* No inline data. */ - pr_err("swapon: file is inline\n"); - return -EINVAL; + return iomap_swapfile_fail(isi, "is inline"); default: - pr_err("swapon: file has unallocated extents\n"); - return -EINVAL; + return iomap_swapfile_fail(isi, "has unallocated extents"); } /* No uncommitted metadata or shared blocks. */ - if (iomap->flags & IOMAP_F_DIRTY) { - pr_err("swapon: file is not committed\n"); - return -EINVAL; - } - if (iomap->flags & IOMAP_F_SHARED) { - pr_err("swapon: file has shared extents\n"); - return -EINVAL; - } + if (iomap->flags & IOMAP_F_DIRTY) + return iomap_swapfile_fail(isi, "is not committed"); + if (iomap->flags & IOMAP_F_SHARED) + return iomap_swapfile_fail(isi, "has shared extents"); /* Only one bdev per swap file. */ - if (iomap->bdev != isi->sis->bdev) { - pr_err("swapon: file is on multiple devices\n"); - return -EINVAL; - } + if (iomap->bdev != isi->sis->bdev) + return iomap_swapfile_fail(isi, "outside the main device"); if (isi->iomap.length == 0) { /* No accumulated extent, so just store it. */ @@ -139,6 +144,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, struct iomap_swapfile_info isi = { .sis = sis, .lowest_ppage = (sector_t)-1ULL, + .file = swap_file, }; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -170,6 +176,16 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; } + /* + * If this swapfile doesn't contain even a single page-aligned + * contiguous range of blocks, reject this useless swapfile to + * prevent confusion later on. + */ + if (isi.nr_pages == 0) { + pr_warn("swapon: Cannot find a single usable page in file.\n"); + return -EINVAL; + } + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; sis->max = isi.nr_pages; sis->pages = isi.nr_pages - 1; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 69f18fe20923..d47a0d96bf30 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -245,15 +245,14 @@ static int fc_do_one_pass(journal_t *journal, return 0; while (next_fc_block <= journal->j_fc_last) { - jbd_debug(3, "Fast commit replay: next block %ld", + jbd_debug(3, "Fast commit replay: next block %ld\n", next_fc_block); err = jread(&bh, journal, next_fc_block); if (err) { - jbd_debug(3, "Fast commit replay: read error"); + jbd_debug(3, "Fast commit replay: read error\n"); break; } - jbd_debug(3, "Processing fast commit blk with seq %d"); err = journal->j_fc_replay_callback(journal, bh, pass, next_fc_block - journal->j_fc_first, expected_commit_id); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9396666b7314..e8fc45fd751f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -349,7 +349,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, } alloc_transaction: - if (!journal->j_running_transaction) { + /* + * This check is racy but it is just an optimization of allocating new + * transaction early if there are high chances we'll need it. If we + * guess wrong, we'll retry or free unused transaction. + */ + if (!data_race(journal->j_running_transaction)) { /* * If __GFP_FS is not present, then we may be being called from * inside the fs writeback layer, so we MUST NOT fail. @@ -1474,8 +1479,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * crucial to catch bugs so let's do a reliable check until the * lockless handling is fully proven. */ - if (jh->b_transaction != transaction && - jh->b_next_transaction != transaction) { + if (data_race(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); @@ -1483,8 +1488,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ - if (jh->b_transaction == transaction && - jh->b_jlist != BJ_Metadata) { + if (data_race(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO deleted file mode 100644 index ca28964abd4b..000000000000 --- a/fs/jffs2/TODO +++ /dev/null @@ -1,37 +0,0 @@ - - - support asynchronous operation -- add a per-fs 'reserved_space' count, - let each outstanding write reserve the _maximum_ amount of physical - space it could take. Let GC flush the outstanding writes because the - reservations will necessarily be pessimistic. With this we could even - do shared writable mmap, if we can have a fs hook for do_wp_page() to - make the reservation. - - disable compression in commit_write()? - - fine-tune the allocation / GC thresholds - - chattr support - turning on/off and tuning compression per-inode - - checkpointing (do we need this? scan is quite fast) - - make the scan code populate real inodes so read_inode just after - mount doesn't have to read the flash twice for large files. - Make this a per-inode option, changeable with chattr, so you can - decide which inodes should be in-core immediately after mount. - - test, test, test - - - NAND flash support: - - almost done :) - - use bad block check instead of the hardwired byte check - - - Optimisations: - - Split writes so they go to two separate blocks rather than just c->nextblock. - By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE - nodes to a different one, we can separate clean nodes from those which - are likely to become dirty, and end up with blocks which are each far - closer to 100% or 0% clean, hence speeding up later GC progress dramatically. - - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in - the full dirent, we only need to go to the flash in lookup() when we think we've - got a match, and in readdir(). - - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? - - Remove size from jffs2_raw_node_frag. - -dedekind: -1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate. -2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in - case of failure? scan() does not clean everything. Fix. diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 28b70e7c7dd4..1d732fd223d4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -130,6 +130,8 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, const struct inode_operations jfs_file_inode_operations = { .listxattr = jfs_listxattr, .setattr = jfs_setattr, + .fileattr_get = jfs_fileattr_get, + .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, .set_acl = jfs_set_acl, @@ -147,7 +149,5 @@ const struct file_operations jfs_file_operations = { .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = jfs_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 2581d4db58ff..03a845ab4f00 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -15,6 +15,7 @@ #include <linux/blkdev.h> #include <asm/current.h> #include <linux/uaccess.h> +#include <linux/fileattr.h> #include "jfs_filsys.h" #include "jfs_debug.h" @@ -56,69 +57,56 @@ static long jfs_map_ext2(unsigned long flags, int from) return mapped; } +int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry)); + unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; -long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + if (d_is_special(dentry)) + return -ENOTTY; + + fileattr_fill_flags(fa, jfs_map_ext2(flags, 0)); + + return 0; +} + +int jfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); + struct inode *inode = d_inode(dentry); struct jfs_inode_info *jfs_inode = JFS_IP(inode); unsigned int flags; - switch (cmd) { - case JFS_IOC_GETFLAGS: - flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; - flags = jfs_map_ext2(flags, 0); - return put_user(flags, (int __user *) arg); - case JFS_IOC_SETFLAGS: { - unsigned int oldflags; - int err; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (!inode_owner_or_capable(&init_user_ns, inode)) { - err = -EACCES; - goto setflags_out; - } - if (get_user(flags, (int __user *) arg)) { - err = -EFAULT; - goto setflags_out; - } + if (d_is_special(dentry)) + return -ENOTTY; - flags = jfs_map_ext2(flags, 1); - if (!S_ISDIR(inode->i_mode)) - flags &= ~JFS_DIRSYNC_FL; + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - err = -EPERM; - goto setflags_out; - } + flags = jfs_map_ext2(fa->flags, 1); + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; - /* Lock against other parallel changes of flags */ - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; - oldflags = jfs_map_ext2(jfs_inode->mode2 & JFS_FL_USER_VISIBLE, - 0); - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (err) { - inode_unlock(inode); - goto setflags_out; - } + flags = flags & JFS_FL_USER_MODIFIABLE; + flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; - flags = flags & JFS_FL_USER_MODIFIABLE; - flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE; - jfs_inode->mode2 = flags; - - jfs_set_inode_flags(inode); - inode_unlock(inode); - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); -setflags_out: - mnt_drop_write_file(filp); - return err; - } + jfs_set_inode_flags(inode); + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + + return 0; +} + +long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + switch (cmd) { case FITRIM: { struct super_block *sb = inode->i_sb; @@ -156,22 +144,3 @@ setflags_out: return -ENOTTY; } } - -#ifdef CONFIG_COMPAT -long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - /* While these ioctl numbers defined with 'long' and have different - * numbers than the 64bit ABI, - * the actual implementation only deals with ints and is compatible. - */ - switch (cmd) { - case JFS_IOC_GETFLAGS32: - cmd = JFS_IOC_GETFLAGS; - break; - case JFS_IOC_SETFLAGS32: - cmd = JFS_IOC_SETFLAGS; - break; - } - return jfs_ioctl(filp, cmd, arg); -} -#endif diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index 5fa9fd594115..d6af79e94263 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -160,11 +160,4 @@ struct dinode { #define JFS_FL_USER_MODIFIABLE 0x03F80000 #define JFS_FL_INHERIT 0x03C80000 -/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ -#define JFS_IOC_GETFLAGS _IOR('f', 1, long) -#define JFS_IOC_SETFLAGS _IOW('f', 2, long) - -#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int) -#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int) - #endif /*_H_JFS_DINODE */ diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 01daa0cb0ae5..7de961a81862 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -9,8 +9,10 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); +extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int jfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); -extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); extern int jfs_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9abed0d750e5..9db4f5789c0e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1522,6 +1522,8 @@ const struct inode_operations jfs_dir_inode_operations = { .rename = jfs_rename, .listxattr = jfs_listxattr, .setattr = jfs_setattr, + .fileattr_get = jfs_fileattr_get, + .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, .set_acl = jfs_set_acl, @@ -1533,9 +1535,7 @@ const struct file_operations jfs_dir_operations = { .iterate = jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = jfs_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .llseek = generic_file_llseek, }; diff --git a/fs/libfs.c b/fs/libfs.c index e2de5401abca..e9b29c6ffccb 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -481,6 +481,7 @@ EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem + * @mnt_userns: user namespace of the target mount * @dentry: dentry * @iattr: iattr structure * diff --git a/fs/locks.c b/fs/locks.c index 99ca97e81b7a..5c42363aa811 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1808,9 +1808,6 @@ check_conflicting_open(struct file *filp, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if (flags & FL_DELEG) - /* We leave these checks to the caller. */ - return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; @@ -2372,7 +2369,6 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (flock->l_pid != 0) goto out; - cmd = F_GETLK; fl->fl_flags |= FL_OFDLCK; fl->fl_owner = filp; } @@ -2828,7 +2824,7 @@ struct locks_iterator { }; static void lock_get_status(struct seq_file *f, struct file_lock *fl, - loff_t id, char *pfx) + loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2844,7 +2840,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); - seq_printf(f, "%lld:%s ", id, pfx); + seq_printf(f, "%lld: ", id); + + if (repeat) + seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx); + if (IS_POSIX(fl)) { if (fl->fl_flags & FL_ACCESS) seq_puts(f, "ACCESS"); @@ -2906,21 +2906,64 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } +static struct file_lock *get_next_blocked_member(struct file_lock *node) +{ + struct file_lock *tmp; + + /* NULL node or root node */ + if (node == NULL || node->fl_blocker == NULL) + return NULL; + + /* Next member in the linked list could be itself */ + tmp = list_next_entry(node, fl_blocked_member); + if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member) + || tmp == node) { + return NULL; + } + + return tmp; +} + static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; - struct file_lock *fl, *bfl; + struct file_lock *cur, *tmp; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); + int level = 0; - fl = hlist_entry(v, struct file_lock, fl_link); + cur = hlist_entry(v, struct file_lock, fl_link); - if (locks_translate_pid(fl, proc_pidns) == 0) + if (locks_translate_pid(cur, proc_pidns) == 0) return 0; - lock_get_status(f, fl, iter->li_pos, ""); + /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests + * is the left child of current node, the next silibing in fl_blocked_member is the + * right child, we can alse get the parent of current node from fl_blocker, so this + * question becomes traversal of a binary tree + */ + while (cur != NULL) { + if (level) + lock_get_status(f, cur, iter->li_pos, "-> ", level); + else + lock_get_status(f, cur, iter->li_pos, "", level); - list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member) - lock_get_status(f, bfl, iter->li_pos, " ->"); + if (!list_empty(&cur->fl_blocked_requests)) { + /* Turn left */ + cur = list_first_entry_or_null(&cur->fl_blocked_requests, + struct file_lock, fl_blocked_member); + level++; + } else { + /* Turn right */ + tmp = get_next_blocked_member(cur); + /* Fall back to parent node */ + while (tmp == NULL && cur->fl_blocker != NULL) { + cur = cur->fl_blocker; + level--; + tmp = get_next_blocked_member(cur); + } + cur = tmp; + } + } return 0; } @@ -2941,7 +2984,7 @@ static void __show_fd_locks(struct seq_file *f, (*id)++; seq_puts(f, "lock:\t"); - lock_get_status(f, fl, *id, ""); + lock_get_status(f, fl, *id, "", 0); } } diff --git a/fs/namei.c b/fs/namei.c index 216f16e74351..79b0ff9b151e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->stack = p->internal; p->dfd = dfd; p->name = name; + p->path.mnt = NULL; + p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; @@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd) rcu_read_unlock(); } nd->depth = 0; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ @@ -1122,8 +1126,7 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link) * should be allowed, or not, on files that already * exist. * @mnt_userns: user namespace of the mount the inode was found from - * @dir_mode: mode bits of directory - * @dir_uid: owner of directory + * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: @@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } nd->root.mnt = NULL; - nd->path.mnt = NULL; - nd->path.dentry = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { @@ -2419,16 +2420,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { - err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... - } if (!err) { *path = nd->path; nd->path.mnt = NULL; @@ -2823,16 +2824,14 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, static inline int may_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *child) { - struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || - !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) + if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) return -EOVERFLOW; + return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } @@ -3034,14 +3033,11 @@ static int may_o_create(struct user_namespace *mnt_userns, const struct path *dir, struct dentry *dentry, umode_t mode) { - struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; - s_user_ns = dir->dentry->d_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || - !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) + if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns)) return -EOVERFLOW; error = inode_permission(mnt_userns, dir->dentry->d_inode, @@ -3381,7 +3377,7 @@ static int do_open(struct nameidata *nd, * @mnt_userns: user namespace of the mount the inode was found from * @dentry: pointer to dentry of the base directory * @mode: mode of the new tmpfile - * @open_flags: flags + * @open_flag: flags * * Create a temporary file. * @@ -4406,14 +4402,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname /** * vfs_rename - rename a filesystem object - * @old_mnt_userns: old user namespace of the mount the inode was found from - * @old_dir: parent of source - * @old_dentry: source - * @new_mnt_userns: new user namespace of the mount the inode was found from - * @new_dir: parent of destination - * @new_dentry: destination - * @delegated_inode: returns an inode needing a delegation break - * @flags: rename flags + * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * diff --git a/fs/namespace.c b/fs/namespace.c index 56bb5a5fdc0d..f63337828e1c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1242,8 +1242,9 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -/* path_is_mountpoint() - Check if path is a mount in the current - * namespace. +/** + * path_is_mountpoint() - Check if path is a mount in the current namespace. + * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. @@ -1369,7 +1370,7 @@ void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) /** * may_umount_tree - check if a mount tree is busy - * @mnt: root of mount tree + * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are @@ -1939,10 +1940,11 @@ void drop_collected_mounts(struct vfsmount *mnt) /** * clone_private_mount - create a private clone of a path + * @path: path to clone * - * This creates a new vfsmount, which will be the clone of @path. The new will - * not be attached anywhere in the namespace and will be private (i.e. changes - * to the originating mount won't be propagated into this). + * This creates a new vfsmount, which will be the clone of @path. The new mount + * will not be attached anywhere in the namespace and will be private (i.e. + * changes to the originating mount won't be propagated into this). * * Release with mntput(). */ diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig new file mode 100644 index 000000000000..578112713703 --- /dev/null +++ b/fs/netfs/Kconfig @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config NETFS_SUPPORT + tristate "Support for network filesystem high-level I/O" + help + This option enables support for network filesystems, including + helpers for high-level buffered I/O, abstracting out read + segmentation, local caching and transparent huge page support. + +config NETFS_STATS + bool "Gather statistical information on local caching" + depends on NETFS_SUPPORT && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile new file mode 100644 index 000000000000..c15bfc966d96 --- /dev/null +++ b/fs/netfs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +netfs-y := read_helper.o stats.o + +obj-$(CONFIG_NETFS_SUPPORT) := netfs.o diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h new file mode 100644 index 000000000000..b7f2c4459f33 --- /dev/null +++ b/fs/netfs/internal.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal definitions for network filesystem support + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "netfs: " fmt + +/* + * read_helper.c + */ +extern unsigned int netfs_debug; + +/* + * stats.c + */ +#ifdef CONFIG_NETFS_STATS +extern atomic_t netfs_n_rh_readahead; +extern atomic_t netfs_n_rh_readpage; +extern atomic_t netfs_n_rh_rreq; +extern atomic_t netfs_n_rh_sreq; +extern atomic_t netfs_n_rh_download; +extern atomic_t netfs_n_rh_download_done; +extern atomic_t netfs_n_rh_download_failed; +extern atomic_t netfs_n_rh_download_instead; +extern atomic_t netfs_n_rh_read; +extern atomic_t netfs_n_rh_read_done; +extern atomic_t netfs_n_rh_read_failed; +extern atomic_t netfs_n_rh_zero; +extern atomic_t netfs_n_rh_short_read; +extern atomic_t netfs_n_rh_write; +extern atomic_t netfs_n_rh_write_begin; +extern atomic_t netfs_n_rh_write_done; +extern atomic_t netfs_n_rh_write_failed; +extern atomic_t netfs_n_rh_write_zskip; + + +static inline void netfs_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void netfs_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#else +#define netfs_stat(x) do {} while(0) +#define netfs_stat_d(x) do {} while(0) +#endif + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_NETFS_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (netfs_debug) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (netfs_debug) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (netfs_debug) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c new file mode 100644 index 000000000000..193841d03de0 --- /dev/null +++ b/fs/netfs/read_helper.c @@ -0,0 +1,1185 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Network filesystem high-level read support. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/module.h> +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/netfs.h> +#include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/netfs.h> + +MODULE_DESCRIPTION("Network fs support"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned netfs_debug; +module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + +static void netfs_rreq_work(struct work_struct *); +static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool); + +static void netfs_put_subrequest(struct netfs_read_subrequest *subreq, + bool was_async) +{ + if (refcount_dec_and_test(&subreq->usage)) + __netfs_put_subrequest(subreq, was_async); +} + +static struct netfs_read_request *netfs_alloc_read_request( + const struct netfs_read_request_ops *ops, void *netfs_priv, + struct file *file) +{ + static atomic_t debug_ids; + struct netfs_read_request *rreq; + + rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL); + if (rreq) { + rreq->netfs_ops = ops; + rreq->netfs_priv = netfs_priv; + rreq->inode = file_inode(file); + rreq->i_size = i_size_read(rreq->inode); + rreq->debug_id = atomic_inc_return(&debug_ids); + INIT_LIST_HEAD(&rreq->subrequests); + INIT_WORK(&rreq->work, netfs_rreq_work); + refcount_set(&rreq->usage, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + ops->init_rreq(rreq, file); + netfs_stat(&netfs_n_rh_rreq); + } + + return rreq; +} + +static void netfs_get_read_request(struct netfs_read_request *rreq) +{ + refcount_inc(&rreq->usage); +} + +static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq, + bool was_async) +{ + struct netfs_read_subrequest *subreq; + + while (!list_empty(&rreq->subrequests)) { + subreq = list_first_entry(&rreq->subrequests, + struct netfs_read_subrequest, rreq_link); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, was_async); + } +} + +static void netfs_free_read_request(struct work_struct *work) +{ + struct netfs_read_request *rreq = + container_of(work, struct netfs_read_request, work); + netfs_rreq_clear_subreqs(rreq, false); + if (rreq->netfs_priv) + rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); + trace_netfs_rreq(rreq, netfs_rreq_trace_free); + if (rreq->cache_resources.ops) + rreq->cache_resources.ops->end_operation(&rreq->cache_resources); + kfree(rreq); + netfs_stat_d(&netfs_n_rh_rreq); +} + +static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async) +{ + if (refcount_dec_and_test(&rreq->usage)) { + if (was_async) { + rreq->work.func = netfs_free_read_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_read_request(&rreq->work); + } + } +} + +/* + * Allocate and partially initialise an I/O request structure. + */ +static struct netfs_read_subrequest *netfs_alloc_subrequest( + struct netfs_read_request *rreq) +{ + struct netfs_read_subrequest *subreq; + + subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL); + if (subreq) { + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->usage, 2); + subreq->rreq = rreq; + netfs_get_read_request(rreq); + netfs_stat(&netfs_n_rh_sreq); + } + + return subreq; +} + +static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq) +{ + refcount_inc(&subreq->usage); +} + +static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq, + bool was_async) +{ + struct netfs_read_request *rreq = subreq->rreq; + + trace_netfs_sreq(subreq, netfs_sreq_trace_free); + kfree(subreq); + netfs_stat_d(&netfs_n_rh_sreq); + netfs_put_read_request(rreq, was_async); +} + +/* + * Clear the unread part of an I/O request. + */ +static void netfs_clear_unread(struct netfs_read_subrequest *subreq) +{ + struct iov_iter iter; + + iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + iov_iter_zero(iov_iter_count(&iter), &iter); +} + +static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_read_subrequest *subreq = priv; + + netfs_subreq_terminated(subreq, transferred_or_error, was_async); +} + +/* + * Issue a read against the cache. + * - Eats the caller's ref on subreq. + */ +static void netfs_read_from_cache(struct netfs_read_request *rreq, + struct netfs_read_subrequest *subreq, + bool seek_data) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct iov_iter iter; + + netfs_stat(&netfs_n_rh_read); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + + cres->ops->read(cres, subreq->start, &iter, seek_data, + netfs_cache_read_terminated, subreq); +} + +/* + * Fill a subrequest region with zeroes. + */ +static void netfs_fill_with_zeroes(struct netfs_read_request *rreq, + struct netfs_read_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_zero); + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, 0, false); +} + +/* + * Ask the netfs to issue a read request to the server for us. + * + * The netfs is expected to read from subreq->pos + subreq->transferred to + * subreq->pos + subreq->len - 1. It may not backtrack and write data into the + * buffer prior to the transferred point as it might clobber dirty data + * obtained from the cache. + * + * Alternatively, the netfs is allowed to indicate one of two things: + * + * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and + * make progress. + * + * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be + * cleared. + */ +static void netfs_read_from_server(struct netfs_read_request *rreq, + struct netfs_read_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_download); + rreq->netfs_ops->issue_op(subreq); +} + +/* + * Release those waiting. + */ +static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_done); + netfs_rreq_clear_subreqs(rreq, was_async); + netfs_put_read_request(rreq, was_async); +} + +/* + * Deal with the completion of writing the data to the cache. We have to clear + * the PG_fscache bits on the pages involved and release the caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq, + bool was_async) +{ + struct netfs_read_subrequest *subreq; + struct page *page; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + /* We might have multiple writes from the same huge + * page, but we mustn't unlock a page more than once. + */ + if (have_unlocked && page->index <= unlocked) + continue; + unlocked = page->index; + end_page_fscache(page); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_read_subrequest *subreq = priv; + struct netfs_read_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_wr_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_wr_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async); +} + +/* + * Perform any outstanding writes to the cache. We inherit a ref from the + * caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_read_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_write); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_wr_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + rreq->i_size); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_wr_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_read_subrequest(subreq); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_wr_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) +{ + struct netfs_read_request *rreq = + container_of(work, struct netfs_read_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq, + bool was_async) +{ + if (was_async) { + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_rreq_do_write_to_cache(rreq); + } +} + +/* + * Unlock the pages in a read operation. We need to set PG_fscache on any + * pages we're going to write back before we unlock them. + */ +static void netfs_rreq_unlock(struct netfs_read_request *rreq) +{ + struct netfs_read_subrequest *subreq; + struct page *page; + unsigned int iopos, account = 0; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + int i; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { + __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + } + } + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_read_subrequest, rreq_link); + iopos = 0; + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); + + rcu_read_lock(); + xas_for_each(&xas, page, last_page) { + unsigned int pgpos = (page->index - start_page) * PAGE_SIZE; + unsigned int pgend = pgpos + thp_size(page); + bool pg_failed = false; + + for (;;) { + if (!subreq) { + pg_failed = true; + break; + } + if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) + set_page_fscache(page); + pg_failed |= subreq_failed; + if (pgend < iopos + subreq->len) + break; + + account += subreq->transferred; + iopos += subreq->len; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + if (pgend == iopos) + break; + } + + if (!pg_failed) { + for (i = 0; i < thp_nr_pages(page); i++) + flush_dcache_page(page); + SetPageUptodate(page); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) { + if (page->index == rreq->no_unlock_page && + test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags)) + _debug("no unlock"); + else + unlock_page(page); + } + } + rcu_read_unlock(); + + task_io_account_read(account); + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); +} + +/* + * Handle a short read. + */ +static void netfs_rreq_short_read(struct netfs_read_request *rreq, + struct netfs_read_subrequest *subreq) +{ + __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); + __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); + + netfs_stat(&netfs_n_rh_short_read); + trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); + + netfs_get_read_subrequest(subreq); + atomic_inc(&rreq->nr_rd_ops); + if (subreq->source == NETFS_READ_FROM_CACHE) + netfs_read_from_cache(rreq, subreq, true); + else + netfs_read_from_server(rreq, subreq); +} + +/* + * Resubmit any short or failed operations. Returns true if we got the rreq + * ref back. + */ +static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq) +{ + struct netfs_read_subrequest *subreq; + + WARN_ON(in_interrupt()); + + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + + /* We don't want terminating submissions trying to wake us up whilst + * we're still going through the list. + */ + atomic_inc(&rreq->nr_rd_ops); + + __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error) { + if (subreq->source != NETFS_READ_FROM_CACHE) + break; + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->error = 0; + netfs_stat(&netfs_n_rh_download_instead); + trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); + netfs_get_read_subrequest(subreq); + atomic_inc(&rreq->nr_rd_ops); + netfs_read_from_server(rreq, subreq); + } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) { + netfs_rreq_short_read(rreq, subreq); + } + } + + /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_rd_ops)) + return true; + + wake_up_var(&rreq->nr_rd_ops); + return false; +} + +/* + * Check to see if the data read is still valid. + */ +static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq) +{ + struct netfs_read_subrequest *subreq; + + if (!rreq->netfs_ops->is_still_valid || + rreq->netfs_ops->is_still_valid(rreq)) + return; + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->source == NETFS_READ_FROM_CACHE) { + subreq->error = -ESTALE; + __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } + } +} + +/* + * Assess the state of a read request and decide what to do next. + * + * Note that we could be in an ordinary kernel thread, on a workqueue or in + * softirq context at this point. We inherit a ref from the caller. + */ +static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + +again: + netfs_rreq_is_still_valid(rreq); + + if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && + test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { + if (netfs_rreq_perform_resubmissions(rreq)) + goto again; + return; + } + + netfs_rreq_unlock(rreq); + + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq, was_async); + + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_work(struct work_struct *work) +{ + struct netfs_read_request *rreq = + container_of(work, struct netfs_read_request, work); + netfs_rreq_assess(rreq, false); +} + +/* + * Handle the completion of all outstanding I/O operations on a read request. + * We inherit a ref from the caller. + */ +static void netfs_rreq_terminated(struct netfs_read_request *rreq, + bool was_async) +{ + if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && + was_async) { + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_rreq_assess(rreq, was_async); + } +} + +/** + * netfs_subreq_terminated - Note the termination of an I/O operation. + * @subreq: The I/O request that has terminated. + * @transferred_or_error: The amount of data transferred or an error code. + * @was_async: The termination was asynchronous + * + * This tells the read helper that a contributory I/O operation has terminated, + * one way or another, and that it should integrate the results. + * + * The caller indicates in @transferred_or_error the outcome of the operation, + * supplying a positive value to indicate the number of bytes transferred, 0 to + * indicate a failure to transfer anything that should be retried or a negative + * error code. The helper will look after reissuing I/O operations as + * appropriate and writing downloaded data to the cache. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_subreq_terminated(struct netfs_read_subrequest *subreq, + ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_read_request *rreq = subreq->rreq; + int u; + + _enter("[%u]{%llx,%lx},%zd", + subreq->debug_index, subreq->start, subreq->flags, + transferred_or_error); + + switch (subreq->source) { + case NETFS_READ_FROM_CACHE: + netfs_stat(&netfs_n_rh_read_done); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download_done); + break; + default: + break; + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_read); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq overread: R%x[%x] %zd > %zu - %zu", + rreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + if (subreq->transferred < subreq->len) + goto incomplete; + +complete: + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) + set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ + u = atomic_dec_return(&rreq->nr_rd_ops); + if (u == 0) + netfs_rreq_terminated(rreq, was_async); + else if (u == 1) + wake_up_var(&rreq->nr_rd_ops); + + netfs_put_subrequest(subreq, was_async); + return; + +incomplete: + if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { + netfs_clear_unread(subreq); + subreq->transferred = subreq->len; + goto complete; + } + + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + goto out; + +failed: + if (subreq->source == NETFS_READ_FROM_CACHE) { + netfs_stat(&netfs_n_rh_read_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } else { + netfs_stat(&netfs_n_rh_download_failed); + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + rreq->error = subreq->error; + } + goto out; +} +EXPORT_SYMBOL(netfs_subreq_terminated); + +static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq, + loff_t i_size) +{ + struct netfs_read_request *rreq = subreq->rreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops) + return cres->ops->prepare_read(subreq, i_size); + if (subreq->start >= rreq->i_size) + return NETFS_FILL_WITH_ZEROES; + return NETFS_DOWNLOAD_FROM_SERVER; +} + +/* + * Work out what sort of subrequest the next one will be. + */ +static enum netfs_read_source +netfs_rreq_prepare_read(struct netfs_read_request *rreq, + struct netfs_read_subrequest *subreq) +{ + enum netfs_read_source source; + + _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + + if (source == NETFS_DOWNLOAD_FROM_SERVER) { + /* Call out to the netfs to let it shrink the request to fit + * its own I/O sizes and boundaries. If it shinks it here, it + * will be called again to make simultaneous calls; if it wants + * to make serial calls, it can indicate a short read and then + * we will call it again. + */ + if (subreq->len > rreq->i_size - subreq->start) + subreq->len = rreq->i_size - subreq->start; + + if (rreq->netfs_ops->clamp_length && + !rreq->netfs_ops->clamp_length(subreq)) { + source = NETFS_INVALID_READ; + goto out; + } + } + + if (WARN_ON(subreq->len == 0)) + source = NETFS_INVALID_READ; + +out: + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + return source; +} + +/* + * Slice off a piece of a read request and submit an I/O request for it. + */ +static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq, + unsigned int *_debug_index) +{ + struct netfs_read_subrequest *subreq; + enum netfs_read_source source; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + return false; + + subreq->debug_index = (*_debug_index)++; + subreq->start = rreq->start + rreq->submitted; + subreq->len = rreq->len - rreq->submitted; + + _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + + /* Call out to the cache to find out what it can do with the remaining + * subset. It tells us in subreq->flags what it decided should be done + * and adjusts subreq->len down if the subset crosses a cache boundary. + * + * Then when we hand the subset, it can choose to take a subset of that + * (the starts must coincide), in which case, we go around the loop + * again and ask it to download the next piece. + */ + source = netfs_rreq_prepare_read(rreq, subreq); + if (source == NETFS_INVALID_READ) + goto subreq_failed; + + atomic_inc(&rreq->nr_rd_ops); + + rreq->submitted += subreq->len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + switch (source) { + case NETFS_FILL_WITH_ZEROES: + netfs_fill_with_zeroes(rreq, subreq); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_read_from_server(rreq, subreq); + break; + case NETFS_READ_FROM_CACHE: + netfs_read_from_cache(rreq, subreq, false); + break; + default: + BUG(); + } + + return true; + +subreq_failed: + rreq->error = subreq->error; + netfs_put_subrequest(subreq, false); + return false; +} + +static void netfs_cache_expand_readahead(struct netfs_read_request *rreq, + loff_t *_start, size_t *_len, loff_t i_size) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops && cres->ops->expand_readahead) + cres->ops->expand_readahead(cres, _start, _len, i_size); +} + +static void netfs_rreq_expand(struct netfs_read_request *rreq, + struct readahead_control *ractl) +{ + /* Give the cache a chance to change the request parameters. The + * resultant request must contain the original region. + */ + netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); + + /* Give the netfs a chance to change the request parameters. The + * resultant request must contain the original region. + */ + if (rreq->netfs_ops->expand_readahead) + rreq->netfs_ops->expand_readahead(rreq); + + /* Expand the request if the cache wants it to start earlier. Note + * that the expansion may get further extended if the VM wishes to + * insert THPs and the preferred start and/or end wind up in the middle + * of THPs. + * + * If this is the case, however, the THP size should be an integer + * multiple of the cache granule size, so we get a whole number of + * granules to deal with. + */ + if (rreq->start != readahead_pos(ractl) || + rreq->len != readahead_length(ractl)) { + readahead_expand(ractl, rreq->start, rreq->len); + rreq->start = readahead_pos(ractl); + rreq->len = readahead_length(ractl); + + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_expanded); + } +} + +/** + * netfs_readahead - Helper to manage a read request + * @ractl: The description of the readahead request + * @ops: The network filesystem's operations for the helper to use + * @netfs_priv: Private netfs data to be retained in the request + * + * Fulfil a readahead request by drawing data from the cache if possible, or + * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O + * requests from different sources will get munged together. If necessary, the + * readahead window can be expanded in either direction to a more convenient + * alighment for RPC efficiency or to make storage in the cache feasible. + * + * The calling netfs must provide a table of operations, only one of which, + * issue_op, is mandatory. It may also be passed a private token, which will + * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). + * + * This is usable whether or not caching is enabled. + */ +void netfs_readahead(struct readahead_control *ractl, + const struct netfs_read_request_ops *ops, + void *netfs_priv) +{ + struct netfs_read_request *rreq; + struct page *page; + unsigned int debug_index = 0; + int ret; + + _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + + if (readahead_count(ractl) == 0) + goto cleanup; + + rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file); + if (!rreq) + goto cleanup; + rreq->mapping = ractl->mapping; + rreq->start = readahead_pos(ractl); + rreq->len = readahead_length(ractl); + + if (ops->begin_cache_operation) { + ret = ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + } + + netfs_stat(&netfs_n_rh_readahead); + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_readahead); + + netfs_rreq_expand(rreq, ractl); + + atomic_set(&rreq->nr_rd_ops, 1); + do { + if (!netfs_rreq_submit_slice(rreq, &debug_index)) + break; + + } while (rreq->submitted < rreq->len); + + /* Drop the refs on the pages here rather than in the cache or + * filesystem. The locks will be dropped in netfs_rreq_unlock(). + */ + while ((page = readahead_page(ractl))) + put_page(page); + + /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_rd_ops)) + netfs_rreq_assess(rreq, false); + return; + +cleanup_free: + netfs_put_read_request(rreq, false); + return; +cleanup: + if (netfs_priv) + ops->cleanup(ractl->mapping, netfs_priv); + return; +} +EXPORT_SYMBOL(netfs_readahead); + +/** + * netfs_readpage - Helper to manage a readpage request + * @file: The file to read from + * @page: The page to read + * @ops: The network filesystem's operations for the helper to use + * @netfs_priv: Private netfs data to be retained in the request + * + * Fulfil a readpage request by drawing data from the cache if possible, or the + * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests + * from different sources will get munged together. + * + * The calling netfs must provide a table of operations, only one of which, + * issue_op, is mandatory. It may also be passed a private token, which will + * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). + * + * This is usable whether or not caching is enabled. + */ +int netfs_readpage(struct file *file, + struct page *page, + const struct netfs_read_request_ops *ops, + void *netfs_priv) +{ + struct netfs_read_request *rreq; + unsigned int debug_index = 0; + int ret; + + _enter("%lx", page_index(page)); + + rreq = netfs_alloc_read_request(ops, netfs_priv, file); + if (!rreq) { + if (netfs_priv) + ops->cleanup(netfs_priv, page_file_mapping(page)); + unlock_page(page); + return -ENOMEM; + } + rreq->mapping = page_file_mapping(page); + rreq->start = page_file_offset(page); + rreq->len = thp_size(page); + + if (ops->begin_cache_operation) { + ret = ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) { + unlock_page(page); + goto out; + } + } + + netfs_stat(&netfs_n_rh_readpage); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); + + netfs_get_read_request(rreq); + + atomic_set(&rreq->nr_rd_ops, 1); + do { + if (!netfs_rreq_submit_slice(rreq, &debug_index)) + break; + + } while (rreq->submitted < rreq->len); + + /* Keep nr_rd_ops incremented so that the ref always belongs to us, and + * the service code isn't punted off to a random thread pool to + * process. + */ + do { + wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); + netfs_rreq_assess(rreq, false); + } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)); + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage); + ret = -EIO; + } +out: + netfs_put_read_request(rreq, false); + return ret; +} +EXPORT_SYMBOL(netfs_readpage); + +static void netfs_clear_thp(struct page *page) +{ + unsigned int i; + + for (i = 0; i < thp_nr_pages(page); i++) + clear_highpage(page + i); +} + +/** + * netfs_write_begin - Helper to prepare for writing + * @file: The file to read from + * @mapping: The mapping to read from + * @pos: File position at which the write will begin + * @len: The length of the write in this page + * @flags: AOP_* flags + * @_page: Where to put the resultant page + * @_fsdata: Place for the netfs to store a cookie + * @ops: The network filesystem's operations for the helper to use + * @netfs_priv: Private netfs data to be retained in the request + * + * Pre-read data for a write-begin request by drawing data from the cache if + * possible, or the netfs if not. Space beyond the EOF is zero-filled. + * Multiple I/O requests from different sources will get munged together. If + * necessary, the readahead window can be expanded in either direction to a + * more convenient alighment for RPC efficiency or to make storage in the cache + * feasible. + * + * The calling netfs must provide a table of operations, only one of which, + * issue_op, is mandatory. + * + * The check_write_begin() operation can be provided to check for and flush + * conflicting writes once the page is grabbed and locked. It is passed a + * pointer to the fsdata cookie that gets returned to the VM to be passed to + * write_end. It is permitted to sleep. It should return 0 if the request + * should go ahead; unlock the page and return -EAGAIN to cause the page to be + * regot; or return an error. + * + * This is usable whether or not caching is enabled. + */ +int netfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int flags, + struct page **_page, void **_fsdata, + const struct netfs_read_request_ops *ops, + void *netfs_priv) +{ + struct netfs_read_request *rreq; + struct page *page, *xpage; + struct inode *inode = file_inode(file); + unsigned int debug_index = 0; + pgoff_t index = pos >> PAGE_SHIFT; + int pos_in_page = pos & ~PAGE_MASK; + loff_t size; + int ret; + + DEFINE_READAHEAD(ractl, file, NULL, mapping, index); + +retry: + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + return -ENOMEM; + + if (ops->check_write_begin) { + /* Allow the netfs (eg. ceph) to flush conflicts. */ + ret = ops->check_write_begin(file, pos, len, page, _fsdata); + if (ret < 0) { + trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); + if (ret == -EAGAIN) + goto retry; + goto error; + } + } + + if (PageUptodate(page)) + goto have_page; + + /* If the page is beyond the EOF, we want to clear it - unless it's + * within the cache granule containing the EOF, in which case we need + * to preload the granule. + */ + size = i_size_read(inode); + if (!ops->is_cache_enabled(inode) && + ((pos_in_page == 0 && len == thp_size(page)) || + (pos >= size) || + (pos_in_page == 0 && (pos + len) >= size))) { + netfs_clear_thp(page); + SetPageUptodate(page); + netfs_stat(&netfs_n_rh_write_zskip); + goto have_page_no_wait; + } + + ret = -ENOMEM; + rreq = netfs_alloc_read_request(ops, netfs_priv, file); + if (!rreq) + goto error; + rreq->mapping = page->mapping; + rreq->start = page_offset(page); + rreq->len = thp_size(page); + rreq->no_unlock_page = page->index; + __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags); + netfs_priv = NULL; + + if (ops->begin_cache_operation) { + ret = ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + } + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); + + /* Expand the request to meet caching requirements and download + * preferences. + */ + ractl._nr_pages = thp_nr_pages(page); + netfs_rreq_expand(rreq, &ractl); + netfs_get_read_request(rreq); + + /* We hold the page locks, so we can drop the references */ + while ((xpage = readahead_page(&ractl))) + if (xpage != page) + put_page(xpage); + + atomic_set(&rreq->nr_rd_ops, 1); + do { + if (!netfs_rreq_submit_slice(rreq, &debug_index)) + break; + + } while (rreq->submitted < rreq->len); + + /* Keep nr_rd_ops incremented so that the ref always belongs to us, and + * the service code isn't punted off to a random thread pool to + * process. + */ + for (;;) { + wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); + netfs_rreq_assess(rreq, false); + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + cond_resched(); + } + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin); + ret = -EIO; + } + netfs_put_read_request(rreq, false); + if (ret < 0) + goto error; + +have_page: + ret = wait_on_page_fscache_killable(page); + if (ret < 0) + goto error; +have_page_no_wait: + if (netfs_priv) + ops->cleanup(netfs_priv, mapping); + *_page = page; + _leave(" = 0"); + return 0; + +error_put: + netfs_put_read_request(rreq, false); +error: + unlock_page(page); + put_page(page); + if (netfs_priv) + ops->cleanup(netfs_priv, mapping); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_write_begin); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c new file mode 100644 index 000000000000..9ae538c85378 --- /dev/null +++ b/fs/netfs/stats.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Netfs support statistics + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/seq_file.h> +#include <linux/netfs.h> +#include "internal.h" + +atomic_t netfs_n_rh_readahead; +atomic_t netfs_n_rh_readpage; +atomic_t netfs_n_rh_rreq; +atomic_t netfs_n_rh_sreq; +atomic_t netfs_n_rh_download; +atomic_t netfs_n_rh_download_done; +atomic_t netfs_n_rh_download_failed; +atomic_t netfs_n_rh_download_instead; +atomic_t netfs_n_rh_read; +atomic_t netfs_n_rh_read_done; +atomic_t netfs_n_rh_read_failed; +atomic_t netfs_n_rh_zero; +atomic_t netfs_n_rh_short_read; +atomic_t netfs_n_rh_write; +atomic_t netfs_n_rh_write_begin; +atomic_t netfs_n_rh_write_done; +atomic_t netfs_n_rh_write_failed; +atomic_t netfs_n_rh_write_zskip; + +void netfs_stats_show(struct seq_file *m) +{ + seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + atomic_read(&netfs_n_rh_readahead), + atomic_read(&netfs_n_rh_readpage), + atomic_read(&netfs_n_rh_write_begin), + atomic_read(&netfs_n_rh_write_zskip), + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq)); + seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_rh_zero), + atomic_read(&netfs_n_rh_short_read), + atomic_read(&netfs_n_rh_write_zskip)); + seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n", + atomic_read(&netfs_n_rh_download), + atomic_read(&netfs_n_rh_download_done), + atomic_read(&netfs_n_rh_download_failed), + atomic_read(&netfs_n_rh_download_instead)); + seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n", + atomic_read(&netfs_n_rh_read), + atomic_read(&netfs_n_rh_read_done), + atomic_read(&netfs_n_rh_read_failed)); + seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n", + atomic_read(&netfs_n_rh_write), + atomic_read(&netfs_n_rh_write_done), + atomic_read(&netfs_n_rh_write_failed)); +} +EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 971a9251c1d9..a06d213d7689 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -463,6 +463,9 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (opt < 0) return ctx->sloppy ? 1 : opt; + if (fc->security) + ctx->has_sec_mnt_opts = 1; + switch (opt) { case Opt_source: if (fc->source) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a7fb076a5f44..5a8854de0c19 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -335,7 +335,7 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; - if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; @@ -1461,7 +1461,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; return -ESTALE; } - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; @@ -1876,7 +1876,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b644d6c09e4..7395d0977b7d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -96,6 +96,7 @@ struct nfs_fs_context { char *fscache_uniq; unsigned short protofamily; unsigned short mountfamily; + bool has_sec_mnt_opts; struct { union { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 94885c6f8f54..4aaa1f5dd381 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1045,7 +1045,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_blocksize = 0; sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; sb->s_op = server->nfs_client->cl_nfs_mod->sops; - if (ctx && ctx->bsize) + if (ctx->bsize) sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); if (server->nfs_client->rpc_ops->version != 2) { @@ -1077,6 +1077,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) &sb->s_blocksize_bits); nfs_super_set_maxbytes(sb, server->maxfilesize); + server->has_sec_mnt_opts = ctx->has_sec_mnt_opts; } static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, @@ -1193,6 +1194,9 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc) return 0; if (!nfs_compare_userns(old, server)) return 0; + if ((old->has_sec_mnt_opts || fc->security) && + security_sb_mnt_opts_compat(sb, fc->security)) + return 0; return nfs_compare_mount_options(sb, server, fc); } diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 79c563c1a5e8..5a5bd85d08f8 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -136,6 +136,77 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, } EXPORT_SYMBOL_GPL(nfsacl_encode); +/** + * nfs_stream_encode_acl - Encode an NFSv3 ACL + * + * @xdr: an xdr_stream positioned to receive an encoded ACL + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Return values: + * %false: The ACL could not be encoded + * %true: @xdr is advanced to the next available position + */ +bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode, + struct posix_acl *acl, int encode_entries, + int typeflag) +{ + const size_t elem_size = XDR_UNIT * 3; + u32 entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = elem_size, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + struct nfsacl_simple_acl aclbuf; + unsigned int base; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES) + return false; + if (xdr_stream_encode_u32(xdr, entries) < 0) + return false; + + if (encode_entries && acl && acl->a_count == 3) { + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. */ + posix_acl_init(acl2, 4); + + /* Insert entries in canonical order: other orders seem + to confuse Solaris VxFS. */ + acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ + acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */ + acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */ + acl2->a_entries[2].e_tag = ACL_MASK; + acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */ + nfsacl_desc.acl = acl2; + } + + base = xdr_stream_pos(xdr); + if (!xdr_reserve_space(xdr, XDR_UNIT + + elem_size * nfsacl_desc.desc.array_len)) + return false; + err = xdr_encode_array2(xdr->buf, base, &nfsacl_desc.desc); + if (err) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(nfs_stream_encode_acl); + + struct nfsacl_decode_desc { struct xdr_array2_desc desc; unsigned int count; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 821e5913faee..5fa38ad9e7e3 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -73,6 +73,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO select CRYPTO_MD5 select CRYPTO_SHA256 select GRACE_PERIOD @@ -98,7 +99,7 @@ config NFSD_BLOCKLAYOUT help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS - clients to directly perform I/O to block devices accesible to both + clients to directly perform I/O to block devices accessible to both the server and the clients. See RFC 5663 for more details. If unsure, say N. @@ -112,7 +113,7 @@ config NFSD_SCSILAYOUT help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS - clients to directly perform I/O to SCSI devices accesible to both + clients to directly perform I/O to SCSI devices accessible to both the server and the clients. See draft-ietf-nfsv4-scsi-layout for more details. @@ -126,7 +127,7 @@ config NFSD_FLEXFILELAYOUT This option enables support for the exporting pNFS Flex File layouts in the kernel's NFS server. The pNFS Flex File layout enables NFS clients to directly perform I/O to NFSv3 devices - accesible to both the server and the clients. See + accessible to both the server and the clients. See draft-ietf-nfsv4-flex-files for more details. Warning, this server implements the bare minimum functionality diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 53fcbf79bdca..7629248fdd53 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -898,6 +898,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, continue; if (!nfsd_match_cred(nf->nf_cred, current_cred())) continue; + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + continue; if (nfsd_file_get(nf) != NULL) return nf; } diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index c330f5bd0cf3..a75abeb1e698 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -51,9 +51,6 @@ struct nfsd_net { bool grace_ended; time64_t boot_time; - /* internal mount of the "nfsd" pseudofilesystem: */ - struct vfsmount *nfsd_mnt; - struct dentry *nfsd_client_dir; /* @@ -130,6 +127,9 @@ struct nfsd_net { wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; + /* Allow umount to wait for nfsd state cleanup */ + struct completion nfsd_shutdown_complete; + /* * clientid and stateid data for construction of net unique COPY * stateids. diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 855e17772eba..4b43929c1f25 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -242,79 +242,61 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; int w; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; - /* - * Since this is version 2, the check for nfserr in - * nfsd_dispatch actually ensures the following cannot happen. - * However, it seems fragile to depend on that. - */ if (dentry == NULL || d_really_is_negative(dentry)) - return 0; + return 1; inode = d_inode(dentry); - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return 0; - base = (char *)p - (char *)head->iov_base; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!*(rqstp->rq_next_page++)) - return 0; + return 1; w -= PAGE_SIZE; } - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - return (n > 0); -} - -static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; - - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return 0; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) + return 0; - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + return 1; } /* ACCESS */ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_accessres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return 0; + break; + } - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->access); -out: - return xdr_ressize_check(rqstp, p); + return 1; } /* @@ -329,13 +311,6 @@ static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; - - fh_put(&resp->fh); -} - static void nfsaclsvc_release_access(struct svc_rqst *rqstp) { struct nfsd3_accessres *resp = rqstp->rq_resp; @@ -375,8 +350,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_SETACL] = { .pc_func = nfsacld_proc_setacl, .pc_decode = nfsaclsvc_decode_setaclargs, - .pc_encode = nfsaclsvc_encode_attrstatres, - .pc_release = nfsaclsvc_release_attrstat, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, @@ -386,8 +361,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, .pc_decode = nfssvc_decode_fhandleargs, - .pc_encode = nfsaclsvc_encode_attrstatres, - .pc_release = nfsaclsvc_release_attrstat, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9a6f18d74d14..a1591feeea22 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -168,22 +168,25 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) /* GETACL */ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; + struct kvec *head = rqstp->rq_res.head; + struct inode *inode = d_inode(dentry); + unsigned int base; + int n; + int w; - *p++ = resp->status; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { - struct inode *inode = d_inode(dentry); - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; - int w; - - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return 0; - base = (char *)p - (char *)head->iov_base; + + base = (char *)xdr->p - (char *)head->iov_base; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, @@ -204,9 +207,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) NFS_ACL_DEFAULT); if (n <= 0) return 0; - } else - if (!xdr_ressize_check(rqstp, p)) + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; + } return 1; } @@ -214,11 +219,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) /* SETACL */ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh); } /* diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 8675851199f8..17715a6c7a40 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -126,14 +126,15 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; - char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len); + resp->pages = rqstp->rq_next_page++; + resp->status = nfsd_readlink(rqstp, &resp->fh, + page_address(*resp->pages), &resp->len); return rpc_success; } @@ -158,6 +159,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) v = 0; len = argp->count; + resp->pages = rqstp->rq_next_page; while (len > 0) { struct page *page = *(rqstp->rq_next_page++); @@ -439,17 +441,30 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd3_readdirres *resp, int count) { + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + count = min_t(u32, count, svc_max_payload(rqstp)); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->buflen = (count >> 2) - 2; + memset(buf, 0, sizeof(*buf)); - resp->buffer = page_address(*rqstp->rq_next_page); + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = count - XDR_UNIT * 2; + buf->pages = rqstp->rq_next_page; while (count > 0) { rqstp->rq_next_page++; count -= PAGE_SIZE; } + + /* This is xdr_init_encode(), but it assumes that + * the head kvec has already been consumed. */ + xdr_set_scratch_buffer(xdr, NULL, 0); + xdr->buf = buf; + xdr->page_ptr = buf->pages; + xdr->iov = NULL; + xdr->p = page_address(*buf->pages); + xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->rqst = NULL; } /* @@ -460,10 +475,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - int count = 0; loff_t offset; - struct page **p; - caddr_t page_addr = NULL; dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -471,39 +483,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->common.err = nfs_ok; + resp->cookie_offset = 0; resp->rqstp = rqstp; offset = argp->cookie; - resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, - &resp->common, nfs3svc_encode_entry); + &resp->common, nfs3svc_encode_entry3); memcpy(resp->verf, argp->verf, 8); - count = 0; - for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { - page_addr = page_address(*p); + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)resp->buffer >= page_addr) && - ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { - count += (caddr_t)resp->buffer - page_addr; - break; - } - count += PAGE_SIZE; - } - resp->count = count >> 2; - if (resp->offset) { - if (unlikely(resp->offset1)) { - /* we ended up with offset on a page boundary */ - *resp->offset = htonl(offset >> 32); - *resp->offset1 = htonl(offset & 0xffffffff); - resp->offset1 = NULL; - } else { - xdr_encode_hyper(resp->offset, offset); - } - resp->offset = NULL; - } + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; return rpc_success; } @@ -517,10 +508,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - int count = 0; loff_t offset; - struct page **p; - caddr_t page_addr = NULL; dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -528,10 +516,9 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->common.err = nfs_ok; + resp->cookie_offset = 0; resp->rqstp = rqstp; offset = argp->cookie; @@ -545,30 +532,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, - &resp->common, nfs3svc_encode_entry_plus); + &resp->common, nfs3svc_encode_entryplus3); memcpy(resp->verf, argp->verf, 8); - for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { - page_addr = page_address(*p); + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)resp->buffer >= page_addr) && - ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { - count += (caddr_t)resp->buffer - page_addr; - break; - } - count += PAGE_SIZE; - } - resp->count = count >> 2; - if (resp->offset) { - if (unlikely(resp->offset1)) { - /* we ended up with offset on a page boundary */ - *resp->offset = htonl(offset >> 32); - *resp->offset1 = htonl(offset & 0xffffffff); - resp->offset1 = NULL; - } else { - xdr_encode_hyper(resp->offset, offset); - } - resp->offset = NULL; - } + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; out: return rpc_success; @@ -736,7 +705,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_GETATTR] = { .pc_func = nfsd3_proc_getattr, .pc_decode = nfs3svc_decode_fhandleargs, - .pc_encode = nfs3svc_encode_attrstatres, + .pc_encode = nfs3svc_encode_getattrres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), @@ -758,7 +727,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, .pc_decode = nfs3svc_decode_diropargs, - .pc_encode = nfs3svc_encode_diropres, + .pc_encode = nfs3svc_encode_lookupres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_diropres), diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9d9a01ce0b27..0a5ebc52e6a9 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -14,13 +14,26 @@ #include "netns.h" #include "vfs.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR +/* + * Force construction of an empty post-op attr + */ +static const struct svc_fh nfs3svc_null_fh = { + .fh_no_wcc = true, +}; +/* + * time_delta. {1, 0} means the server is accurate only + * to the nearest second. + */ +static const struct timespec64 nfs3svc_time_delta = { + .tv_sec = 1, + .tv_nsec = 0, +}; /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs3_ftypes[] = { +static const u32 nfs3_ftypes[] = { NF3NON, NF3FIFO, NF3CHR, NF3BAD, NF3DIR, NF3BAD, NF3BLK, NF3BAD, NF3REG, NF3BAD, NF3LNK, NF3BAD, @@ -33,9 +46,11 @@ static u32 nfs3_ftypes[] = { */ static __be32 * -encode_time3(__be32 *p, struct timespec64 *time) +encode_nfstime3(__be32 *p, const struct timespec64 *time) { - *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + *p++ = cpu_to_be32((u32)time->tv_sec); + *p++ = cpu_to_be32(time->tv_nsec); + return p; } @@ -82,14 +97,80 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_encode_nfsstat3 - Encode an NFSv3 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; +} + +static bool +svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) { - unsigned int size = fhp->fh_handle.fh_size; - *p++ = htonl(size); - if (size) p[XDR_QUADLEN(size)-1]=0; + u32 size = fhp->fh_handle.fh_size; + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + size); + if (!p) + return false; + *p++ = cpu_to_be32(size); + if (size) + p[XDR_QUADLEN(size) - 1] = 0; memcpy(p, &fhp->fh_handle.fh_base, size); - return p + XDR_QUADLEN(size); + + return true; +} + +static bool +svcxdr_encode_post_op_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) +{ + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_nfs_fh3(xdr, fhp)) + return false; + + return true; +} + +static bool +svcxdr_encode_cookieverf3(struct xdr_stream *xdr, const __be32 *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_COOKIEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_COOKIEVERFSIZE); + + return true; +} + +static bool +svcxdr_encode_writeverf3(struct xdr_stream *xdr, const __be32 *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_WRITEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_WRITEVERFSIZE); + + return true; } static bool @@ -253,115 +334,157 @@ svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, svcxdr_decode_specdata3(xdr, args); } -static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { - u64 f; + struct user_namespace *userns = nfsd_user_namespace(rqstp); + __be32 *p; + u64 fsid; + + p = xdr_reserve_space(xdr, XDR_UNIT * 21); + if (!p) + return false; + + *p++ = cpu_to_be32(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = cpu_to_be32((u32)(stat->mode & S_IALLUGO)); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) + p = xdr_encode_hyper(p, (u64)NFS3_MAXPATHLEN); + else + p = xdr_encode_hyper(p, (u64)stat->size); + + /* used */ + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + + /* rdev */ + *p++ = cpu_to_be32((u32)MAJOR(stat->rdev)); + *p++ = cpu_to_be32((u32)MINOR(stat->rdev)); + switch(fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_sb->s_dev)); - break; case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + fsid = (u64)fhp->fh_export->ex_fsid; break; case FSIDSOURCE_UUID: - f = ((u64*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; - p = xdr_encode_hyper(p, f); + fsid = ((u64 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; break; + default: + fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); } - return p; -} + p = xdr_encode_hyper(p, fsid); -static __be32 * -encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) -{ - struct user_namespace *userns = nfsd_user_namespace(rqstp); - *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) (stat->mode & S_IALLUGO)); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); - *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); - if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { - p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); - } else { - p = xdr_encode_hyper(p, (u64) stat->size); - } - p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); - *p++ = htonl((u32) MAJOR(stat->rdev)); - *p++ = htonl((u32) MINOR(stat->rdev)); - p = encode_fsid(p, fhp); + /* fileid */ p = xdr_encode_hyper(p, stat->ino); - p = encode_time3(p, &stat->atime); - p = encode_time3(p, &stat->mtime); - p = encode_time3(p, &stat->ctime); - return p; + p = encode_nfstime3(p, &stat->atime); + p = encode_nfstime3(p, &stat->mtime); + encode_nfstime3(p, &stat->ctime); + + return true; } -static __be32 * -encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - /* Attributes to follow */ - *p++ = xdr_one; - return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + p = xdr_encode_hyper(p, (u64)fhp->fh_pre_size); + p = encode_nfstime3(p, &fhp->fh_pre_mtime); + encode_nfstime3(p, &fhp->fh_pre_ctime); + + return true; } -/* - * Encode post-operation attributes. - * The inode may be NULL if the call failed because of a stale file - * handle. In this case, no attributes are returned. - */ -static __be32 * -encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_pre_op_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) { - __be32 err; - struct kstat stat; - - err = fh_getattr(fhp, &stat); - if (!err) { - *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(d_inode(dentry), &stat.mtime); - return encode_fattr3(rqstp, p, fhp, &stat); - } + if (!fhp->fh_pre_saved) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; } - *p++ = xdr_zero; - return p; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + return svcxdr_encode_wcc_attr(xdr, fhp); } -/* Helper for NFSv3 ACLs */ -__be32 * -nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +/** + * svcxdr_encode_post_op_attr - Encode NFSv3 post-op attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - return encode_post_op_attr(rqstp, p, fhp); + struct dentry *dentry = fhp->fh_dentry; + struct kstat stat; + + /* + * The inode may be NULL if the call failed because of a + * stale file handle. In this case, no attributes are + * returned. + */ + if (fhp->fh_no_wcc || !dentry || !d_really_is_positive(dentry)) + goto no_post_op_attrs; + if (fh_getattr(fhp, &stat) != nfs_ok) + goto no_post_op_attrs; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + lease_get_mtime(d_inode(dentry), &stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &stat)) + return false; + + return true; + +no_post_op_attrs: + return xdr_stream_encode_item_absent(xdr) > 0; } /* - * Enocde weak cache consistency data + * Encode weak cache consistency data */ -static __be32 * -encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - - if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { - if (fhp->fh_pre_saved) { - *p++ = xdr_one; - p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); - p = encode_time3(p, &fhp->fh_pre_mtime); - p = encode_time3(p, &fhp->fh_pre_ctime); - } else { - *p++ = xdr_zero; - } - return encode_saved_post_attr(rqstp, p, fhp); - } - /* no pre- or post-attrs */ - *p++ = xdr_zero; - return encode_post_op_attr(rqstp, p, fhp); + struct dentry *dentry = fhp->fh_dentry; + + if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved) + goto neither; + + /* before */ + if (!svcxdr_encode_pre_op_attr(xdr, fhp)) + return false; + + /* after */ + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &fhp->fh_post_attr)) + return false; + + return true; + +neither: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, fhp)) + return false; + + return true; } static bool fs_supports_change_attribute(struct super_block *sb) @@ -713,210 +836,252 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) /* GETATTR */ int -nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +nfs3svc_encode_getattrres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - lease_get_mtime(d_inode(resp->fh.fh_dentry), - &resp->stat.mtime); - p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* SETATTR, REMOVE, RMDIR */ int nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh); } /* LOOKUP */ -int -nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +int nfs3svc_encode_lookupres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return 0; } - p = encode_post_op_attr(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return 1; } /* ACCESS */ int nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_accessres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) - *p++ = htonl(resp->access); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* READLINK */ int nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readlinkres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; - return 1; - } else - return xdr_ressize_check(rqstp, p); + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, 0, resp->len); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* READ */ int nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->eof); - *p++ = htonl(resp->count); /* xdr opaque count */ - xdr_ressize_check(rqstp, p); - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, - resp->count)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; - return 1; - } else - return xdr_ressize_check(rqstp, p); + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->eof) < 0) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* WRITE */ int nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_writeres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->committed); - *p++ = resp->verf[0]; - *p++ = resp->verf[1]; + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + if (xdr_stream_encode_u32(xdr, resp->committed) < 0) + return 0; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* CREATE, MKDIR, SYMLINK, MKNOD */ int nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - *p++ = xdr_one; - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return 0; } - p = encode_wcc_data(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return 1; } /* RENAME */ int nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_renameres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->ffh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->ffh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* LINK */ int nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_linkres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* READDIR */ int nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - - if (resp->status == 0) { - /* stupid readdir cookie */ - memcpy(p, resp->verf, 8); p += 2; - xdr_ressize_check(rqstp, p); - if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) - return 1; /*No room for trailer */ - rqstp->rq_res.page_len = (resp->count) << 2; - - /* add the 'tail' to the end of the 'head' page - page 0. */ - rqstp->rq_res.tail[0].iov_base = p; - *p++ = 0; /* no more entries */ - *p++ = htonl(resp->common.err == nfserr_eof); - rqstp->rq_res.tail[0].iov_len = 2<<2; - return 1; - } else - return xdr_ressize_check(rqstp, p); -} - -static __be32 * -encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, - int namlen, u64 ino) -{ - *p++ = xdr_one; /* mark entry present */ - p = xdr_encode_hyper(p, ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - - cd->offset = p; /* remember pointer */ - p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) + return 0; + xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } - return p; + return 1; } static __be32 @@ -957,267 +1122,327 @@ out: return rv; } -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) +/** + * nfs3svc_encode_cookie3 - Encode a directory offset cookie + * @resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry3_common(). + */ +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset) { - struct svc_fh *fh = &cd->scratch; - __be32 err; - - fh_init(fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, fh, name, namlen, ino); - if (err) { - *p++ = 0; - *p++ = 0; - goto out; - } - p = encode_post_op_attr(cd->rqstp, p, fh); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fh); -out: - fh_put(fh); - return p; -} + __be64 cookie = cpu_to_be64(offset); -/* - * Encode a directory entry. This one works for both normal readdir - * and readdirplus. - * The normal readdir reply requires 2 (fileid) + 1 (stringlen) - * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. - * - * The readdirplus baggage is 1+21 words for post_op_attr, plus the - * file handle. - */ + if (!resp->cookie_offset) + return; + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} -#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) -#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) -static int -encode_entry(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type, int plus) +static bool +svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) { - struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, - common); - __be32 *p = cd->buffer; - caddr_t curr_page_addr = NULL; - struct page ** page; - int slen; /* string (name) length */ - int elen; /* estimated entry length in words */ - int num_entry_words = 0; /* actual number of words */ - - if (cd->offset) { - u64 offset64 = offset; - - if (unlikely(cd->offset1)) { - /* we ended up with offset on a page boundary */ - *cd->offset = htonl(offset64 >> 32); - *cd->offset1 = htonl(offset64 & 0xffffffff); - cd->offset1 = NULL; - } else { - xdr_encode_hyper(cd->offset, offset64); - } - cd->offset = NULL; - } + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; - /* - dprintk("encode_entry(%.*s @%ld%s)\n", - namlen, name, (long) offset, plus? " plus" : ""); - */ - - /* truncate filename if too long */ - namlen = min(namlen, NFS3_MAXNAMLEN); + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u64(xdr, ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS3_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0) + return false; - slen = XDR_QUADLEN(namlen); - elen = slen + NFS3_ENTRY_BAGGAGE - + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + return true; +} - if (cd->buflen < elen) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +/** + * nfs3svc_encode_entry3 - encode one NFSv3 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; - /* determine which page in rq_respages[] we are currently filling */ - for (page = cd->rqstp->rq_respages + 1; - page < cd->rqstp->rq_next_page; page++) { - curr_page_addr = page_address(*page); + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)cd->buffer >= curr_page_addr) && - ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) - break; - } + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; - if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { - /* encode entry in current page */ + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; + return 0; - p = encode_entry_baggage(cd, p, name, namlen, ino); +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; +} - if (plus) - p = encode_entryplus_baggage(cd, p, name, namlen, ino); - num_entry_words = p - cd->buffer; - } else if (*(page+1) != NULL) { - /* temporarily encode entry into next page, then move back to - * current and next page in rq_respages[] */ - __be32 *p1, *tmp; - int len1, len2; +static bool +svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name, + int namlen, u64 ino) +{ + struct xdr_stream *xdr = &resp->xdr; + struct svc_fh *fhp = &resp->scratch; + bool result; - /* grab next page for temporary storage of entry */ - p1 = tmp = page_address(*(page+1)); + result = false; + fh_init(fhp, NFS3_FHSIZE); + if (compose_entry_fh(resp, fhp, name, namlen, ino) != nfs_ok) + goto out_noattrs; - p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + if (!svcxdr_encode_post_op_attr(resp->rqstp, xdr, fhp)) + goto out; + if (!svcxdr_encode_post_op_fh3(xdr, fhp)) + goto out; + result = true; - if (plus) - p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); +out: + fh_put(fhp); + return result; - /* determine entry word length and lengths to go in pages */ - num_entry_words = p1 - tmp; - len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; - if ((num_entry_words << 2) < len1) { - /* the actual number of words in the entry is less - * than elen and can still fit in the current page - */ - memmove(p, tmp, num_entry_words << 2); - p += num_entry_words; - - /* update offset */ - cd->offset = cd->buffer + (cd->offset - tmp); - } else { - unsigned int offset_r = (cd->offset - tmp) << 2; - - /* update pointer to offset location. - * This is a 64bit quantity, so we need to - * deal with 3 cases: - * - entirely in first page - * - entirely in second page - * - 4 bytes in each page - */ - if (offset_r + 8 <= len1) { - cd->offset = p + (cd->offset - tmp); - } else if (offset_r >= len1) { - cd->offset -= len1 >> 2; - } else { - /* sitting on the fence */ - BUG_ON(offset_r != len1 - 4); - cd->offset = p + (cd->offset - tmp); - cd->offset1 = tmp; - } - - len2 = (num_entry_words << 2) - len1; - - /* move from temp page to current and next pages */ - memmove(p, tmp, len1); - memmove(tmp, (caddr_t)tmp+len1, len2); - - p = tmp + (len2 >> 2); - } - } - else { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +out_noattrs: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; +} - cd->buflen -= num_entry_words; - cd->buffer = p; - cd->common.err = nfs_ok; +/** + * nfs3svc_encode_entryplus3 - encode one NFSv3 READDIRPLUS entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); + + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + if (!svcxdr_encode_entry3_plus(resp, name, namlen, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } -int -nfs3svc_encode_entry(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +static bool +svcxdr_encode_fsstat3resok(struct xdr_stream *xdr, + const struct nfsd3_fsstatres *resp) { - return encode_entry(cd, name, namlen, offset, ino, d_type, 0); -} + const struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + __be32 *p; -int -nfs3svc_encode_entry_plus(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - return encode_entry(cd, name, namlen, offset, ino, d_type, 1); + p = xdr_reserve_space(xdr, XDR_UNIT * 13); + if (!p) + return false; + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p = cpu_to_be32(resp->invarsec); /* mean unchanged time */ + + return true; } /* FSSTAT */ int nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - struct kstatfs *s = &resp->stats; - u64 bs = s->f_bsize; - - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ - p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ - p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ - p = xdr_encode_hyper(p, s->f_files); /* total inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ - *p++ = htonl(resp->invarsec); /* mean unchanged time */ + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_fsstat3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; +} + +static bool +svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr, + const struct nfsd3_fsinfores *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 12); + if (!p) + return false; + *p++ = cpu_to_be32(resp->f_rtmax); + *p++ = cpu_to_be32(resp->f_rtpref); + *p++ = cpu_to_be32(resp->f_rtmult); + *p++ = cpu_to_be32(resp->f_wtmax); + *p++ = cpu_to_be32(resp->f_wtpref); + *p++ = cpu_to_be32(resp->f_wtmult); + *p++ = cpu_to_be32(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + p = encode_nfstime3(p, &nfs3svc_time_delta); + *p = cpu_to_be32(resp->f_properties); + + return true; } /* FSINFO */ int nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_fsinfores *resp = rqstp->rq_resp; - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->f_rtmax); - *p++ = htonl(resp->f_rtpref); - *p++ = htonl(resp->f_rtmult); - *p++ = htonl(resp->f_wtmax); - *p++ = htonl(resp->f_wtpref); - *p++ = htonl(resp->f_wtmult); - *p++ = htonl(resp->f_dtpref); - p = xdr_encode_hyper(p, resp->f_maxfilesize); - *p++ = xdr_one; - *p++ = xdr_zero; - *p++ = htonl(resp->f_properties); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_fsinfo3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + return 1; +} + +static bool +svcxdr_encode_pathconf3resok(struct xdr_stream *xdr, + const struct nfsd3_pathconfres *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + *p++ = cpu_to_be32(resp->p_link_max); + *p++ = cpu_to_be32(resp->p_name_max); + p = xdr_encode_bool(p, resp->p_no_trunc); + p = xdr_encode_bool(p, resp->p_chown_restricted); + p = xdr_encode_bool(p, resp->p_case_insensitive); + xdr_encode_bool(p, resp->p_case_preserving); + + return true; } /* PATHCONF */ int nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_pathconfres *resp = rqstp->rq_resp; - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->p_link_max); - *p++ = htonl(resp->p_name_max); - *p++ = htonl(resp->p_no_trunc); - *p++ = htonl(resp->p_chown_restricted); - *p++ = htonl(resp->p_case_insensitive); - *p++ = htonl(resp->p_case_preserving); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_pathconf3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + return 1; } /* COMMIT */ int nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_commitres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - /* Write verifier */ - if (resp->status == 0) { - *p++ = resp->verf[0]; - *p++ = resp->verf[1]; + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 052be5bf9ef5..7325592b456e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1189,6 +1189,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -EIO: case -ETIMEDOUT: + case -EACCES: nfsd4_mark_cb_down(clp, task->tk_status); } break; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index acdb3cd806a1..daf43b980d4b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1302,7 +1302,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { nfs42_ssc_close(src->nf_file); - /* 'src' is freed by nfsd4_do_async_copy */ + fput(src->nf_file); nfsd_file_put(dst); mntput(ss_mnt); } @@ -1383,10 +1383,13 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) { ssize_t bytes_copied = 0; - size_t bytes_total = copy->cp_count; + u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; + /* See RFC 7862 p.67: */ + if (bytes_total == 0) + bytes_total = ULLONG_MAX; do { if (kthread_should_stop()) break; @@ -1538,8 +1541,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfs4_init_copy_state(nn, copy)) goto out_err; refcount_set(&async_copy->refcount, 1); - memcpy(©->cp_res.cb_stateid, ©->cp_stateid, - sizeof(copy->cp_stateid)); + memcpy(©->cp_res.cb_stateid, ©->cp_stateid.stid, + sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); @@ -2262,25 +2265,6 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } -static void svcxdr_init_encode(struct svc_rqst *rqstp, - struct nfsd4_compoundres *resp) -{ - struct xdr_stream *xdr = &resp->xdr; - struct xdr_buf *buf = &rqstp->rq_res; - struct kvec *head = buf->head; - - xdr->buf = buf; - xdr->iov = head; - xdr->p = head->iov_base + head->iov_len; - xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; - /* Tail and page_len should be zero at this point: */ - buf->len = buf->head[0].iov_len; - xdr_reset_scratch_buffer(xdr); - xdr->page_ptr = buf->pages - 1; - buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - - rqstp->rq_auth_slack; -} - #ifdef CONFIG_NFSD_V4_2_INTER_SSC static void check_if_stalefh_allowed(struct nfsd4_compoundargs *args) @@ -2335,10 +2319,14 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); __be32 status; - svcxdr_init_encode(rqstp, resp); - resp->tagp = resp->xdr.p; + resp->xdr = &rqstp->rq_res_stream; + + /* reserve space for: NFS status code */ + xdr_reserve_space(resp->xdr, XDR_UNIT); + + resp->tagp = resp->xdr->p; /* reserve space for: taglen, tag, and opcnt */ - xdr_reserve_space(&resp->xdr, 8 + args->taglen); + xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen); resp->taglen = args->taglen; resp->tag = args->tag; resp->rqstp = rqstp; @@ -2444,7 +2432,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) encode_op: if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(&resp->xdr, op); + nfsd4_encode_replay(resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 891395c6c7d3..6fedc49726bf 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -626,7 +626,7 @@ nfsd4_legacy_tracking_init(struct net *net) status = nfsd4_load_reboot_recovery_data(net); if (status) goto err; - printk("NFSD: Using legacy client tracking operations.\n"); + pr_info("NFSD: Using legacy client tracking operations.\n"); return 0; err: @@ -1028,7 +1028,7 @@ nfsd4_init_cld_pipe(struct net *net) status = __nfsd4_init_cld_pipe(net); if (!status) - printk("NFSD: Using old nfsdcld client tracking operations.\n"); + pr_info("NFSD: Using old nfsdcld client tracking operations.\n"); return status; } @@ -1605,7 +1605,7 @@ nfsd4_cld_tracking_init(struct net *net) nfs4_release_reclaim(nn); goto err_remove; } else - printk("NFSD: Using nfsdcld client tracking operations.\n"); + pr_info("NFSD: Using nfsdcld client tracking operations.\n"); return 0; err_remove: @@ -1864,7 +1864,7 @@ nfsd4_umh_cltrack_init(struct net *net) ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); kfree(grace_start); if (!ret) - printk("NFSD: Using UMH upcall client tracking operations.\n"); + pr_info("NFSD: Using UMH upcall client tracking operations.\n"); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 423fd6683f3a..7698172ac0c7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -43,6 +43,7 @@ #include <linux/sunrpc/addr.h> #include <linux/jhash.h> #include <linux/string_helpers.h> +#include <linux/fsnotify.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -2352,6 +2353,10 @@ static int client_info_show(struct seq_file *m, void *v) memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + seq_puts(m, "status: confirmed\n"); + else + seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); @@ -2702,6 +2707,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, int ret; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct dentry *dentries[ARRAY_SIZE(client_files)]; clp = alloc_client(name); if (clp == NULL) @@ -2721,9 +2727,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name, memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; clp->net = net; - clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs, - clp->cl_clientid.cl_id - nn->clientid_base, - client_files); + clp->cl_nfsd_dentry = nfsd_client_mkdir( + nn, &clp->cl_nfsdfs, + clp->cl_clientid.cl_id - nn->clientid_base, + client_files, dentries); + clp->cl_nfsd_info_dentry = dentries[0]; if (!clp->cl_nfsd_dentry) { free_client(clp); return NULL; @@ -2798,7 +2806,10 @@ move_to_confirmed(struct nfs4_client *clp) list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); - set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) && + clp->cl_nfsd_dentry && + clp->cl_nfsd_info_dentry) + fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY); renew_client_locked(clp); } @@ -2903,7 +2914,7 @@ out_err: static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { - struct xdr_buf *buf = resp->xdr.buf; + struct xdr_buf *buf = resp->xdr->buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; @@ -2973,7 +2984,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; __be32 status; @@ -3708,7 +3719,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_sequence *seq = &u->sequence; struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; @@ -4940,31 +4951,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return fl; } -static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, - struct nfs4_file *fp) -{ - struct nfs4_clnt_odstate *co; - struct file *f = fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); - int writes = atomic_read(&ino->i_writecount); - - if (fp->fi_fds[O_WRONLY]) - writes--; - if (fp->fi_fds[O_RDWR]) - writes--; - if (writes > 0) - return -EAGAIN; - spin_lock(&fp->fi_lock); - list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { - if (co->co_client != clp) { - spin_unlock(&fp->fi_lock); - return -EAGAIN; - } - } - spin_unlock(&fp->fi_lock); - return 0; -} - static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) @@ -4984,12 +4970,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, nf = find_readable_file(fp); if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return ERR_PTR(-EBADF); } spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5019,19 +5002,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = nfsd4_check_conflicting_opens(clp, fp); - if (status) { - locks_free_lock(fl); - goto out_clnt_odstate; - } status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) goto out_clnt_odstate; - status = nfsd4_check_conflicting_opens(clp, fp); - if (status) - goto out_clnt_odstate; spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5113,6 +5088,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + /* + * Also, if the file was opened for write or + * create, there's a good chance the client's + * about to write to it, resulting in an + * immediate recall (since we don't support + * write delegations): + */ + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + goto out_no_deleg; + if (open->op_create == NFS4_OPEN_CREATE) + goto out_no_deleg; break; default: goto out_no_deleg; @@ -5363,6 +5349,22 @@ static bool clients_still_reclaiming(struct nfsd_net *nn) return true; } +struct laundry_time { + time64_t cutoff; + time64_t new_timeo; +}; + +static bool state_expired(struct laundry_time *lt, time64_t last_refresh) +{ + time64_t time_remaining; + + if (last_refresh < lt->cutoff) + return true; + time_remaining = last_refresh - lt->cutoff; + lt->new_timeo = min(lt->new_timeo, time_remaining); + return false; +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5372,14 +5374,16 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_ol_stateid *stp; struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; - time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease; - time64_t t, new_timeo = nn->nfsd4_lease; + struct laundry_time lt = { + .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease, + .new_timeo = nn->nfsd4_lease + }; struct nfs4_cpntf_state *cps; copy_stateid_t *cps_t; int i; if (clients_still_reclaiming(nn)) { - new_timeo = 0; + lt.new_timeo = 0; goto out; } nfsd4_end_grace(nn); @@ -5389,7 +5393,7 @@ nfs4_laundromat(struct nfsd_net *nn) idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && - cps->cpntf_time > cutoff) + state_expired(<, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); @@ -5397,11 +5401,8 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (clp->cl_time > cutoff) { - t = clp->cl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, clp->cl_time)) break; - } if (mark_client_expired_locked(clp)) { trace_nfsd_clid_expired(&clp->cl_clientid); continue; @@ -5418,11 +5419,8 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (dp->dl_time > cutoff) { - t = dp->dl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, dp->dl_time)) break; - } WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } @@ -5438,11 +5436,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->close_lru)) { oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, oo_close_lru); - if (oo->oo_time > cutoff) { - t = oo->oo_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, oo->oo_time)) break; - } list_del_init(&oo->oo_close_lru); stp = oo->oo_last_closed_stid; oo->oo_last_closed_stid = NULL; @@ -5468,11 +5463,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); - if (nbl->nbl_time > cutoff) { - t = nbl->nbl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, nbl->nbl_time)) break; - } list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } @@ -5485,8 +5477,7 @@ nfs4_laundromat(struct nfsd_net *nn) free_blocked_lock(nbl); } out: - new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); - return new_timeo; + return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } static struct workqueue_struct *laundry_wq; @@ -7346,14 +7337,9 @@ nfs4_state_start_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - ret = get_nfsdfs(net); - if (ret) - return ret; ret = nfs4_state_create_net(net); - if (ret) { - mntput(nn->nfsd_mnt); + if (ret) return ret; - } locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) @@ -7423,7 +7409,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - mntput(nn->nfsd_mnt); } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index eaaa1605b5b5..e0f06d3cbd44 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3581,7 +3581,7 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 8); @@ -3594,7 +3594,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); @@ -3611,7 +3611,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); } @@ -3620,7 +3620,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); @@ -3634,7 +3634,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -3649,7 +3649,7 @@ static __be32 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) { struct svc_fh *fhp = getattr->ga_fhp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, getattr->ga_bmval, resp->rqstp, 0); @@ -3658,7 +3658,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; __be32 *p; @@ -3713,7 +3713,7 @@ again: static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; if (!nfserr) nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); @@ -3726,7 +3726,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); @@ -3736,7 +3736,7 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); } @@ -3745,7 +3745,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -3759,7 +3759,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); @@ -3853,7 +3853,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } @@ -3861,7 +3861,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); } @@ -3871,7 +3871,7 @@ static __be32 nfsd4_encode_splice_read( struct nfsd4_read *read, struct file *file, unsigned long maxcount) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct xdr_buf *buf = xdr->buf; int status, space_left; u32 eof; @@ -3937,7 +3937,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read, struct file *file, unsigned long maxcount) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; u32 eof; int starting_len = xdr->buf->len - 8; __be32 nfserr; @@ -3976,7 +3976,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { unsigned long maxcount; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file; int starting_len = xdr->buf->len; __be32 *p; @@ -3990,7 +3990,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); return nfserr_resource; } - if (resp->xdr.buf->page_len && + if (resp->xdr->buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); return nfserr_resource; @@ -4020,7 +4020,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd int maxcount; __be32 wire_count; int zero = 0; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; int status; __be32 *p; @@ -4072,7 +4072,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 int bytes_left; loff_t offset; __be64 wire_offset; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; int starting_len = xdr->buf->len; __be32 *p; @@ -4083,8 +4083,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ *p++ = cpu_to_be32(0); *p++ = cpu_to_be32(0); - resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) - - (char *)resp->xdr.buf->head[0].iov_base; + xdr->buf->head[0].iov_len = (char *)xdr->p - + (char *)xdr->buf->head[0].iov_base; /* * Number of bytes left for directory entries allowing for the @@ -4159,7 +4159,7 @@ err_no_verf: static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -4172,7 +4172,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 40); @@ -4255,7 +4255,7 @@ static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); } @@ -4264,7 +4264,7 @@ static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); } @@ -4276,7 +4276,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 16); @@ -4300,7 +4300,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; if (!nfserr) { @@ -4324,7 +4324,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 16); @@ -4341,7 +4341,7 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; char *server_scope; @@ -4419,7 +4419,7 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 24); @@ -4472,7 +4472,7 @@ static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); @@ -4495,7 +4495,7 @@ static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4516,7 +4516,7 @@ static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4572,7 +4572,7 @@ static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4599,7 +4599,7 @@ static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutcommit *lcp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 4); @@ -4620,7 +4620,7 @@ static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutreturn *lrp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 4); @@ -4638,7 +4638,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write, bool sync) { __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4); + p = xdr_reserve_space(resp->xdr, 4); if (!p) return nfserr_resource; @@ -4647,11 +4647,11 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, else { __be32 nfserr; *p++ = cpu_to_be32(1); - nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); + nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid); if (nfserr) return nfserr; } - p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); + p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; @@ -4665,7 +4665,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, static __be32 nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfs42_netaddr *addr; __be32 *p; @@ -4713,7 +4713,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr) return nfserr; - p = xdr_reserve_space(&resp->xdr, 4 + 4); + p = xdr_reserve_space(resp->xdr, 4 + 4); *p++ = xdr_one; /* cr_consecutive */ *p++ = cpu_to_be32(copy->cp_synchronous); return 0; @@ -4723,7 +4723,7 @@ static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_offload_status *os) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 8 + 4); @@ -4740,7 +4740,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, unsigned long *maxcount, u32 *eof, loff_t *pos) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file = read->rd_nf->nf_file; int starting_len = xdr->buf->len; loff_t hole_pos; @@ -4799,7 +4799,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, count = data_pos - read->rd_offset; /* Content type, offset, byte count */ - p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8); + p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); if (!p) return nfserr_resource; @@ -4817,7 +4817,7 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { unsigned long maxcount, count; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file; int starting_len = xdr->buf->len; int last_segment = xdr->buf->len; @@ -4888,7 +4888,7 @@ static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_copy_notify *cn) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; if (nfserr) @@ -4924,7 +4924,7 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4 + 8); + p = xdr_reserve_space(resp->xdr, 4 + 8); *p++ = cpu_to_be32(seek->seek_eof); p = xdr_encode_hyper(p, seek->seek_pos); @@ -4985,7 +4985,7 @@ static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getxattr *getxattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p, err; p = xdr_reserve_space(xdr, 4); @@ -5009,7 +5009,7 @@ static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setxattr *setxattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -5050,7 +5050,7 @@ static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_listxattrs *listxattrs) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; u32 xdrlen, offset; @@ -5161,7 +5161,7 @@ static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_removexattr *removexattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -5301,7 +5301,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; struct svc_rqst *rqstp = resp->rqstp; const struct nfsd4_operation *opdesc = op->opdesc; @@ -5430,14 +5430,14 @@ int nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_buf *buf = resp->xdr.buf; + struct xdr_buf *buf = resp->xdr->buf; WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len); *p = resp->cstate.status; - rqstp->rq_next_page = resp->xdr.page_ptr + 1; + rqstp->rq_next_page = resp->xdr->page_ptr + 1; p = resp->tagp; *p++ = htonl(resp->taglen); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ef86ed23af82..853bf50a2a9b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1266,7 +1266,8 @@ static void nfsdfs_remove_files(struct dentry *root) /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, - const struct tree_descr *files) + const struct tree_descr *files, + struct dentry **fdentries) { struct inode *dir = d_inode(root); struct inode *inode; @@ -1275,8 +1276,6 @@ static int nfsdfs_create_files(struct dentry *root, inode_lock(dir); for (i = 0; files->name && files->name[0]; i++, files++) { - if (!files->name) - continue; dentry = d_alloc_name(root, files->name); if (!dentry) goto out; @@ -1290,6 +1289,8 @@ static int nfsdfs_create_files(struct dentry *root, inode->i_private = __get_nfsdfs_client(dir); d_add(dentry, inode); fsnotify_create(dir, dentry); + if (fdentries) + fdentries[i] = dentry; } inode_unlock(dir); return 0; @@ -1301,8 +1302,9 @@ out: /* on success, returns positive number unique to that client. */ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, - struct nfsdfs_client *ncl, u32 id, - const struct tree_descr *files) + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *files, + struct dentry **fdentries) { struct dentry *dentry; char name[11]; @@ -1313,7 +1315,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; - ret = nfsdfs_create_files(dentry, files); + ret = nfsdfs_create_files(dentry, files, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; @@ -1416,6 +1418,8 @@ static void nfsd_umount(struct super_block *sb) { struct net *net = sb->s_fs_info; + nfsd_shutdown_threads(net); + kill_litter_super(sb); put_net(net); } @@ -1428,18 +1432,6 @@ static struct file_system_type nfsd_fs_type = { }; MODULE_ALIAS_FS("nfsd"); -int get_nfsdfs(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct vfsmount *mnt; - - mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - nn->nfsd_mnt = mnt; - return 0; -} - #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8bdc37aa2c2e..14dbfa75059d 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -93,13 +93,12 @@ int nfsd_get_nrthreads(int n, int *, struct net *); int nfsd_set_nrthreads(int n, int *, struct net *); int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); +void nfsd_shutdown_threads(struct net *net); void nfsd_destroy(struct net *net); bool i_am_nfsd(void); -int get_nfsdfs(struct net *); - struct nfsdfs_client { struct kref cl_ref; void (*cl_release)(struct kref *kref); @@ -107,7 +106,9 @@ struct nfsdfs_client { struct nfsdfs_client *get_nfsdfs_client(struct inode *); struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, - struct nfsdfs_client *ncl, u32 id, const struct tree_descr *); + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *, + struct dentry **fdentries); void nfsd_client_rmdir(struct dentry *dentry); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 10b44421eace..c475d2271f9c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -711,7 +711,7 @@ char * SVCFH_fmt(struct svc_fh *fhp) return buf; } -enum fsid_source fsid_source(struct svc_fh *fhp) +enum fsid_source fsid_source(const struct svc_fh *fhp) { if (fhp->fh_handle.fh_version != 1) return FSIDSOURCE_DEV; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f58933519f38..aff2cda5c6c3 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -82,7 +82,7 @@ enum fsid_source { FSIDSOURCE_FSID, FSIDSOURCE_UUID, }; -extern enum fsid_source fsid_source(struct svc_fh *fhp); +extern enum fsid_source fsid_source(const struct svc_fh *fhp); /* diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a8d5449dd0e9..60d7c59e7935 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -151,13 +151,14 @@ nfsd_proc_readlink(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; - char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len); + resp->page = *(rqstp->rq_next_page++); + resp->status = nfsd_readlink(rqstp, &argp->fh, + page_address(resp->page), &resp->len); fh_put(&argp->fh); return rpc_success; @@ -184,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) v = 0; len = argp->count; + resp->pages = rqstp->rq_next_page; while (len > 0) { struct page *page = *(rqstp->rq_next_page++); @@ -381,7 +383,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) /* Make sure the type and device matches */ resp->status = nfserr_exist; - if (inode && type != (inode->i_mode & S_IFMT)) + if (inode && inode_wrong_type(inode, type)) goto out_unlock; } @@ -557,14 +559,27 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd_readdirres *resp, int count) { + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + count = min_t(u32, count, PAGE_SIZE); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->buflen = (count >> 2) - 2; + memset(buf, 0, sizeof(*buf)); - resp->buffer = page_address(*rqstp->rq_next_page); + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = count - sizeof(__be32) * 2; + buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; + + /* This is xdr_init_encode(), but it assumes that + * the head kvec has already been consumed. */ + xdr_set_scratch_buffer(xdr, NULL, 0); + xdr->buf = buf; + xdr->page_ptr = buf->pages; + xdr->iov = NULL; + xdr->p = page_address(*buf->pages); + xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->rqst = NULL; } /* @@ -576,25 +591,19 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; loff_t offset; - __be32 *buffer; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->cookie); nfsd_init_dirlist_pages(rqstp, resp, argp->count); - buffer = resp->buffer; - resp->offset = NULL; resp->common.err = nfs_ok; - /* Read directory and encode entries on the fly */ + resp->cookie_offset = 0; offset = argp->cookie; resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, &resp->common, nfssvc_encode_entry); - - resp->count = resp->buffer - buffer; - if (resp->offset) - *resp->offset = htonl(offset); + nfssvc_encode_nfscookie(resp, offset); fh_put(&argp->fh); return rpc_success; @@ -640,7 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, .pc_decode = nfssvc_decode_fhandleargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -651,7 +660,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, .pc_decode = nfssvc_decode_sattrargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -714,7 +723,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, .pc_decode = nfssvc_decode_writeargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -736,7 +745,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -746,7 +755,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, .pc_decode = nfssvc_decode_renameargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_renameargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -756,7 +765,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, .pc_decode = nfssvc_decode_linkargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_linkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -766,7 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, .pc_decode = nfssvc_decode_symlinkargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_symlinkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -787,7 +796,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6de406322106..82ba034fa579 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -84,7 +84,7 @@ DEFINE_MUTEX(nfsd_mutex); * version 4.1 DRC caches. * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ -spinlock_t nfsd_drc_lock; +DEFINE_SPINLOCK(nfsd_drc_lock); unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; @@ -563,7 +563,6 @@ static void set_max_drc(void) nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; - spin_lock_init(&nfsd_drc_lock); dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); } @@ -596,6 +595,37 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_module = THIS_MODULE, }; +static void nfsd_complete_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + nn->nfsd_serv = NULL; + complete(&nn->nfsd_shutdown_complete); +} + +void nfsd_shutdown_threads(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; + + mutex_lock(&nfsd_mutex); + serv = nn->nfsd_serv; + if (serv == NULL) { + mutex_unlock(&nfsd_mutex); + return; + } + + svc_get(serv); + /* Kill outstanding nfsd threads */ + serv->sv_ops->svo_setup(serv, NULL, 0); + nfsd_destroy(net); + mutex_unlock(&nfsd_mutex); + /* Wait for shutdown of nfsd_serv to complete */ + wait_for_completion(&nn->nfsd_shutdown_complete); +} + bool i_am_nfsd(void) { return kthread_func(current) == nfsd; @@ -618,11 +648,13 @@ int nfsd_create_serv(struct net *net) &nfsd_thread_sv_ops); if (nn->nfsd_serv == NULL) return -ENOMEM; + init_completion(&nn->nfsd_shutdown_complete); nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); if (error < 0) { svc_destroy(nn->nfsd_serv); + nfsd_complete_shutdown(net); return error; } @@ -671,7 +703,7 @@ void nfsd_destroy(struct net *net) svc_shutdown_net(nn->nfsd_serv, net); svc_destroy(nn->nfsd_serv); if (destroy) - nn->nfsd_serv = NULL; + nfsd_complete_shutdown(net); } int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) @@ -997,7 +1029,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) * NFSv4 does some encoding while processing */ p = resv->iov_base + resv->iov_len; - resv->iov_len += sizeof(__be32); + svcxdr_init_encode(rqstp); *statp = proc->pc_func(rqstp); if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) @@ -1052,7 +1084,7 @@ int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) */ int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + return 1; } int nfsd_pool_stats_open(struct inode *inode, struct file *file) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5d79ef6a0c7f..a06c05fe3b42 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -9,12 +9,10 @@ #include "xdr.h" #include "auth.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR - /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs_ftypes[] = { +static const u32 nfs_ftypes[] = { NFNON, NFCHR, NFCHR, NFBAD, NFDIR, NFBAD, NFBLK, NFBAD, NFREG, NFBAD, NFLNK, NFBAD, @@ -27,6 +25,28 @@ static u32 nfs_ftypes[] = { */ /** + * svcxdr_encode_stat - Encode an NFSv2 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; +} + +/** * svcxdr_decode_fhandle - Decode an NFSv2 file handle * @xdr: XDR stream positioned at an encoded NFSv2 FH * @fhp: OUT: filled-in server file handle @@ -50,11 +70,28 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp) { + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_FHSIZE); + if (!p) + return false; memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); - return p + (NFS_FHSIZE>> 2); + + return true; +} + +static __be32 * +encode_timeval(__be32 *p, const struct timespec64 *time) +{ + *p++ = cpu_to_be32((u32)time->tv_sec); + if (time->tv_nsec) + *p++ = cpu_to_be32(time->tv_nsec / NSEC_PER_USEC); + else + *p++ = xdr_zero; + return p; } static bool @@ -162,68 +199,73 @@ svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, return true; } -static __be32 * -encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) +/** + * svcxdr_encode_fattr - Encode NFSv2 file attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * @stat: Attributes to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { struct user_namespace *userns = nfsd_user_namespace(rqstp); - struct dentry *dentry = fhp->fh_dentry; - int type; + struct dentry *dentry = fhp->fh_dentry; + int type = stat->mode & S_IFMT; struct timespec64 time; - u32 f; + __be32 *p; + u32 fsid; - type = (stat->mode & S_IFMT); + p = xdr_reserve_space(xdr, XDR_UNIT * 17); + if (!p) + return false; - *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); - *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); + *p++ = cpu_to_be32(nfs_ftypes[type >> 12]); + *p++ = cpu_to_be32((u32)stat->mode); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); - if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { - *p++ = htonl(NFS_MAXPATHLEN); - } else { - *p++ = htonl((u32) stat->size); - } - *p++ = htonl((u32) stat->blksize); + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) + *p++ = cpu_to_be32(NFS_MAXPATHLEN); + else + *p++ = cpu_to_be32((u32) stat->size); + *p++ = cpu_to_be32((u32) stat->blksize); if (S_ISCHR(type) || S_ISBLK(type)) - *p++ = htonl(new_encode_dev(stat->rdev)); + *p++ = cpu_to_be32(new_encode_dev(stat->rdev)); else - *p++ = htonl(0xffffffff); - *p++ = htonl((u32) stat->blocks); + *p++ = cpu_to_be32(0xffffffff); + *p++ = cpu_to_be32((u32)stat->blocks); + switch (fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - *p++ = htonl(new_encode_dev(stat->dev)); - break; case FSIDSOURCE_FSID: - *p++ = htonl((u32) fhp->fh_export->ex_fsid); + fsid = (u32)fhp->fh_export->ex_fsid; break; case FSIDSOURCE_UUID: - f = ((u32*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; - *p++ = htonl(f); + fsid = ((u32 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[1]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[2]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[3]; + break; + default: + fsid = new_encode_dev(stat->dev); break; } - *p++ = htonl((u32) stat->ino); - *p++ = htonl((u32) stat->atime.tv_sec); - *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); - time = stat->mtime; - lease_get_mtime(d_inode(dentry), &time); - *p++ = htonl((u32) time.tv_sec); - *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); - *p++ = htonl((u32) stat->ctime.tv_sec); - *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0); + *p++ = cpu_to_be32(fsid); - return p; -} + *p++ = cpu_to_be32((u32)stat->ino); + p = encode_timeval(p, &stat->atime); + time = stat->mtime; + lease_get_mtime(d_inode(dentry), &time); + p = encode_timeval(p, &time); + encode_timeval(p, &stat->ctime); -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) -{ - return encode_fattr(rqstp, p, fhp, stat); + return true; } /* @@ -390,106 +432,118 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) */ int -nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) +nfssvc_encode_statres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_stat *resp = rqstp->rq_resp; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_stat(xdr, resp->status); } int -nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +nfssvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; + } + + return 1; } int nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; - p = encode_fh(p, &resp->fh); - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fhandle(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; + } + + return 1; } int nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readlinkres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + if (!svcxdr_encode_stat(xdr, resp->status)) return 0; + switch (resp->status) { + case nfs_ok: + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return 0; + xdr_write_pages(xdr, &resp->page, 0, resp->len); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return 0; + break; + } + return 1; } int nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->count); - xdr_ressize_check(rqstp, p); - - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, resp->count)) + if (!svcxdr_encode_stat(xdr, resp->status)) return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return 0; + break; + } + return 1; } int nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - xdr_ressize_check(rqstp, p); - p = resp->buffer; - *p++ = 0; /* no more entries */ - *p++ = htonl((resp->common.err == nfserr_eof)); - rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return 0; + break; + } return 1; } @@ -497,64 +551,113 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) int nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_statfsres *resp = rqstp->rq_resp; struct kstatfs *stat = &resp->stats; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + p = xdr_reserve_space(xdr, XDR_UNIT * 5); + if (!p) + return 0; + *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2); + *p++ = cpu_to_be32(stat->f_bsize); + *p++ = cpu_to_be32(stat->f_blocks); + *p++ = cpu_to_be32(stat->f_bfree); + *p = cpu_to_be32(stat->f_bavail); + break; + } - *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ - *p++ = htonl(stat->f_bsize); - *p++ = htonl(stat->f_blocks); - *p++ = htonl(stat->f_bfree); - *p++ = htonl(stat->f_bavail); - return xdr_ressize_check(rqstp, p); + return 1; } -int -nfssvc_encode_entry(void *ccdv, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +/** + * nfssvc_encode_nfscookie - Encode a directory offset cookie + * @resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry_common(). + */ +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset) { - struct readdir_cd *ccd = ccdv; - struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); - __be32 *p = cd->buffer; - int buflen, slen; + __be32 cookie = cpu_to_be32(offset); - /* - dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", - namlen, name, offset, ino); - */ + if (!resp->cookie_offset) + return; - if (offset > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - if (cd->offset) - *cd->offset = htonl(offset); + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} - /* truncate filename */ - namlen = min(namlen, NFS2_MAXNAMLEN); - slen = XDR_QUADLEN(namlen); +static bool +svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) +{ + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; - if ((buflen = cd->buflen - slen - 4) < 0) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } - if (ino > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - *p++ = xdr_one; /* mark entry present */ - *p++ = htonl((u32) ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - cd->offset = p; /* remember pointer */ - *p++ = htonl(~0U); /* offset of next entry */ - - cd->buflen = buflen; - cd->buffer = p; - cd->common.err = nfs_ok; + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u32(xdr, (u32)ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS2_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u32(xdr, ~0U) < 0) + return false; + + return true; +} + +/** + * nfssvc_encode_entry - encode one NFSv2 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd_readdirres *resp = container_of(ccd, + struct nfsd_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfssvc_encode_nfscookie(resp, offset); + + if (!svcxdr_encode_entry_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; + +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 73deea353169..54cab651ac1d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -371,6 +371,10 @@ struct nfs4_client { /* debugging info directory under nfsd/clients/ : */ struct dentry *cl_nfsd_dentry; + /* 'info' file within that directory. Ref is not counted, + * but will remain valid iff cl_nfsd_dentry != NULL + */ + struct dentry *cl_nfsd_info_dentry; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 92a0973dd671..27a93ebd1d80 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -391,6 +391,30 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); +TRACE_EVENT(nfsd_dirent, + TP_PROTO(struct svc_fh *fhp, + u64 ino, + const char *name, + int namlen), + TP_ARGS(fhp, ino, name, namlen), + TP_STRUCT__entry( + __field(u32, fh_hash) + __field(u64, ino) + __field(int, len) + __dynamic_array(unsigned char, name, namlen) + ), + TP_fast_assign( + __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; + __entry->ino = ino; + __entry->len = namlen; + memcpy(__get_str(name), name, namlen); + __assign_str(name, name); + ), + TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", + __entry->fh_hash, __entry->ino, + __entry->len, __get_str(name)) +) + #include "state.h" #include "filecache.h" #include "vfs.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd6be35a1642..15adf1f6ab21 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1968,8 +1968,9 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, return 0; } -static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, + nfsd_filldir_t func, struct readdir_cd *cdp, + loff_t *offsetp) { struct buffered_dirent *de; int host_err; @@ -2015,6 +2016,8 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, if (cdp->err != nfs_ok) break; + trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen); + reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; @@ -2062,7 +2065,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, goto out_close; } - err = nfsd_buffered_readdir(file, func, cdp, offsetp); + err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp); if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a2442ebe5acf..b21b76e6b9a8 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -152,7 +152,7 @@ static inline void fh_drop_write(struct svc_fh *fh) } } -static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 3018b52b6d5e..f45b4bc93f52 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -94,6 +94,7 @@ struct nfsd_diropres { struct nfsd_readlinkres { __be32 status; int len; + struct page *page; }; struct nfsd_readres { @@ -101,17 +102,20 @@ struct nfsd_readres { struct svc_fh fh; unsigned long count; struct kstat stat; + struct page **pages; }; struct nfsd_readdirres { + /* Components of the reply */ __be32 status; int count; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + unsigned int cookie_offset; }; struct nfsd_statfsres { @@ -147,23 +151,26 @@ int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfssvc_encode_stat(struct svc_rqst *, __be32 *); -int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfssvc_encode_statres(struct svc_rqst *, __be32 *); +int nfssvc_encode_attrstatres(struct svc_rqst *, __be32 *); int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *); int nfssvc_encode_readres(struct svc_rqst *, __be32 *); int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *); int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *); -int nfssvc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int); +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset); +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); void nfssvc_release_attrstat(struct svc_rqst *rqstp); void nfssvc_release_diropres(struct svc_rqst *rqstp); void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 3e1578953f54..933008382bbe 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -137,6 +137,7 @@ struct nfsd3_readlinkres { __be32 status; struct svc_fh fh; __u32 len; + struct page **pages; }; struct nfsd3_readres { @@ -144,6 +145,7 @@ struct nfsd3_readres { struct svc_fh fh; unsigned long count; __u32 eof; + struct page **pages; }; struct nfsd3_writeres { @@ -167,19 +169,17 @@ struct nfsd3_linkres { }; struct nfsd3_readdirres { + /* Components of the reply */ __be32 status; struct svc_fh fh; - /* Just to save kmalloc on every readdirplus entry (svc_fh is a - * little large for the stack): */ - struct svc_fh scratch; - int count; __be32 verf[2]; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; + struct svc_fh scratch; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; - __be32 * offset1; + unsigned int cookie_offset; struct svc_rqst * rqstp; }; @@ -280,9 +280,9 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); -int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfs3svc_encode_getattrres(struct svc_rqst *, __be32 *); int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); -int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_lookupres(struct svc_rqst *, __be32 *); int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *); int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *); int nfs3svc_encode_readres(struct svc_rqst *, __be32 *); @@ -298,15 +298,16 @@ int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *); void nfs3svc_release_fhandle(struct svc_rqst *); void nfs3svc_release_fhandle2(struct svc_rqst *); -int nfs3svc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -int nfs3svc_encode_entry_plus(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); + +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset); +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); /* Helper functions for NFSv3 ACL code */ -__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, - struct svc_fh *fhp); bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c300885ae75d..fe540a3415c6 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -698,7 +698,7 @@ struct nfsd4_compoundargs { struct nfsd4_compoundres { /* scratch variables for XDR encode */ - struct xdr_stream xdr; + struct xdr_stream *xdr; struct svc_rqst * rqstp; u32 taglen; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index e1bd592ce700..7cf765258fda 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -148,6 +148,8 @@ const struct inode_operations nilfs_file_inode_operations = { .setattr = nilfs_setattr, .permission = nilfs_permission, .fiemap = nilfs_fiemap, + .fileattr_get = nilfs_fileattr_get, + .fileattr_set = nilfs_fileattr_set, }; /* end of file */ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b053b40315bf..3fcb9357bbfd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -16,6 +16,7 @@ #include <linux/compat.h> /* compat_ptr() */ #include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */ #include <linux/buffer_head.h> +#include <linux/fileattr.h> #include "nilfs.h" #include "segment.h" #include "bmap.h" @@ -113,51 +114,39 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, } /** - * nilfs_ioctl_getflags - ioctl to support lsattr + * nilfs_fileattr_get - ioctl to support lsattr */ -static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) +int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; + struct inode *inode = d_inode(dentry); - return put_user(flags, (int __user *)argp); + fileattr_fill_flags(fa, NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE); + + return 0; } /** - * nilfs_ioctl_setflags - ioctl to support chattr + * nilfs_fileattr_set - ioctl to support chattr */ -static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, - void __user *argp) +int nilfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; unsigned int flags, oldflags; int ret; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - if (get_user(flags, (int __user *)argp)) - return -EFAULT; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - flags = nilfs_mask_flags(inode->i_mode, flags); - - inode_lock(inode); - - oldflags = NILFS_I(inode)->i_flags; + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - ret = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (ret) - goto out; + flags = nilfs_mask_flags(inode->i_mode, fa->flags); ret = nilfs_transaction_begin(inode->i_sb, &ti, 0); if (ret) - goto out; + return ret; - NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) | - (flags & FS_FL_USER_MODIFIABLE); + oldflags = NILFS_I(inode)->i_flags & ~FS_FL_USER_MODIFIABLE; + NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE); nilfs_set_inode_flags(inode); inode->i_ctime = current_time(inode); @@ -165,11 +154,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_mark_inode_dirty(inode); - ret = nilfs_transaction_commit(inode->i_sb); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; + return nilfs_transaction_commit(inode->i_sb); } /** @@ -1282,10 +1267,6 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return nilfs_ioctl_getflags(inode, argp); - case FS_IOC_SETFLAGS: - return nilfs_ioctl_setflags(inode, filp, argp); case FS_IOC_GETVERSION: return nilfs_ioctl_getversion(inode, argp); case NILFS_IOCTL_CHANGE_CPMODE: @@ -1331,12 +1312,6 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index ecace5f96a95..91eebeb0c48b 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -440,10 +440,9 @@ static struct dentry *nilfs_get_parent(struct dentry *child) { unsigned long ino; struct inode *inode; - struct qstr dotdot = QSTR_INIT("..", 2); struct nilfs_root *root; - ino = nilfs_inode_by_name(d_inode(child), &dotdot); + ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); if (!ino) return ERR_PTR(-ENOENT); @@ -552,6 +551,8 @@ const struct inode_operations nilfs_dir_inode_operations = { .setattr = nilfs_setattr, .permission = nilfs_permission, .fiemap = nilfs_fiemap, + .fileattr_get = nilfs_fileattr_get, + .fileattr_set = nilfs_fileattr_set, }; const struct inode_operations nilfs_special_inode_operations = { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index c4a45a081ade..60b21b6eeac0 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -243,6 +243,9 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ +int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); +int nilfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long nilfs_ioctl(struct file *, unsigned int, unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 1192c9953620..057abd2cf887 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -14,6 +14,7 @@ #include <linux/audit.h> #include <linux/sched/mm.h> #include <linux/statfs.h> +#include <linux/stringhash.h> #include "fanotify.h" @@ -22,12 +23,24 @@ static bool fanotify_path_equal(struct path *p1, struct path *p2) return p1->mnt == p2->mnt && p1->dentry == p2->dentry; } +static unsigned int fanotify_hash_path(const struct path *path) +{ + return hash_ptr(path->dentry, FANOTIFY_EVENT_HASH_BITS) ^ + hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); +} + static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, __kernel_fsid_t *fsid2) { return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; } +static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) +{ + return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ + hash_32(fsid->val[1], FANOTIFY_EVENT_HASH_BITS); +} + static bool fanotify_fh_equal(struct fanotify_fh *fh1, struct fanotify_fh *fh2) { @@ -38,6 +51,16 @@ static bool fanotify_fh_equal(struct fanotify_fh *fh1, !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len); } +static unsigned int fanotify_hash_fh(struct fanotify_fh *fh) +{ + long salt = (long)fh->type | (long)fh->len << 8; + + /* + * full_name_hash() works long by long, so it handles fh buf optimally. + */ + return full_name_hash((void *)salt, fanotify_fh_buf(fh), fh->len); +} + static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1, struct fanotify_fid_event *ffe2) { @@ -88,16 +111,12 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, return fanotify_info_equal(info1, info2); } -static bool fanotify_should_merge(struct fsnotify_event *old_fsn, - struct fsnotify_event *new_fsn) +static bool fanotify_should_merge(struct fanotify_event *old, + struct fanotify_event *new) { - struct fanotify_event *old, *new; - - pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); - old = FANOTIFY_E(old_fsn); - new = FANOTIFY_E(new_fsn); + pr_debug("%s: old=%p new=%p\n", __func__, old, new); - if (old_fsn->objectid != new_fsn->objectid || + if (old->hash != new->hash || old->type != new->type || old->pid != new->pid) return false; @@ -129,14 +148,20 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn, return false; } +/* Limit event merges to limit CPU overhead per event */ +#define FANOTIFY_MAX_MERGE_EVENTS 128 + /* and the list better be locked by something too! */ -static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) +static int fanotify_merge(struct fsnotify_group *group, + struct fsnotify_event *event) { - struct fsnotify_event *test_event; - struct fanotify_event *new; + struct fanotify_event *old, *new = FANOTIFY_E(event); + unsigned int bucket = fanotify_event_hash_bucket(group, new); + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; + int i = 0; - pr_debug("%s: list=%p event=%p\n", __func__, list, event); - new = FANOTIFY_E(event); + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, bucket); /* * Don't merge a permission event with any other event so that we know @@ -146,9 +171,11 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) if (fanotify_is_perm_event(new->mask)) return 0; - list_for_each_entry_reverse(test_event, list, list) { - if (fanotify_should_merge(test_event, event)) { - FANOTIFY_E(test_event)->mask |= new->mask; + hlist_for_each_entry(old, hlist, merge_list) { + if (++i > FANOTIFY_MAX_MERGE_EVENTS) + break; + if (fanotify_should_merge(old, new)) { + old->mask |= new->mask; return 1; } } @@ -184,8 +211,11 @@ static int fanotify_get_response(struct fsnotify_group *group, return ret; } /* Event not yet reported? Just remove it. */ - if (event->state == FAN_EVENT_INIT) + if (event->state == FAN_EVENT_INIT) { fsnotify_remove_queued_event(group, &event->fae.fse); + /* Permission events are not supposed to be hashed */ + WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list)); + } /* * Event may be also answered in case signal delivery raced * with wakeup. In that case we have nothing to do besides @@ -329,7 +359,8 @@ static int fanotify_encode_fh_len(struct inode *inode) * Return 0 on failure to encode. */ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, - unsigned int fh_len, gfp_t gfp) + unsigned int fh_len, unsigned int *hash, + gfp_t gfp) { int dwords, type = 0; char *ext_buf = NULL; @@ -372,6 +403,9 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; + /* Mix fh into event merge key */ + *hash ^= fanotify_hash_fh(fh); + return FANOTIFY_FH_HDR_LEN + fh_len; out_err: @@ -425,6 +459,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, } static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, + unsigned int *hash, gfp_t gfp) { struct fanotify_path_event *pevent; @@ -435,6 +470,7 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH; pevent->path = *path; + *hash ^= fanotify_hash_path(path); path_get(path); return &pevent->fae; @@ -460,6 +496,7 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, __kernel_fsid_t *fsid, + unsigned int *hash, gfp_t gfp) { struct fanotify_fid_event *ffe; @@ -470,16 +507,18 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, ffe->fae.type = FANOTIFY_EVENT_TYPE_FID; ffe->fsid = *fsid; + *hash ^= fanotify_hash_fsid(fsid); fanotify_encode_fh(&ffe->object_fh, id, fanotify_encode_fh_len(id), - gfp); + hash, gfp); return &ffe->fae; } static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, __kernel_fsid_t *fsid, - const struct qstr *file_name, + const struct qstr *name, struct inode *child, + unsigned int *hash, gfp_t gfp) { struct fanotify_name_event *fne; @@ -492,24 +531,30 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len; if (child_fh_len) size += FANOTIFY_FH_HDR_LEN + child_fh_len; - if (file_name) - size += file_name->len + 1; + if (name) + size += name->len + 1; fne = kmalloc(size, gfp); if (!fne) return NULL; fne->fae.type = FANOTIFY_EVENT_TYPE_FID_NAME; fne->fsid = *fsid; + *hash ^= fanotify_hash_fsid(fsid); info = &fne->info; fanotify_info_init(info); dfh = fanotify_info_dir_fh(info); - info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, 0); + info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0); if (child_fh_len) { ffh = fanotify_info_file_fh(info); - info->file_fh_totlen = fanotify_encode_fh(ffh, child, child_fh_len, 0); + info->file_fh_totlen = fanotify_encode_fh(ffh, child, + child_fh_len, hash, 0); + } + if (name) { + long salt = name->len; + + fanotify_info_copy_name(info, name); + *hash ^= full_name_hash((void *)salt, name->name, name->len); } - if (file_name) - fanotify_info_copy_name(info, file_name); pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", __func__, id->i_ino, size, dir_fh_len, child_fh_len, @@ -533,6 +578,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct mem_cgroup *old_memcg; struct inode *child = NULL; bool name_event = false; + unsigned int hash = 0; + bool ondir = mask & FAN_ONDIR; + struct pid *pid; if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { /* @@ -540,8 +588,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, * report the child fid for events reported on a non-dir child * in addition to reporting the parent fid and maybe child name. */ - if ((fid_mode & FAN_REPORT_FID) && - id != dirid && !(mask & FAN_ONDIR)) + if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir) child = id; id = dirid; @@ -562,8 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (!(fid_mode & FAN_REPORT_NAME)) { name_event = !!child; file_name = NULL; - } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - !(mask & FAN_ONDIR)) { + } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) { name_event = true; } } @@ -586,26 +632,25 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event = fanotify_alloc_perm_event(path, gfp); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, - gfp); + &hash, gfp); } else if (fid_mode) { - event = fanotify_alloc_fid_event(id, fsid, gfp); + event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); } else { - event = fanotify_alloc_path_event(path, gfp); + event = fanotify_alloc_path_event(path, &hash, gfp); } if (!event) goto out; - /* - * Use the victim inode instead of the watching inode as the id for - * event queue, so event reported on parent is merged with event - * reported on child when both directory and child watches exist. - */ - fanotify_init_event(event, (unsigned long)id, mask); if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) - event->pid = get_pid(task_pid(current)); + pid = get_pid(task_pid(current)); else - event->pid = get_pid(task_tgid(current)); + pid = get_pid(task_tgid(current)); + + /* Mix event info, FAN_ONDIR flag and pid into event merge key */ + hash ^= hash_long((unsigned long)pid | ondir, FANOTIFY_EVENT_HASH_BITS); + fanotify_init_event(event, hash, mask); + event->pid = pid; out: set_active_memcg(old_memcg); @@ -645,6 +690,24 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) return fsid; } +/* + * Add an event to hash table for faster merge. + */ +static void fanotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) +{ + struct fanotify_event *event = FANOTIFY_E(fsn_event); + unsigned int bucket = fanotify_event_hash_bucket(group, event); + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; + + assert_spin_locked(&group->notification_lock); + + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, bucket); + + hlist_add_head(&event->merge_list, hlist); +} + static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, @@ -715,7 +778,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } fsn_event = &event->fse; - ret = fsnotify_add_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge, + fanotify_is_hashed_event(mask) ? + fanotify_insert_event : NULL); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); @@ -736,11 +801,10 @@ finish: static void fanotify_free_group_priv(struct fsnotify_group *group) { - struct user_struct *user; - - user = group->fanotify_data.user; - atomic_dec(&user->fanotify_listeners); - free_uid(user); + kfree(group->fanotify_data.merge_hash); + if (group->fanotify_data.ucounts) + dec_ucount(group->fanotify_data.ucounts, + UCOUNT_FANOTIFY_GROUPS); } static void fanotify_free_path_event(struct fanotify_event *event) @@ -796,6 +860,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) } } +static void fanotify_freeing_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) +{ + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) + dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS); +} + static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { kmem_cache_free(fanotify_mark_cache, fsn_mark); @@ -805,5 +876,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .freeing_mark = fanotify_freeing_mark, .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 896c819a1786..4a5e555dc3d2 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -3,6 +3,7 @@ #include <linux/path.h> #include <linux/slab.h> #include <linux/exportfs.h> +#include <linux/hashtable.h> extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; @@ -115,6 +116,11 @@ static inline void fanotify_info_init(struct fanotify_info *info) info->name_len = 0; } +static inline unsigned int fanotify_info_len(struct fanotify_info *info) +{ + return info->dir_fh_totlen + info->file_fh_totlen + info->name_len; +} + static inline void fanotify_info_copy_name(struct fanotify_info *info, const struct qstr *name) { @@ -135,19 +141,31 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ + __FANOTIFY_EVENT_TYPE_NUM }; +#define FANOTIFY_EVENT_TYPE_BITS \ + (ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1) +#define FANOTIFY_EVENT_HASH_BITS \ + (32 - FANOTIFY_EVENT_TYPE_BITS) + struct fanotify_event { struct fsnotify_event fse; + struct hlist_node merge_list; /* List for hashed merge */ u32 mask; - enum fanotify_event_type type; + struct { + unsigned int type : FANOTIFY_EVENT_TYPE_BITS; + unsigned int hash : FANOTIFY_EVENT_HASH_BITS; + }; struct pid *pid; }; static inline void fanotify_init_event(struct fanotify_event *event, - unsigned long id, u32 mask) + unsigned int hash, u32 mask) { - fsnotify_init_event(&event->fse, id); + fsnotify_init_event(&event->fse); + INIT_HLIST_NODE(&event->merge_list); + event->hash = hash; event->mask = mask; event->pid = NULL; } @@ -284,3 +302,25 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) else return NULL; } + +/* + * Use 128 size hash table to speed up events merge. + */ +#define FANOTIFY_HTABLE_BITS (7) +#define FANOTIFY_HTABLE_SIZE (1 << FANOTIFY_HTABLE_BITS) +#define FANOTIFY_HTABLE_MASK (FANOTIFY_HTABLE_SIZE - 1) + +/* + * Permission events and overflow event do not get merged - don't hash them. + */ +static inline bool fanotify_is_hashed_event(u32 mask) +{ + return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); +} + +static inline unsigned int fanotify_event_hash_bucket( + struct fsnotify_group *group, + struct fanotify_event *event) +{ + return event->hash & FANOTIFY_HTABLE_MASK; +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9e0c1afac8bd..71fefb30e015 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -27,8 +27,61 @@ #include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 -#define FANOTIFY_DEFAULT_MAX_MARKS 8192 -#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_GROUPS 128 + +/* + * Legacy fanotify marks limits (8192) is per group and we introduced a tunable + * limit of marks per user, similar to inotify. Effectively, the legacy limit + * of fanotify marks per user is <max marks per group> * <max groups per user>. + * This default limit (1M) also happens to match the increased limit of inotify + * max_user_watches since v5.10. + */ +#define FANOTIFY_DEFAULT_MAX_USER_MARKS \ + (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) + +/* + * Most of the memory cost of adding an inode mark is pinning the marked inode. + * The size of the filesystem inode struct is not uniform across filesystems, + * so double the size of a VFS inode is used as a conservative approximation. + */ +#define INODE_MARK_COST (2 * sizeof(struct inode)) + +/* configurable via /proc/sys/fs/fanotify/ */ +static int fanotify_max_queued_events __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +struct ctl_table fanotify_table[] = { + { + .procname = "max_user_groups", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "max_user_marks", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "max_queued_events", + .data = &fanotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO + }, + { } +}; +#endif /* CONFIG_SYSCTL */ /* * All flags that may be specified in parameter event_f_flags of fanotify_init. @@ -90,6 +143,23 @@ static int fanotify_event_info_len(unsigned int fid_mode, } /* + * Remove an hashed event from merge hash table. + */ +static void fanotify_unhash_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + assert_spin_locked(&group->notification_lock); + + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, fanotify_event_hash_bucket(group, event)); + + if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) + return; + + hlist_del_init(&event->merge_list); +} + +/* * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When permission event is dequeued, its state is @@ -100,26 +170,34 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, { size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; + struct fsnotify_event *fsn_event; unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); spin_lock(&group->notification_lock); - if (fsnotify_notify_queue_is_empty(group)) + fsn_event = fsnotify_peek_first_event(group); + if (!fsn_event) goto out; - if (fid_mode) { - event_size += fanotify_event_info_len(fid_mode, - FANOTIFY_E(fsnotify_peek_first_event(group))); - } + event = FANOTIFY_E(fsn_event); + if (fid_mode) + event_size += fanotify_event_info_len(fid_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); goto out; } - event = FANOTIFY_E(fsnotify_remove_first_event(group)); + + /* + * Held the notification_lock the whole time, so this is the + * same event we peeked above. + */ + fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; + if (fanotify_is_hashed_event(event->mask)) + fanotify_unhash_event(group, event); out: spin_unlock(&group->notification_lock); return event; @@ -341,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); + /* + * For an unprivileged listener, event->pid can be used to identify the + * events generated by the listener process itself, without disclosing + * the pids of other processes. + */ + if (!capable(CAP_SYS_ADMIN) && + task_tgid(current) != event->pid) + metadata.pid = 0; if (path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); @@ -573,6 +659,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since @@ -601,13 +688,12 @@ static int fanotify_release(struct inode *ignored, struct file *file) * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ - while (!fsnotify_notify_queue_is_empty(group)) { - struct fanotify_event *event; + while ((fsn_event = fsnotify_remove_first_event(group))) { + struct fanotify_event *event = FANOTIFY_E(fsn_event); - event = FANOTIFY_E(fsnotify_remove_first_event(group)); if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); - fsnotify_destroy_event(group, &event->fse); + fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW); @@ -822,24 +908,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, unsigned int type, __kernel_fsid_t *fsid) { + struct ucounts *ucounts = group->fanotify_data.ucounts; struct fsnotify_mark *mark; int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + /* + * Enforce per user marks limits per user in all containing user ns. + * A group with FAN_UNLIMITED_MARKS does not contribute to mark count + * in the limited groups account. + */ + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && + !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) - return ERR_PTR(-ENOMEM); + if (!mark) { + ret = -ENOMEM; + goto out_dec_ucounts; + } fsnotify_init_mark(mark, group); ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); if (ret) { fsnotify_put_mark(mark); - return ERR_PTR(ret); + goto out_dec_ucounts; } return mark; + +out_dec_ucounts: + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) + dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); + return ERR_PTR(ret); } @@ -919,20 +1019,41 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void) return &oevent->fse; } +static struct hlist_head *fanotify_alloc_merge_hash(void) +{ + struct hlist_head *hash; + + hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, + GFP_KERNEL_ACCOUNT); + if (!hash) + return NULL; + + __hash_init(hash, FANOTIFY_HTABLE_SIZE); + + return hash; +} + /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct fsnotify_group *group; int f_flags, fd; - struct user_struct *user; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!capable(CAP_SYS_ADMIN)) { + /* + * An unprivileged user can setup an fanotify group with + * limited functionality - an unprivileged group is limited to + * notification events with file handles and it cannot use + * unlimited queue/marks. + */ + if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) + return -EPERM; + } #ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) @@ -963,12 +1084,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; - user = get_current_user(); - if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { - free_uid(user); - return -EMFILE; - } - f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -978,15 +1093,27 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); if (IS_ERR(group)) { - free_uid(user); return PTR_ERR(group); } - group->fanotify_data.user = user; + /* Enforce groups limits per user in all containing user ns */ + group->fanotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_FANOTIFY_GROUPS); + if (!group->fanotify_data.ucounts) { + fd = -EMFILE; + goto out_destroy_group; + } + group->fanotify_data.flags = flags; - atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); + group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); + if (!group->fanotify_data.merge_hash) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = fanotify_alloc_overflow_event(); if (unlikely(!group->overflow_event)) { fd = -ENOMEM; @@ -1019,16 +1146,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; group->max_events = UINT_MAX; } else { - group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + group->max_events = fanotify_max_queued_events; } if (flags & FAN_UNLIMITED_MARKS) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out_destroy_group; - group->fanotify_data.max_marks = UINT_MAX; - } else { - group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; } if (flags & FAN_ENABLE_AUDIT) { @@ -1126,7 +1250,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __func__, fanotify_fd, flags, dfd, pathname, mask); /* we only use the lower 32 bits as of right now. */ - if (mask & ((__u64)0xffffffff << 32)) + if (upper_32_bits(mask)) return -EINVAL; if (flags & ~FANOTIFY_MARK_FLAGS) @@ -1181,6 +1305,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group = f.file->private_data; /* + * An unprivileged user is not allowed to watch a mount point nor + * a filesystem. + */ + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN) && + mark_type != FAN_MARK_INODE) + goto fput_and_out; + + /* * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not * allowed to set permissions events. */ @@ -1312,6 +1445,21 @@ SYSCALL32_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { + struct sysinfo si; + int max_marks; + + si_meminfo(&si); + /* + * Allow up to 1% of addressable memory to be accounted for per user + * marks limited to the range [8192, 1048576]. mount and sb marks are + * a lot cheaper than inode marks, but there is no reason for a user + * to have many of those, so calculate by the cost of inode marks. + */ + max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / + INODE_MARK_COST; + max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, + FANOTIFY_DEFAULT_MAX_USER_MARKS); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); @@ -1326,6 +1474,11 @@ static int __init fanotify_user_setup(void) KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = + FANOTIFY_DEFAULT_MAX_GROUPS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + return 0; } device_initcall(fanotify_user_setup); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..a712b2aaa9ac 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -144,7 +144,8 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f) struct fsnotify_group *group = f->private_data; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - group->fanotify_data.flags, group->fanotify_data.f_flags); + group->fanotify_data.flags, + group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } diff --git a/fs/notify/group.c b/fs/notify/group.c index ffd723ffe46d..fb89c351295d 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -122,7 +122,6 @@ static struct fsnotify_group *__fsnotify_alloc_group( /* set to 0 when there a no external references to this group */ refcount_set(&group->refcnt, 1); - atomic_set(&group->num_marks, 0); atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1901d799909b..d1a64daa0171 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -46,9 +46,10 @@ static bool event_compare(struct fsnotify_event *old_fsn, return false; } -static int inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct fsnotify_group *group, + struct fsnotify_event *event) { + struct list_head *list = &group->notification_list; struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); @@ -107,7 +108,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, mask &= ~IN_ISDIR; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, 0); + fsnotify_init_event(fsn_event); event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; @@ -115,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, if (len) strcpy(event->name, name->name); - ret = fsnotify_add_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index c71be4fb7dc5..98f61b31745a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -146,10 +146,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = sizeof(struct inotify_event); struct fsnotify_event *event; - if (fsnotify_notify_queue_is_empty(group)) - return NULL; - event = fsnotify_peek_first_event(group); + if (!event) + return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -642,7 +641,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, 0); + fsnotify_init_event(group->overflow_event); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 8387937b9d01..d32ab349db74 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -391,8 +391,6 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) list_del_init(&mark->g_list); spin_unlock(&mark->lock); - atomic_dec(&group->num_marks); - /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } @@ -656,7 +654,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); - atomic_inc(&group->num_marks); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); @@ -674,7 +671,6 @@ err: FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); - atomic_dec(&group->num_marks); fsnotify_put_mark(mark); return ret; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 75d79d6d3ef0..32f45543b9c6 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -47,13 +47,6 @@ u32 fsnotify_get_cookie(void) } EXPORT_SYMBOL_GPL(fsnotify_get_cookie); -/* return true if the notify queue is empty, false otherwise */ -bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) -{ - assert_spin_locked(&group->notification_lock); - return list_empty(&group->notification_list) ? true : false; -} - void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { @@ -75,16 +68,22 @@ void fsnotify_destroy_event(struct fsnotify_group *group, } /* - * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. The function returns 0 if the event was - * added to the queue, 1 if the event was merged with some other queued event, + * Try to add an event to the notification queue. + * The group can later pull this event off the queue to deal with. + * The group can use the @merge hook to merge the event with a queued event. + * The group can use the @insert hook to insert the event into hash table. + * The function returns: + * 0 if the event was added to a queue + * 1 if the event was merged with some other queued event * 2 if the event was not queued - either the queue of events has overflown - * or the group is shutting down. + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -111,7 +110,7 @@ int fsnotify_add_event(struct fsnotify_group *group, } if (!list_empty(list) && merge) { - ret = merge(list, event); + ret = merge(group, event); if (ret) { spin_unlock(&group->notification_lock); return ret; @@ -121,6 +120,8 @@ int fsnotify_add_event(struct fsnotify_group *group, queue: group->q_len++; list_add_tail(&event->list, list); + if (insert) + insert(group, event); spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); @@ -141,33 +142,36 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group, } /* - * Remove and return the first event from the notification list. It is the - * responsibility of the caller to destroy the obtained event + * Return the first event on the notification list without removing it. + * Returns NULL if the list is empty. */ -struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - assert_spin_locked(&group->notification_lock); - pr_debug("%s: group=%p\n", __func__, group); + if (fsnotify_notify_queue_is_empty(group)) + return NULL; - event = list_first_entry(&group->notification_list, - struct fsnotify_event, list); - fsnotify_remove_queued_event(group, event); - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* - * This will not remove the event, that must be done with - * fsnotify_remove_first_event() + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { - assert_spin_locked(&group->notification_lock); + struct fsnotify_event *event = fsnotify_peek_first_event(group); - return list_first_entry(&group->notification_list, - struct fsnotify_event, list); + if (!event) + return NULL; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + fsnotify_remove_queued_event(group, event); + + return event; } /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3bfb4147895a..ad20403b383f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; loff_t end = offset + bytes; - int ret = 0, credits = 0, locked = 0; + int ret = 0, credits = 0; ocfs2_init_dealloc_ctxt(&dealloc); @@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode, !dwc->dw_orphaned) goto out; - /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we - * are in that context. */ - if (dwc->dw_writer_pid != task_pid_nr(current)) { - inode_lock(inode); - locked = 1; - } - ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); @@ -2393,8 +2386,6 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); - if (locked) - inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); return ret; diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 6e07ddb0e3c0..dabfef9c2bc0 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -229,7 +229,7 @@ static int blockcheck_u64_get(void *data, u64 *val) *val = *(u64 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4b566e88582f..afc51736686c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -126,13 +126,6 @@ static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } -static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) -{ - spin_lock(&dlm->spinlock); - __dlm_reset_recovery(dlm); - spin_unlock(&dlm->spinlock); -} - /* Worker function used during recovery. */ void dlm_dispatch_work(struct work_struct *work) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8e3a369086db..0fbe8bf7190f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2204,7 +2204,7 @@ static void ocfs2_unpack_timespec(struct timespec64 *spec, spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; } -static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +static int ocfs2_refresh_inode_from_lvb(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; @@ -2213,6 +2213,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_meta_lvb(0, lockres); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode))) + return -ESTALE; /* We're safe here without the lockres lock... */ spin_lock(&oi->ip_lock); @@ -2240,6 +2242,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) ocfs2_unpack_timespec(&inode->i_ctime, be64_to_cpu(lvb->lvb_ictime_packed)); spin_unlock(&oi->ip_lock); + return 0; } static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, @@ -2342,7 +2345,8 @@ static int ocfs2_inode_lock_update(struct inode *inode, if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); - ocfs2_refresh_inode_from_lvb(inode); + status = ocfs2_refresh_inode_from_lvb(inode); + goto bail_refresh; } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ @@ -2352,6 +2356,10 @@ static int ocfs2_inode_lock_update(struct inode *inode, goto bail_refresh; } fe = (struct ocfs2_dinode *) (*bh)->b_data; + if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) { + status = -ESTALE; + goto bail_refresh; + } /* This is a good chance to make sure we're not * locking an invalid object. ocfs2_read_inode_block() diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6611c64ca0be..db8a6265b749 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto bail_unlock; } } + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } } @@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, bail_commit: ocfs2_commit_trans(osb, handle); +bail_unlock_alloc: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail_unlock: if (status && inode_locked) { ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); @@ -2645,6 +2649,8 @@ const struct inode_operations ocfs2_file_iops = { .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, + .fileattr_get = ocfs2_fileattr_get, + .fileattr_set = ocfs2_fileattr_set, }; const struct inode_operations ocfs2_special_file_iops = { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 50c9b30ee9f6..f59461d85da4 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/fileattr.h> #include <cluster/masklog.h> @@ -61,8 +62,10 @@ static inline int o2info_coherent(struct ocfs2_info_request *req) return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } -static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) +int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); + unsigned int flags; int status; status = ocfs2_inode_lock(inode, NULL, 0); @@ -71,15 +74,19 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); - *flags = OCFS2_I(inode)->ip_attr; + flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); + fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE); + return status; } -static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, - unsigned mask) +int ocfs2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); + unsigned int flags = fa->flags; struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -87,7 +94,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - inode_lock(inode); + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -95,19 +103,17 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, goto bail; } - status = -EACCES; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto bail_unlock; - if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; oldflags = ocfs2_inode->ip_attr; - flags = flags & mask; - flags |= oldflags & ~mask; + flags = flags & OCFS2_FL_MODIFIABLE; + flags |= oldflags & ~OCFS2_FL_MODIFIABLE; - status = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (status) + /* Check already done by VFS, but repeat with ocfs lock */ + status = -EPERM; + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && + !capable(CAP_LINUX_IMMUTABLE)) goto bail_unlock; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); @@ -129,8 +135,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - inode_unlock(inode); - brelse(bh); return status; @@ -836,7 +840,6 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - unsigned int flags; int new_clusters; int status; struct ocfs2_space_resv sr; @@ -849,24 +852,6 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - case OCFS2_IOC_GETFLAGS: - status = ocfs2_get_inode_attr(inode, &flags); - if (status < 0) - return status; - - flags &= OCFS2_FL_VISIBLE; - return put_user(flags, (int __user *) arg); - case OCFS2_IOC_SETFLAGS: - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - status = mnt_want_write_file(filp); - if (status) - return status; - status = ocfs2_set_inode_attr(inode, flags, - OCFS2_FL_MODIFIABLE); - mnt_drop_write_file(filp); - return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: @@ -959,12 +944,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - case OCFS2_IOC32_GETFLAGS: - cmd = OCFS2_IOC_GETFLAGS; - break; - case OCFS2_IOC32_SETFLAGS: - cmd = OCFS2_IOC_SETFLAGS; - break; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 9f5e4d95e37f..0297c8846945 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -11,6 +11,9 @@ #ifndef OCFS2_IOCTL_PROTO_H #define OCFS2_IOCTL_PROTO_H +int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ocfs2_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3abdd36da2e2..05ced86580d1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -50,6 +50,7 @@ #include "xattr.h" #include "acl.h" #include "ocfs2_trace.h" +#include "ioctl.h" #include "buffer_head_io.h" @@ -2918,4 +2919,6 @@ const struct inode_operations ocfs2_dir_iops = { .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, + .fileattr_get = ocfs2_fileattr_get, + .fileattr_set = ocfs2_fileattr_set, }; diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index d7b31734f6be..273616bd4f19 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -13,14 +13,6 @@ #define OCFS2_IOCTL_H /* - * ioctl commands - */ -#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS -#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS -#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS - -/* * Space reservation / allocation / free ioctls and argument structure * are designed to be compatible with XFS. * diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index dbf8b5735808..f70012038383 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -59,31 +59,31 @@ static inline int mode_to_o2dlm(int mode) return mode; } -#define map_flag(_generic, _o2dlm) \ - if (flags & (_generic)) { \ - flags &= ~(_generic); \ - o2dlm_flags |= (_o2dlm); \ - } static int flags_to_o2dlm(u32 flags) { int o2dlm_flags = 0; - map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); - map_flag(DLM_LKF_CANCEL, LKM_CANCEL); - map_flag(DLM_LKF_CONVERT, LKM_CONVERT); - map_flag(DLM_LKF_VALBLK, LKM_VALBLK); - map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); - map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); - map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); - map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); - map_flag(DLM_LKF_LOCAL, LKM_LOCAL); - - /* map_flag() should have cleared every flag passed in */ - BUG_ON(flags != 0); + if (flags & DLM_LKF_NOQUEUE) + o2dlm_flags |= LKM_NOQUEUE; + if (flags & DLM_LKF_CANCEL) + o2dlm_flags |= LKM_CANCEL; + if (flags & DLM_LKF_CONVERT) + o2dlm_flags |= LKM_CONVERT; + if (flags & DLM_LKF_VALBLK) + o2dlm_flags |= LKM_VALBLK; + if (flags & DLM_LKF_IVVALBLK) + o2dlm_flags |= LKM_INVVALBLK; + if (flags & DLM_LKF_ORPHAN) + o2dlm_flags |= LKM_ORPHAN; + if (flags & DLM_LKF_FORCEUNLOCK) + o2dlm_flags |= LKM_FORCE; + if (flags & DLM_LKF_TIMEOUT) + o2dlm_flags |= LKM_TIMEOUT; + if (flags & DLM_LKF_LOCAL) + o2dlm_flags |= LKM_LOCAL; return o2dlm_flags; } -#undef map_flag /* * Map an o2dlm status to standard errno values. diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index a191094694c6..8d33ebc6b6fc 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -731,7 +731,7 @@ static void __exit ocfs2_stack_glue_exit(void) } MODULE_AUTHOR("Oracle"); -MODULE_DESCRIPTION("ocfs2 cluter stack glue layer"); +MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 40c8c2e32fa3..f825176ff4ed 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -236,27 +236,31 @@ found: mutex_unlock(&op_mutex); if (IS_ERR(inode)) return ERR_CAST(inode); - ent_oi = OP_I(inode); - ent_oi->type = ent_type; - ent_oi->u = ent_data; - - switch (ent_type) { - case op_inode_node: - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_op = &openprom_inode_operations; - inode->i_fop = &openprom_operations; - set_nlink(inode, 2); - break; - case op_inode_prop: - if (of_node_name_eq(dp, "options") && (len == 17) && - !strncmp (name, "security-password", 17)) - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_prop_ops; - set_nlink(inode, 1); - inode->i_size = ent_oi->u.prop->length; - break; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + set_nlink(inode, 2); + break; + case op_inode_prop: + if (of_node_name_eq(dp, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + else + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &openpromfs_prop_ops; + set_nlink(inode, 1); + inode->i_size = ent_oi->u.prop->length; + break; + } + unlock_new_inode(inode); } return d_splice_alias(inode, dentry); @@ -345,20 +349,9 @@ static void openprom_free_inode(struct inode *inode) static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { - struct inode *inode; - - inode = iget_locked(sb, ino); + struct inode *inode = iget_locked(sb, ino); if (!inode) - return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - if (inode->i_ino == OPENPROM_ROOT_INO) { - inode->i_op = &openprom_inode_operations; - inode->i_fop = &openprom_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - } - unlock_new_inode(inode); - } + inode = ERR_PTR(-ENOMEM); return inode; } @@ -394,9 +387,15 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } + root_inode->i_mtime = root_inode->i_atime = + root_inode->i_ctime = current_time(root_inode); + root_inode->i_op = &openprom_inode_operations; + root_inode->i_fop = &openprom_operations; + root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; oi = OP_I(root_inode); oi->type = op_inode_node; oi->u.node = of_find_node_by_path("/"); + unlock_new_inode(root_inode); s->s_root = d_make_root(root_inode); if (!s->s_root) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 9b28a7132466..86810e5d7914 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -248,21 +248,7 @@ populate_shared_memory: * or it can pointers to struct page's */ - /* - * When reading, readahead_size will only be zero when - * we're doing O_DIRECT, otherwise we got here from - * orangefs_readpage. - * - * If we got here from orangefs_readpage we want to - * copy either a page or the whole file into the io - * vector, whichever is smaller. - */ - if (readahead_size) - copy_amount = - min(new_op->downcall.resp.io.amt_complete, - (__s64)PAGE_SIZE); - else - copy_amount = new_op->downcall.resp.io.amt_complete; + copy_amount = new_op->downcall.resp.io.amt_complete; ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, copy_amount); @@ -283,19 +269,11 @@ populate_shared_memory: out: if (buffer_index >= 0) { - if ((readahead_size) && (type == ORANGEFS_IO_READ)) { - /* readpage */ - *index_return = buffer_index; - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: hold on to buffer_index :%d:\n", - __func__, buffer_index); - } else { - /* O_DIRECT */ - orangefs_bufmap_put(buffer_index); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): PUT buffer_index %d\n", - __func__, handle, buffer_index); - } + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + buffer_index = -1; } op_release(new_op); return ret; @@ -375,84 +353,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, return ret; } -static int orangefs_getflags(struct inode *inode, unsigned long *uval) -{ - __u64 val = 0; - int ret; - - ret = orangefs_inode_getxattr(inode, - "user.pvfs2.meta_hint", - &val, sizeof(val)); - if (ret < 0 && ret != -ENODATA) - return ret; - else if (ret == -ENODATA) - val = 0; - *uval = val; - return 0; -} - -/* - * Perform a miscellaneous operation on a file. - */ -static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - int ret = -ENOTTY; - __u64 val = 0; - unsigned long uval; - - gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_ioctl: called with cmd %d\n", - cmd); - - /* - * we understand some general ioctls on files, such as the immutable - * and append flags - */ - if (cmd == FS_IOC_GETFLAGS) { - ret = orangefs_getflags(inode, &uval); - if (ret) - return ret; - gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", - (unsigned long long)uval); - return put_user(uval, (int __user *)arg); - } else if (cmd == FS_IOC_SETFLAGS) { - unsigned long old_uval; - - ret = 0; - if (get_user(uval, (int __user *)arg)) - return -EFAULT; - /* - * ORANGEFS_MIRROR_FL is set internally when the mirroring mode - * is turned on for a file. The user is not allowed to turn - * on this bit, but the bit is present if the user first gets - * the flags and then updates the flags with some new - * settings. So, we ignore it in the following edit. bligon. - */ - if ((uval & ~ORANGEFS_MIRROR_FL) & - (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) { - gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); - return -EINVAL; - } - ret = orangefs_getflags(inode, &old_uval); - if (ret) - return ret; - ret = vfs_ioc_setflags_prepare(inode, old_uval, uval); - if (ret) - return ret; - val = uval; - gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", - (unsigned long long)val); - ret = orangefs_inode_setxattr(inode, - "user.pvfs2.meta_hint", - &val, sizeof(val), 0); - } - - return ret; -} - static vm_fault_t orangefs_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; @@ -657,7 +557,6 @@ const struct file_operations orangefs_file_operations = { .read_iter = orangefs_file_read_iter, .write_iter = orangefs_file_write_iter, .lock = orangefs_lock, - .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, .open = generic_file_open, .splice_read = generic_file_splice_read, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5079cfafa8d7..6bf35a0d61f3 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -11,6 +11,7 @@ */ #include <linux/bvec.h> +#include <linux/fileattr.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" @@ -244,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping, static int orangefs_launder_page(struct page *); +static void orangefs_readahead(struct readahead_control *rac) +{ + loff_t offset; + struct iov_iter iter; + struct file *file = rac->file; + struct inode *inode = file->f_mapping->host; + struct xarray *i_pages; + struct page *page; + loff_t new_start = readahead_pos(rac); + int ret; + size_t new_len = 0; + + loff_t bytes_remaining = inode->i_size - readahead_pos(rac); + loff_t pages_remaining = bytes_remaining / PAGE_SIZE; + + if (pages_remaining >= 1024) + new_len = 4194304; + else if (pages_remaining > readahead_count(rac)) + new_len = bytes_remaining; + + if (new_len) + readahead_expand(rac, new_start, new_len); + + offset = readahead_pos(rac); + i_pages = &file->f_mapping->i_pages; + + iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); + + /* read in the pages. */ + if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, + &offset, &iter, readahead_length(rac), + inode->i_size, NULL, NULL, file)) < 0) + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: wait_for_direct_io failed. \n", __func__); + else + ret = 0; + + /* clean up. */ + while ((page = readahead_page(rac))) { + page_endio(page, false, ret); + put_page(page); + } +} + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; @@ -251,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page) struct bio_vec bv; ssize_t ret; loff_t off; /* offset into this page */ - pgoff_t index; /* which page */ - struct page *next_page; - char *kaddr; - loff_t read_size; - int buffer_index = -1; /* orangefs shared memory slot */ - int slot_index; /* index into slot */ - int remaining; - - /* - * Get up to this many bytes from Orangefs at a time and try - * to fill them into the page cache at once. Tests with dd made - * this seem like a reasonable static number, if there was - * interest perhaps this number could be made setable through - * sysfs... - */ - read_size = 524288; if (PageDirty(page)) orangefs_launder_page(page); off = page_offset(page); - index = off >> PAGE_SHIFT; bv.bv_page = page; bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - read_size, inode->i_size, NULL, &buffer_index, file); - remaining = ret; + PAGE_SIZE, inode->i_size, NULL, NULL, file); /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_page(page); if (ret < 0) { SetPageError(page); - unlock_page(page); - goto out; } else { SetPageUptodate(page); if (PageError(page)) @@ -297,60 +322,7 @@ static int orangefs_readpage(struct file *file, struct page *page) } /* unlock the page after the ->readpage() routine completes */ unlock_page(page); - - if (remaining > PAGE_SIZE) { - slot_index = 0; - while ((remaining - PAGE_SIZE) >= PAGE_SIZE) { - remaining -= PAGE_SIZE; - /* - * It is an optimization to try and fill more than one - * page... by now we've already gotten the single - * page we were after, if stuff doesn't seem to - * be going our way at this point just return - * and hope for the best. - * - * If we look for pages and they're already there is - * one reason to give up, and if they're not there - * and we can't create them is another reason. - */ - - index++; - slot_index++; - next_page = find_get_page(inode->i_mapping, index); - if (next_page) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: found next page, quitting\n", - __func__); - put_page(next_page); - goto out; - } - next_page = find_or_create_page(inode->i_mapping, - index, - GFP_KERNEL); - /* - * I've never hit this, leave it as a printk for - * now so it will be obvious. - */ - if (!next_page) { - printk("%s: can't create next page, quitting\n", - __func__); - goto out; - } - kaddr = kmap_atomic(next_page); - orangefs_bufmap_page_fill(kaddr, - buffer_index, - slot_index); - kunmap_atomic(kaddr); - SetPageUptodate(next_page); - unlock_page(next_page); - put_page(next_page); - } - } - -out: - if (buffer_index != -1) - orangefs_bufmap_put(buffer_index); - return ret; + return ret; } static int orangefs_write_begin(struct file *file, @@ -659,6 +631,7 @@ out: /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, + .readahead = orangefs_readahead, .readpage = orangefs_readpage, .writepages = orangefs_writepages, .set_page_dirty = __set_page_dirty_nobuffers, @@ -954,6 +927,53 @@ int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags return __orangefs_setattr(inode, &iattr); } +static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + u64 val = 0; + int ret; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, + dentry); + + ret = orangefs_inode_getxattr(d_inode(dentry), + "user.pvfs2.meta_hint", + &val, sizeof(val)); + if (ret < 0 && ret != -ENODATA) + return ret; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); + + fileattr_fill_flags(fa, val); + return 0; +} + +static int orangefs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + u64 val = 0; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, + dentry); + /* + * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is + * turned on for a file. The user is not allowed to turn on this bit, + * but the bit is present if the user first gets the flags and then + * updates the flags with some new settings. So, we ignore it in the + * following edit. bligon. + */ + if (fileattr_has_fsx(fa) || + (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) { + gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n", + __func__); + return -EOPNOTSUPP; + } + val = fa->flags; + gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); + return orangefs_inode_setxattr(d_inode(dentry), + "user.pvfs2.meta_hint", + &val, sizeof(val), 0); +} + /* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, @@ -963,6 +983,8 @@ static const struct inode_operations orangefs_file_inode_operations = { .listxattr = orangefs_listxattr, .permission = orangefs_permission, .update_time = orangefs_update_time, + .fileattr_get = orangefs_fileattr_get, + .fileattr_set = orangefs_fileattr_set, }; static int orangefs_init_iops(struct inode *inode) diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 74a3d6337ef4..cd7297815f91 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -31,7 +31,7 @@ static ulong module_parm_debug_mask; __u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; -int orangefs_cache_timeout_msecs = 50; +int orangefs_cache_timeout_msecs = 500; int orangefs_dcache_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50; diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d4b7ae763186..46b7dcff18ac 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -221,7 +221,7 @@ static int orangefs_inode_is_stale(struct inode *inode, * If the inode type or symlink target have changed then this * inode is stale. */ - if (type == -1 || !(inode->i_mode & type)) { + if (type == -1 || inode_wrong_type(inode, type)) { orangefs_make_bad_inode(inode); return 1; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0b2891c6c71e..2846b943e80c 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -932,7 +932,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, static int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; - const struct cred *old_cred = ovl_override_creds(dentry->d_sb); + const struct cred *old_cred; bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); /* @@ -943,6 +943,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) if (WARN_ON(disconnected && d_is_dir(dentry))) return -EIO; + old_cred = ovl_override_creds(dentry->d_sb); while (!err) { struct dentry *next; struct dentry *parent = NULL; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 836f14b9d3a6..93efe7048a77 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1301,4 +1301,6 @@ const struct inode_operations ovl_dir_inode_operations = { .listxattr = ovl_listxattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, + .fileattr_get = ovl_fileattr_get, + .fileattr_set = ovl_fileattr_set, }; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index dbfb35fb0ff7..4d53d3b7e5fe 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -430,20 +430,11 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) if (WARN_ON(file != vma->vm_file)) return -EIO; - vma->vm_file = get_file(realfile); + vma_set_file(vma, realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = call_mmap(vma->vm_file, vma); revert_creds(old_cred); - - if (ret) { - /* Drop reference count from new vm_file value */ - fput(realfile); - } else { - /* Drop reference count from previous vm_file value */ - fput(file); - } - ovl_file_accessed(file); return ret; @@ -491,112 +482,6 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) return ret; } -static long ovl_real_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct fd real; - long ret; - - ret = ovl_real_fdget(file, &real); - if (ret) - return ret; - - ret = security_file_ioctl(real.file, cmd, arg); - if (!ret) { - /* - * Don't override creds, since we currently can't safely check - * permissions before doing so. - */ - ret = vfs_ioctl(real.file, cmd, arg); - } - - fdput(real); - - return ret; -} - -static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, - unsigned long arg) -{ - long ret; - struct inode *inode = file_inode(file); - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(inode); - - /* - * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE - * capability. - */ - ret = -EPERM; - if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) && - !capable(CAP_LINUX_IMMUTABLE)) - goto unlock; - - ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); - if (ret) - goto unlock; - - ret = ovl_real_ioctl(file, cmd, arg); - - ovl_copyflags(ovl_inode_real(inode), inode); -unlock: - inode_unlock(inode); - - mnt_drop_write_file(file); - - return ret; - -} - -long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long ret; - - switch (cmd) { - case FS_IOC_GETFLAGS: - case FS_IOC_FSGETXATTR: - ret = ovl_real_ioctl(file, cmd, arg); - break; - - case FS_IOC_FSSETXATTR: - case FS_IOC_SETFLAGS: - ret = ovl_ioctl_set_flags(file, cmd, arg); - break; - - default: - ret = -ENOTTY; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - - default: - return -ENOIOCTLCMD; - } - - return ovl_ioctl(file, cmd, arg); -} -#endif - enum ovl_copyop { OVL_COPY, OVL_CLONE, @@ -686,6 +571,26 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, remap_flags, op); } +static int ovl_flush(struct file *file, fl_owner_t id) +{ + struct fd real; + const struct cred *old_cred; + int err; + + err = ovl_real_fdget(file, &real); + if (err) + return err; + + if (real.file->f_op->flush) { + old_cred = ovl_override_creds(file_inode(file)->i_sb); + err = real.file->f_op->flush(real.file, id); + revert_creds(old_cred); + } + fdput(real); + + return err; +} + const struct file_operations ovl_file_operations = { .open = ovl_open, .release = ovl_release, @@ -696,10 +601,7 @@ const struct file_operations ovl_file_operations = { .mmap = ovl_mmap, .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, - .unlocked_ioctl = ovl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ovl_compat_ioctl, -#endif + .flush = ovl_flush, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 003cf83bf78a..5e828a1c98a8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -11,6 +11,8 @@ #include <linux/posix_acl.h> #include <linux/ratelimit.h> #include <linux/fiemap.h> +#include <linux/fileattr.h> +#include <linux/security.h> #include "overlayfs.h" @@ -95,7 +97,7 @@ out: return err; } -static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) +static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); @@ -108,21 +110,21 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) * which is friendly to du -x. */ stat->dev = dentry->d_sb->s_dev; - return 0; + return; } else if (xinobits) { /* * All inode numbers of underlying fs should not be using the * high xinobits, so we use high xinobits to partition the * overlay st_ino address space. The high bits holds the fsid * (upper fsid is 0). The lowest xinobit is reserved for mapping - * the non-peresistent inode numbers range in case of overflow. + * the non-persistent inode numbers range in case of overflow. * This way all overlay inode numbers are unique and use the * overlay st_dev. */ if (likely(!(stat->ino >> xinoshift))) { stat->ino |= ((u64)fsid) << (xinoshift + 1); stat->dev = dentry->d_sb->s_dev; - return 0; + return; } else if (ovl_xino_warn(dentry->d_sb)) { pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); @@ -151,8 +153,6 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) */ stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; } - - return 0; } int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, @@ -251,9 +251,7 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, } } - err = ovl_map_dev_ino(dentry, stat, fsid); - if (err) - goto out; + ovl_map_dev_ino(dentry, stat, fsid); /* * It's probably not worth it to count subdirs to get the @@ -408,7 +406,7 @@ static bool ovl_can_list(struct super_block *sb, const char *s) if (ovl_is_private_xattr(sb, s)) return false; - /* List all non-trusted xatts */ + /* List all non-trusted xattrs */ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) return true; @@ -500,6 +498,79 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return err; } +/* + * Work around the fact that security_file_ioctl() takes a file argument. + * Introducing security_inode_fileattr_get/set() hooks would solve this issue + * properly. + */ +static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, + bool set) +{ + struct path realpath; + struct file *file; + unsigned int cmd; + int err; + + ovl_path_real(dentry, &realpath); + file = dentry_open(&realpath, O_RDONLY, current_cred()); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (set) + cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS; + else + cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS; + + err = security_file_ioctl(file, cmd, 0); + fput(file); + + return err; +} + +int ovl_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct dentry *upperdentry; + const struct cred *old_cred; + int err; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry); + if (!err) { + upperdentry = ovl_dentry_upper(dentry); + + old_cred = ovl_override_creds(inode->i_sb); + err = ovl_security_fileattr(dentry, fa, true); + if (!err) + err = vfs_fileattr_set(&init_user_ns, upperdentry, fa); + revert_creds(old_cred); + ovl_copyflags(ovl_inode_real(inode), inode); + } + ovl_drop_write(dentry); +out: + return err; +} + +int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct dentry *realdentry = ovl_dentry_real(dentry); + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(inode->i_sb); + err = ovl_security_fileattr(dentry, fa, false); + if (!err) + err = vfs_fileattr_get(realdentry, fa); + revert_creds(old_cred); + + return err; +} + static const struct inode_operations ovl_file_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, @@ -508,6 +579,8 @@ static const struct inode_operations ovl_file_inode_operations = { .get_acl = ovl_get_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, + .fileattr_get = ovl_fileattr_get, + .fileattr_set = ovl_fileattr_set, }; static const struct inode_operations ovl_symlink_inode_operations = { @@ -538,7 +611,7 @@ static const struct address_space_operations ovl_aops = { * stackable i_mutex locks according to stack level of the super * block instance. An overlayfs instance can never be in stack * depth 0 (there is always a real fs below it). An overlayfs - * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth]. + * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. * * For example, here is a snip from /proc/lockdep_chains after * dir_iterate of nested overlayfs: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 3fe05fb5d145..210cd6f66e28 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -371,7 +371,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, return PTR_ERR(origin); if (upperdentry && !ovl_is_whiteout(upperdentry) && - ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT)) + inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; if (!*stackp) @@ -730,7 +730,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = ERR_PTR(-ESTALE); goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || - ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { + inode_wrong_type(inode, d_inode(origin)->i_mode)) { /* * Index should always be of the same file type as origin * except for the case of a whiteout index. A whiteout @@ -919,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, continue; if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { + dput(this); err = -EPERM; pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 95cff83786a5..6ec73db4bf9e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -186,7 +186,12 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size) { const char *name = ovl_xattr(ofs, ox); - return vfs_getxattr(&init_user_ns, dentry, name, value, size); + int err = vfs_getxattr(&init_user_ns, dentry, name, value, size); + int len = (value && err > 0) ? err : 0; + + pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", + dentry, name, min(len, 48), value, size, err); + return err; } static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, @@ -319,9 +324,6 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); -void ovl_set_flag(unsigned long flag, struct inode *inode); -void ovl_clear_flag(unsigned long flag, struct inode *inode); -bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); bool ovl_is_inuse(struct dentry *dentry); @@ -335,6 +337,21 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, int padding); int ovl_sync_status(struct ovl_fs *ofs); +static inline void ovl_set_flag(unsigned long flag, struct inode *inode) +{ + set_bit(flag, &OVL_I(inode)->flags); +} + +static inline void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + +static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) +{ + return test_bit(flag, &OVL_I(inode)->flags); +} + static inline bool ovl_is_impuredir(struct super_block *sb, struct dentry *dentry) { @@ -439,6 +456,18 @@ int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static inline bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); int ovl_set_nlink_lower(struct dentry *dentry); @@ -519,8 +548,9 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); -long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ovl_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index f404a78e6b60..e8ad2c2c77dd 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -319,18 +319,6 @@ static inline int ovl_dir_read(struct path *realpath, return err; } -/* - * Can we iterate real dir directly? - * - * Non-merge dir may contain whiteouts from a time it was a merge upper, before - * lower dir was removed under it and possibly before it was rotated from upper - * to lower layer. - */ -static bool ovl_dir_is_real(struct dentry *dir) -{ - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); -} - static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; @@ -963,10 +951,6 @@ const struct file_operations ovl_dir_operations = { .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, - .unlocked_ioctl = ovl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ovl_compat_ioctl, -#endif }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fdd72f1a9c5e..b01d4147520d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -380,6 +380,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) ofs->config.metacopy ? "on" : "off"); if (ofs->config.ovl_volatile) seq_puts(m, ",volatile"); + if (ofs->config.userxattr) + seq_puts(m, ",userxattr"); return 0; } @@ -945,6 +947,16 @@ static int ovl_lower_dir(const char *name, struct path *path, pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + /* + * Decoding origin file handle is required for persistent st_ino. + * Without persistent st_ino, xino=auto falls back to xino=off. + */ + if (ofs->config.xino == OVL_XINO_AUTO && + ofs->config.upperdir && !fh_type) { + ofs->config.xino = OVL_XINO_OFF; + pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n", + name); + } /* Check if lower fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) @@ -1042,9 +1054,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, } err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); - if (!err) - ovl_copyattr(ovl_inode_real(inode), inode); - return err; out_acl_release: @@ -1185,8 +1194,8 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, if (err) goto out; - /* Upper fs should not be r/o */ - if (sb_rdonly(upperpath->mnt->mnt_sb)) { + /* Upperdir path should not be r/o */ + if (__mnt_is_readonly(upperpath->mnt)) { pr_err("upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out; @@ -1401,9 +1410,19 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { ofs->noxattr = true; - ofs->config.index = false; - ofs->config.metacopy = false; - pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); + if (ofs->config.index || ofs->config.metacopy) { + ofs->config.index = false; + ofs->config.metacopy = false; + pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n"); + } + /* + * xattr support is required for persistent st_ino. + * Without persistent st_ino, xino=auto falls back to xino=off. + */ + if (ofs->config.xino == OVL_XINO_AUTO) { + ofs->config.xino = OVL_XINO_OFF; + pr_warn("upper fs does not support xattr, falling back to xino=off.\n"); + } err = 0; } else { ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); @@ -1580,7 +1599,8 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * user opted-in to one of the new features that require following the * lower inode of non-dir upper. */ - if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino && + if (!ofs->config.index && !ofs->config.metacopy && + ofs->config.xino != OVL_XINO_ON && uuid_is_null(uuid)) return false; @@ -1609,6 +1629,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) dev_t dev; int err; bool bad_uuid = false; + bool warn = false; for (i = 0; i < ofs->numfs; i++) { if (ofs->fs[i].sb == sb) @@ -1617,13 +1638,20 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { bad_uuid = true; + if (ofs->config.xino == OVL_XINO_AUTO) { + ofs->config.xino = OVL_XINO_OFF; + warn = true; + } if (ofs->config.index || ofs->config.nfs_export) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + warn = true; + } + if (warn) { + pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n", uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + path->dentry, ovl_xino_str[ofs->config.xino]); } } @@ -1826,7 +1854,8 @@ out_err: * - upper/work dir of any overlayfs instance */ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, - struct dentry *dentry, const char *name) + struct dentry *dentry, const char *name, + bool is_lower) { struct dentry *next = dentry, *parent; int err = 0; @@ -1838,7 +1867,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, /* Walk back ancestors to root (inclusive) looking for traps */ while (!err && parent != next) { - if (ovl_lookup_trap_inode(sb, parent)) { + if (is_lower && ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { @@ -1864,7 +1893,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb, if (ovl_upper_mnt(ofs)) { err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, - "upperdir"); + "upperdir", false); if (err) return err; @@ -1875,7 +1904,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb, * workbasedir. In that case, we already have their traps in * inode cache and we will catch that case on lookup. */ - err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir"); + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", + false); if (err) return err; } @@ -1883,7 +1913,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb, for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, ofs->layers[i].mnt->mnt_root, - "lowerdir"); + "lowerdir", true); if (err) return err; } @@ -1952,6 +1982,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs) goto out; + err = -ENOMEM; ofs->creator_cred = cred = prepare_creds(); if (!cred) goto out_err; @@ -1980,6 +2011,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!splitlower) goto out_err; + err = -EINVAL; numlower = ovl_split_lowerdirs(splitlower); if (numlower > OVL_MAX_STACK) { pr_err("too many lower directories, limit is %d\n", @@ -1987,6 +2019,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; } + err = -ENOMEM; layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); if (!layers) goto out_err; @@ -2013,6 +2046,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (ofs->config.upperdir) { struct super_block *upper_sb; + err = -EINVAL; if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); goto out_err; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7f5a01a11f97..b9d03627f364 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -214,7 +214,7 @@ const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) /* * ovl_dentry_lower() could return either a data dentry or metacopy dentry - * dependig on what is stored in lowerstack[0]. At times we need to find + * depending on what is stored in lowerstack[0]. At times we need to find * lower dentry which has data (and not metacopy dentry). This helper * returns the lower data dentry. */ @@ -422,18 +422,20 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) } } -static void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) +static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); WARN_ON(!inode_is_locked(inode)); + WARN_ON(!d_is_dir(dentry)); /* - * Version is used by readdir code to keep cache consistent. For merge - * dirs all changes need to be noted. For non-merge dirs, cache only - * contains impure (ones which have been copied up and have origins) - * entries, so only need to note changes to impure entries. + * Version is used by readdir code to keep cache consistent. + * For merge dirs (or dirs with origin) all changes need to be noted. + * For non-merge dirs, cache contains only impure entries (i.e. ones + * which have been copied up and have origins), so only need to note + * changes to impure entries. */ - if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) + if (!ovl_dir_is_real(dentry) || impurity) OVL_I(inode)->version++; } @@ -442,7 +444,7 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity) /* Copy mtime/ctime */ ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); - ovl_dentry_version_inc(dentry, impurity); + ovl_dir_version_inc(dentry, impurity); } u64 ovl_dentry_version_get(struct dentry *dentry) @@ -638,21 +640,6 @@ int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) return err; } -void ovl_set_flag(unsigned long flag, struct inode *inode) -{ - set_bit(flag, &OVL_I(inode)->flags); -} - -void ovl_clear_flag(unsigned long flag, struct inode *inode) -{ - clear_bit(flag, &OVL_I(inode)->flags); -} - -bool ovl_test_flag(unsigned long flag, struct inode *inode) -{ - return test_bit(flag, &OVL_I(inode)->flags); -} - /** * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. diff --git a/fs/proc/array.c b/fs/proc/array.c index bb87e4d89cd8..7ec59171f197 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -342,9 +342,11 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); +#ifdef CONFIG_SECCOMP_FILTER seq_put_decimal_ull(m, "\nSeccomp_filters:\t", atomic_read(&p->seccomp.filter_count)); #endif +#endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 984e42f8cb11..66c7dd11bd7c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -94,14 +94,9 @@ static void sysctl_print_dir(struct ctl_dir *dir) static int namecmp(const char *name1, int len1, const char *name2, int len2) { - int minlen; int cmp; - minlen = len1; - if (minlen > len2) - minlen = len2; - - cmp = memcmp(name1, name2, minlen); + cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; @@ -1108,6 +1103,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "array not allowed"); } + if (table->proc_handler == proc_dou8vec_minmax) { + if (table->maxlen != sizeof(u8)) + err |= sysctl_err(path, table, "array not allowed"); + } + return err; } @@ -1123,6 +1123,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dou8vec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d963ae7902f9..b9614db48b1d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -385,6 +385,7 @@ void pstore_record_init(struct pstore_record *record, static void pstore_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { + struct kmsg_dump_iter iter; unsigned long total = 0; const char *why; unsigned int part = 1; @@ -405,6 +406,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, } } + kmsg_dump_rewind(&iter); + oopscount++; while (total < kmsg_bytes) { char *dst; @@ -435,7 +438,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size -= header_size; /* Write dump contents. */ - if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + if (!kmsg_dump_get_buffer(&iter, true, dst + header_size, dst_size, &dump_size)) break; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index ca6d8a867285..fefe3d391d3a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -56,7 +56,7 @@ MODULE_PARM_DESC(mem_size, static unsigned int mem_type; module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, - "set to 1 to try to use unbuffered memory (default 0)"); + "memory type: 0=write-combined (default), 1=unbuffered, 2=cached"); static int ramoops_max_reason = -1; module_param_named(max_reason, ramoops_max_reason, int, 0400); @@ -648,6 +648,10 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; + /* + * Setting "unbuffered" is deprecated and will be ignored if + * "mem_type" is also specified. + */ pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); /* * Setting "no-dump-oops" is deprecated and will be ignored if @@ -666,6 +670,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, field = value; \ } + parse_u32("mem-type", pdata->record_size, pdata->mem_type); parse_u32("record-size", pdata->record_size, 0); parse_u32("console-size", pdata->console_size, 0); parse_u32("ftrace-size", pdata->ftrace_size, 0); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index fff363bfd484..fe5305028c6e 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -396,6 +396,10 @@ void persistent_ram_zap(struct persistent_ram_zone *prz) persistent_ram_update_header_ecc(prz); } +#define MEM_TYPE_WCOMBINE 0 +#define MEM_TYPE_NONCACHED 1 +#define MEM_TYPE_NORMAL 2 + static void *persistent_ram_vmap(phys_addr_t start, size_t size, unsigned int memtype) { @@ -409,10 +413,20 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, page_start = start - offset_in_page(start); page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); - if (memtype) + switch (memtype) { + case MEM_TYPE_NORMAL: + prot = PAGE_KERNEL; + break; + case MEM_TYPE_NONCACHED: prot = pgprot_noncached(PAGE_KERNEL); - else + break; + case MEM_TYPE_WCOMBINE: prot = pgprot_writecombine(PAGE_KERNEL); + break; + default: + pr_err("invalid mem_type=%d\n", memtype); + return NULL; + } pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 6d16b2be5ac4..05e4bd9ab6d6 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -17,6 +17,7 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> +#include <linux/mount.h> #include <linux/writeback.h> #include <linux/nospec.h> #include "compat.h" @@ -471,6 +472,7 @@ static int quota_getstatev(struct super_block *sb, int type, fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; + fqs->qs_rtbwarnlimit = state.s_state[type].rt_spc_warnlimit; /* Inodes may be allocated even if inactive; copy out if present */ if (state.s_state[USRQUOTA].ino) { @@ -827,8 +829,6 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, } } -#ifdef CONFIG_BLOCK - /* Return 1 if 'cmd' will block on frozen filesystem */ static int quotactl_cmd_write(int cmd) { @@ -850,7 +850,6 @@ static int quotactl_cmd_write(int cmd) } return 1; } -#endif /* CONFIG_BLOCK */ /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) @@ -968,3 +967,48 @@ out: path_put(pathp); return ret; } + +SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *, + mountpoint, qid_t, id, void __user *, addr) +{ + struct super_block *sb; + struct path mountpath; + unsigned int cmds = cmd >> SUBCMDSHIFT; + unsigned int type = cmd & SUBCMDMASK; + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + + ret = user_path_at(AT_FDCWD, mountpoint, + LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &mountpath); + if (ret) + return ret; + + sb = mountpath.mnt->mnt_sb; + + if (quotactl_cmd_write(cmds)) { + ret = mnt_want_write(mountpath.mnt); + if (ret) + goto out; + } + + if (quotactl_cmd_onoff(cmds)) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); + + ret = do_quotactl(sb, type, cmds, id, addr, ERR_PTR(-EINVAL)); + + if (quotactl_cmd_onoff(cmds)) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); + + if (quotactl_cmd_write(cmds)) + mnt_drop_write(mountpath.mnt); +out: + path_put(&mountpath); + + return ret; +} diff --git a/fs/readdir.c b/fs/readdir.c index 19434b3c982c..09e8ed7d4161 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -150,6 +150,9 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, if (buf->result) return -EINVAL; + buf->result = verify_dirent_name(name, namlen); + if (buf->result < 0) + return buf->result; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; @@ -405,6 +408,9 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, if (buf->result) return -EINVAL; + buf->result = verify_dirent_name(name, namlen); + if (buf->result < 0) + return buf->result; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1db0254bc38b..203a47232707 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -258,4 +258,6 @@ const struct inode_operations reiserfs_file_inode_operations = { .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, }; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 4f1cbd930179..4b86ecf5817e 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -10,6 +10,59 @@ #include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/compat.h> +#include <linux/fileattr.h> + +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + + if (!reiserfs_attrs(inode->i_sb)) + return -ENOTTY; + + fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs); + + return 0; +} + +int reiserfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + unsigned int flags = fa->flags; + int err; + + reiserfs_write_lock(inode->i_sb); + + err = -ENOTTY; + if (!reiserfs_attrs(inode->i_sb)) + goto unlock; + + err = -EOPNOTSUPP; + if (fileattr_has_fsx(fa)) + goto unlock; + + /* + * Is it quota file? Do not allow user to mess with it + */ + err = -EPERM; + if (IS_NOQUOTA(inode)) + goto unlock; + + if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) { + err = reiserfs_unpack(inode); + if (err) + goto unlock; + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + err = 0; +unlock: + reiserfs_write_unlock(inode->i_sb); + + return err; +} /* * reiserfs_ioctl - handler for ioctl for inode @@ -23,7 +76,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - unsigned int flags; int err = 0; reiserfs_write_lock(inode->i_sb); @@ -32,7 +84,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case REISERFS_IOC_UNPACK: if (S_ISREG(inode->i_mode)) { if (arg) - err = reiserfs_unpack(inode, filp); + err = reiserfs_unpack(inode); } else err = -ENOTTY; break; @@ -40,63 +92,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) * following two cases are taken from fs/ext2/ioctl.c by Remy * Card (card@masi.ibp.fr) */ - case REISERFS_IOC_GETFLAGS: - if (!reiserfs_attrs(inode->i_sb)) { - err = -ENOTTY; - break; - } - - flags = REISERFS_I(inode)->i_attrs; - err = put_user(flags, (int __user *)arg); - break; - case REISERFS_IOC_SETFLAGS:{ - if (!reiserfs_attrs(inode->i_sb)) { - err = -ENOTTY; - break; - } - - err = mnt_want_write_file(filp); - if (err) - break; - - if (!inode_owner_or_capable(&init_user_ns, inode)) { - err = -EPERM; - goto setflags_out; - } - if (get_user(flags, (int __user *)arg)) { - err = -EFAULT; - goto setflags_out; - } - /* - * Is it quota file? Do not allow user to mess with it - */ - if (IS_NOQUOTA(inode)) { - err = -EPERM; - goto setflags_out; - } - err = vfs_ioc_setflags_prepare(inode, - REISERFS_I(inode)->i_attrs, - flags); - if (err) - goto setflags_out; - if ((flags & REISERFS_NOTAIL_FL) && - S_ISREG(inode->i_mode)) { - int result; - - result = reiserfs_unpack(inode, filp); - if (result) { - err = result; - goto setflags_out; - } - } - sd_attrs_to_i_attrs(flags, inode); - REISERFS_I(inode)->i_attrs = flags; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); -setflags_out: - mnt_drop_write_file(filp); - break; - } case REISERFS_IOC_GETVERSION: err = put_user(inode->i_generation, (int __user *)arg); break; @@ -138,12 +133,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, case REISERFS_IOC32_UNPACK: cmd = REISERFS_IOC_UNPACK; break; - case REISERFS_IOC32_GETFLAGS: - cmd = REISERFS_IOC_GETFLAGS; - break; - case REISERFS_IOC32_SETFLAGS: - cmd = REISERFS_IOC_SETFLAGS; - break; case REISERFS_IOC32_GETVERSION: cmd = REISERFS_IOC_GETVERSION; break; @@ -165,7 +154,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, * Function try to convert tail from direct item into indirect. * It set up nopack attribute in the REISERFS_I(inode)->nopack */ -int reiserfs_unpack(struct inode *inode, struct file *filp) +int reiserfs_unpack(struct inode *inode) { int retval = 0; int index; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e98f99338f8f..9edc8e2b154e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -461,7 +461,6 @@ int reiserfs_in_journal(struct super_block *sb, b_blocknr_t * next_zero_bit) { struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_cnode *cn; struct reiserfs_list_bitmap *jb; int i; unsigned long bl; @@ -497,13 +496,12 @@ int reiserfs_in_journal(struct super_block *sb, bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; /* is it in any old transactions? */ if (search_all - && (cn = - get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { + && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { return 1; } /* is it in the current transaction. This should never happen */ - if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { + if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) { BUG(); return 1; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e6eb05e2b2f1..017db70d0f48 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1660,6 +1660,8 @@ const struct inode_operations reiserfs_dir_inode_operations = { .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, }; /* diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 0ca2ac62e534..3aa928ec527a 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -18,8 +18,6 @@ /* the 32 bit compat definitions with int argument */ #define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) -#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS #define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION #define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION @@ -3408,7 +3406,10 @@ __u32 r5_hash(const signed char *msg, int len); #define SPARE_SPACE 500 /* prototypes from ioctl.c */ +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int reiserfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long reiserfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -int reiserfs_unpack(struct inode *inode, struct file *filp); +int reiserfs_unpack(struct inode *inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1b9c7a387dc7..3ffafc73acf0 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2408,7 +2408,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, * IO to work */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { - err = reiserfs_unpack(inode, NULL); + err = reiserfs_unpack(inode); if (err) { reiserfs_warning(sb, "super-6520", "Unpacking tail of quota file failed" diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 9b3b06da568c..e47fde1182de 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -44,7 +44,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec); static inline int reiserfs_xattrs_initialized(struct super_block *sb) { - return REISERFS_SB(sb)->priv_root != NULL; + return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root; } #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) diff --git a/fs/select.c b/fs/select.c index 37aaa8317f3a..945896d0ac9e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block) ret = do_sys_poll(ufds, nfds, to); - if (ret == -ERESTARTNOHAND) { - restart_block->fn = do_restart_poll; - ret = -ERESTART_RESTARTBLOCK; - } + if (ret == -ERESTARTNOHAND) + ret = set_restart_fn(restart_block, do_restart_poll); + return ret; } @@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, struct restart_block *restart_block; restart_block = ¤t->restart_block; - restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; @@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } else restart_block->poll.has_timeout = 0; - ret = -ERESTART_RESTARTBLOCK; + ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } diff --git a/fs/seq_file.c b/fs/seq_file.c index cb11a34fb871..5059248f2d64 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -412,6 +412,24 @@ void seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); +#ifdef CONFIG_BINARY_PRINTF +void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) +{ + int len; + + if (m->count < m->size) { + len = bstr_printf(m->buf + m->count, m->size - m->count, f, + binary); + if (m->count + len < m->size) { + m->count += len; + return; + } + } + seq_set_overflow(m); +} +EXPORT_SYMBOL(seq_bprintf); +#endif /* CONFIG_BINARY_PRINTF */ + /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start diff --git a/fs/signalfd.c b/fs/signalfd.c index 456046e15873..040a1142915f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, #endif new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; + case SIL_PERF_EVENT: + new.ssi_addr = (long) kinfo->si_addr; + new.ssi_perf = kinfo->si_perf; + break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; new.ssi_uid = kinfo->si_uid; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index eb02072d28dd..723763746238 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -152,14 +152,18 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); - if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + if (start >= end + || (end - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); - if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) { + if (start >= lookup_table_start || + (lookup_table_start - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 11581bf31af4..ea5387679723 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -97,14 +97,16 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); - if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + if (start >= end || (end - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); - if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) { + if (start >= id_table_start || (id_table_start - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 8d64edb80ebf..b3fdc8212c5f 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -17,6 +17,7 @@ /* size of metadata (inode and directory) blocks */ #define SQUASHFS_METADATA_SIZE 8192 +#define SQUASHFS_BLOCK_OFFSET 2 /* default size of block device I/O */ #ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index ead66670b41a..087cab8c78f4 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -109,14 +109,16 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); - if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + if (start >= end || (end - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); - if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) { + if (start >= table_start || (table_start - start) > + (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/stat.c b/fs/stat.c index fbc171d038aa..1fa38bdec1a6 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -86,12 +86,20 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, /* SB_NOATIME means filesystem supplies dummy atime value */ if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; + + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; + stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | + STATX_ATTR_DAX); + mnt_userns = mnt_user_ns(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(mnt_userns, path, stat, diff --git a/fs/super.c b/fs/super.c index 8c1baca35c16..11b7e7213fd1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -454,6 +454,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); + security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4b83cbded559..1261e8b41edb 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -477,7 +477,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * The instances directory is special as it allows for mkdir and rmdir to * to be done by userspace. When a mkdir or rmdir is performed, the inode - * locks are released and the methhods passed in (@mkdir and @rmdir) are + * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created * within the instances directory. * diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index d9d8d7794eff..5bd8482e660a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1637,6 +1637,8 @@ const struct inode_operations ubifs_dir_inode_operations = { .listxattr = ubifs_listxattr, .update_time = ubifs_update_time, .tmpfile = ubifs_tmpfile, + .fileattr_get = ubifs_fileattr_get, + .fileattr_set = ubifs_fileattr_set, }; const struct file_operations ubifs_dir_operations = { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0e4b4be3aa26..2e4e1d159969 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1648,6 +1648,8 @@ const struct inode_operations ubifs_file_inode_operations = { .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, .update_time = ubifs_update_time, + .fileattr_get = ubifs_fileattr_get, + .fileattr_set = ubifs_fileattr_set, }; const struct inode_operations ubifs_symlink_inode_operations = { diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a4aaeea63893..dc3e26e9ed7b 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -102,7 +102,8 @@ static int switch_gc_head(struct ubifs_info *c) * This function compares data nodes @a and @b. Returns %1 if @a has greater * inode or block number, and %-1 otherwise. */ -static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +static int data_nodes_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; @@ -145,8 +146,8 @@ static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) * first and sorted by length in descending order. Directory entry nodes go * after inode nodes and are sorted in ascending hash valuer order. */ -static int nondata_nodes_cmp(void *priv, struct list_head *a, - struct list_head *b) +static int nondata_nodes_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 2326d5122beb..c6a863487780 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -14,6 +14,7 @@ #include <linux/compat.h> #include <linux/mount.h> +#include <linux/fileattr.h> #include "ubifs.h" /* Need to be kept consistent with checked flags in ioctl2ubifs() */ @@ -103,7 +104,7 @@ static int ubifs2ioctl(int ubifs_flags) static int setflags(struct inode *inode, int flags) { - int oldflags, err, release; + int err, release; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, @@ -114,11 +115,6 @@ static int setflags(struct inode *inode, int flags) return err; mutex_lock(&ui->ui_mutex); - oldflags = ubifs2ioctl(ui->flags); - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (err) - goto out_unlock; - ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); @@ -132,54 +128,52 @@ static int setflags(struct inode *inode, int flags) if (IS_SYNC(inode)) err = write_inode_now(inode, 1); return err; +} -out_unlock: - mutex_unlock(&ui->ui_mutex); - ubifs_release_budget(c, &req); - return err; +int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + int flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + if (d_is_special(dentry)) + return -ENOTTY; + + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); + fileattr_fill_flags(fa, flags); + + return 0; } -long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +int ubifs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - int flags, err; - struct inode *inode = file_inode(file); + struct inode *inode = d_inode(dentry); + int flags = fa->flags; - switch (cmd) { - case FS_IOC_GETFLAGS: - flags = ubifs2ioctl(ubifs_inode(inode)->flags); + if (d_is_special(dentry)) + return -ENOTTY; - dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); - return put_user(flags, (int __user *) arg); + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - case FS_IOC_SETFLAGS: { - if (IS_RDONLY(inode)) - return -EROFS; + if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS) + return -EOPNOTSUPP; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EACCES; + flags &= UBIFS_SETTABLE_IOCTL_FLAGS; - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; - if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS) - return -EOPNOTSUPP; - flags &= UBIFS_SETTABLE_IOCTL_FLAGS; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); + return setflags(inode, flags); +} - if (!S_ISDIR(inode->i_mode)) - flags &= ~FS_DIRSYNC_FL; +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + struct inode *inode = file_inode(file); - /* - * Make sure the file-system is read-write and make sure it - * will not become read-only while we are changing the flags. - */ - err = mnt_want_write_file(file); - if (err) - return err; - dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); - err = setflags(inode, flags); - mnt_drop_write_file(file); - return err; - } + switch (cmd) { case FS_IOC_SET_ENCRYPTION_POLICY: { struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 0f8a6a16421b..4d17e5382b74 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -298,8 +298,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) * entries @a and @b by comparing their sequence numer. Returns %1 if @a has * greater sequence number and %-1 otherwise. */ -static int replay_entries_cmp(void *priv, struct list_head *a, - struct list_head *b) +static int replay_entries_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct ubifs_info *c = priv; struct replay_entry *ra, *rb; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7fdfdbda4b8a..b65c599a386a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2053,6 +2053,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place); void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ +int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ubifs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); #ifdef CONFIG_COMPAT diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f146b3089f3d..3ae9f1e91984 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1215,11 +1215,10 @@ static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct inode *inode = NULL; - struct qstr dotdot = QSTR_INIT("..", 2); struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi)) + if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi)) return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 983558b572c7..74028b5a7b0a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -128,10 +128,9 @@ static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, static struct dentry *ufs_get_parent(struct dentry *child) { - struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; - ino = ufs_inode_by_name(d_inode(child), &dot_dot); + ino = ufs_inode_by_name(d_inode(child), &dotdot_name); if (!ino) return ERR_PTR(-ENOENT); return d_obtain_alias(ufs_iget(child->d_sb, ino)); diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 7aee0ec63ade..eac6788fc6cf 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -225,7 +225,7 @@ static struct dentry *vboxsf_dir_lookup(struct inode *parent, } else { inode = vboxsf_new_inode(parent->i_sb); if (!IS_ERR(inode)) - vboxsf_init_inode(sbi, inode, &fsinfo); + vboxsf_init_inode(sbi, inode, &fsinfo, false); } return d_splice_alias(inode, dentry); @@ -245,7 +245,7 @@ static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry, sf_i = VBOXSF_I(inode); /* The host may have given us different attr then requested */ sf_i->force_restat = 1; - vboxsf_init_inode(sbi, inode, info); + vboxsf_init_inode(sbi, inode, info, false); d_instantiate(dentry, inode); diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index d7816c01a4f6..4f5e59f06284 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -207,7 +207,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) err = -ENOMEM; goto fail_unmap; } - vboxsf_init_inode(sbi, iroot, &sbi->root_info); + vboxsf_init_inode(sbi, iroot, &sbi->root_info, false); unlock_new_inode(iroot); droot = d_make_root(iroot); @@ -418,7 +418,7 @@ static int vboxsf_reconfigure(struct fs_context *fc) /* Apply changed options to the root inode */ sbi->o = ctx->o; - vboxsf_init_inode(sbi, iroot, &sbi->root_info); + vboxsf_init_inode(sbi, iroot, &sbi->root_info, true); return 0; } diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 3b847e3fba24..aec2ebf7d25a 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -45,12 +45,12 @@ struct inode *vboxsf_new_inode(struct super_block *sb) } /* set [inode] attributes based on [info], uid/gid based on [sbi] */ -void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, - const struct shfl_fsobjinfo *info) +int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, + const struct shfl_fsobjinfo *info, bool reinit) { const struct shfl_fsobjattr *attr; s64 allocated; - int mode; + umode_t mode; attr = &info->attr; @@ -75,29 +75,44 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, inode->i_mapping->a_ops = &vboxsf_reg_aops; if (SHFL_IS_DIRECTORY(attr->mode)) { - inode->i_mode = sbi->o.dmode_set ? sbi->o.dmode : mode; - inode->i_mode &= ~sbi->o.dmask; - inode->i_mode |= S_IFDIR; - inode->i_op = &vboxsf_dir_iops; - inode->i_fop = &vboxsf_dir_fops; - /* - * XXX: this probably should be set to the number of entries - * in the directory plus two (. ..) - */ - set_nlink(inode, 1); + if (sbi->o.dmode_set) + mode = sbi->o.dmode; + mode &= ~sbi->o.dmask; + mode |= S_IFDIR; + if (!reinit) { + inode->i_op = &vboxsf_dir_iops; + inode->i_fop = &vboxsf_dir_fops; + /* + * XXX: this probably should be set to the number of entries + * in the directory plus two (. ..) + */ + set_nlink(inode, 1); + } else if (!S_ISDIR(inode->i_mode)) + return -ESTALE; + inode->i_mode = mode; } else if (SHFL_IS_SYMLINK(attr->mode)) { - inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode; - inode->i_mode &= ~sbi->o.fmask; - inode->i_mode |= S_IFLNK; - inode->i_op = &vboxsf_lnk_iops; - set_nlink(inode, 1); + if (sbi->o.fmode_set) + mode = sbi->o.fmode; + mode &= ~sbi->o.fmask; + mode |= S_IFLNK; + if (!reinit) { + inode->i_op = &vboxsf_lnk_iops; + set_nlink(inode, 1); + } else if (!S_ISLNK(inode->i_mode)) + return -ESTALE; + inode->i_mode = mode; } else { - inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode; - inode->i_mode &= ~sbi->o.fmask; - inode->i_mode |= S_IFREG; - inode->i_op = &vboxsf_reg_iops; - inode->i_fop = &vboxsf_reg_fops; - set_nlink(inode, 1); + if (sbi->o.fmode_set) + mode = sbi->o.fmode; + mode &= ~sbi->o.fmask; + mode |= S_IFREG; + if (!reinit) { + inode->i_op = &vboxsf_reg_iops; + inode->i_fop = &vboxsf_reg_fops; + set_nlink(inode, 1); + } else if (!S_ISREG(inode->i_mode)) + return -ESTALE; + inode->i_mode = mode; } inode->i_uid = sbi->o.uid; @@ -116,6 +131,7 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, info->change_time.ns_relative_to_unix_epoch); inode->i_mtime = ns_to_timespec64( info->modification_time.ns_relative_to_unix_epoch); + return 0; } int vboxsf_create_at_dentry(struct dentry *dentry, @@ -199,7 +215,9 @@ int vboxsf_inode_revalidate(struct dentry *dentry) dentry->d_time = jiffies; sf_i->force_restat = 0; - vboxsf_init_inode(sbi, inode, &info); + err = vboxsf_init_inode(sbi, inode, &info, true); + if (err) + return err; /* * If the file was changed on the host side we need to invalidate the diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 760524e78c88..6a7a9cedebc6 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -82,8 +82,8 @@ extern const struct dentry_operations vboxsf_dentry_ops; /* from utils.c */ struct inode *vboxsf_new_inode(struct super_block *sb); -void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, - const struct shfl_fsobjinfo *info); +int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, + const struct shfl_fsobjinfo *info, bool reinit); int vboxsf_create_at_dentry(struct dentry *dentry, struct shfl_createparms *params); int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 88fb25119899..24d1b54de807 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -3,9 +3,13 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO - # SHA-256 is selected as it's intended to be the default hash algorithm. + # SHA-256 is implied as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. - select CRYPTO_SHA256 + # Note that CRYPTO_SHA256 denotes the generic C implementation, but + # some architectures provided optimized implementations of the same + # algorithm that may be used instead. In this case, CRYPTO_SHA256 may + # be omitted even if SHA-256 is being used. + imply CRYPTO_SHA256 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported diff --git a/fs/xattr.c b/fs/xattr.c index b3444e06cded..5c8c5175b385 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -186,12 +186,12 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * - * @mnt_userns - user namespace of the mount the inode was found from - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations + * @mnt_userns: user namespace of the mount the inode was found from + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * @@ -242,6 +242,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * + * @mnt_userns: user namespace of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -473,6 +474,7 @@ EXPORT_SYMBOL(__vfs_removexattr); * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * + * @mnt_userns: user namespace of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9331f3516afa..c68a36688474 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -22,6 +22,11 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" static int xfs_get_aghdr_buf( @@ -485,6 +490,116 @@ xfs_ag_init_headers( return error; } +int +xfs_ag_shrink_space( + struct xfs_mount *mp, + struct xfs_trans **tpp, + xfs_agnumber_t agno, + xfs_extlen_t delta) +{ + struct xfs_alloc_arg args = { + .tp = *tpp, + .mp = mp, + .type = XFS_ALLOCTYPE_THIS_BNO, + .minlen = delta, + .maxlen = delta, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .resv = XFS_AG_RESV_NONE, + .prod = 1 + }; + struct xfs_buf *agibp, *agfbp; + struct xfs_agi *agi; + struct xfs_agf *agf; + int error, err2; + + ASSERT(agno == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp); + if (error) + return error; + + agi = agibp->b_addr; + + error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp); + if (error) + return error; + + agf = agfbp->b_addr; + /* some extra paranoid checks before we shrink the ag */ + if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) + return -EFSCORRUPTED; + if (delta >= agi->agi_length) + return -EINVAL; + + args.fsbno = XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agi->agi_length) - delta); + + /* + * Disable perag reservations so it doesn't cause the allocation request + * to fail. We'll reestablish reservation before we return. + */ + error = xfs_ag_resv_free(agibp->b_pag); + if (error) + return error; + + /* internal log shouldn't also show up in the free space btrees */ + error = xfs_alloc_vextent(&args); + if (!error && args.agbno == NULLAGBLOCK) + error = -ENOSPC; + + if (error) { + /* + * if extent allocation fails, need to roll the transaction to + * ensure that the AGFL fixup has been committed anyway. + */ + xfs_trans_bhold(*tpp, agfbp); + err2 = xfs_trans_roll(tpp); + if (err2) + return err2; + xfs_trans_bjoin(*tpp, agfbp); + goto resv_init_out; + } + + /* + * if successfully deleted from freespace btrees, need to confirm + * per-AG reservation works as expected. + */ + be32_add_cpu(&agi->agi_length, -delta); + be32_add_cpu(&agf->agf_length, -delta); + + err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + if (err2) { + be32_add_cpu(&agi->agi_length, delta); + be32_add_cpu(&agf->agf_length, delta); + if (err2 != -ENOSPC) + goto resv_err; + + __xfs_bmap_add_free(*tpp, args.fsbno, delta, NULL, true); + + /* + * Roll the transaction before trying to re-init the per-ag + * reservation. The new transaction is clean so it will cancel + * without any side effects. + */ + error = xfs_defer_finish(tpp); + if (error) + return error; + + error = -ENOSPC; + goto resv_init_out; + } + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); + xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); + return 0; +resv_init_out: + err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + if (!err2) + return error; +resv_err: + xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return err2; +} + /* * Extent the AG indicated by the @id by the length passed in */ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 5166322807e7..4535de1d88ea 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -24,6 +24,8 @@ struct aghdr_init_data { }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); +int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp, + xfs_agnumber_t agno, xfs_extlen_t delta); int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, struct aghdr_init_data *id, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fdfe6dc0d307..6c5f8d10589c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -211,7 +211,11 @@ __xfs_ag_resv_init( ASSERT(0); return -EINVAL; } - error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + error = -ENOSPC; + else + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0c623d3c1036..aaa19101bb2a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2906,13 +2906,13 @@ xfs_agf_verify( if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels) return __this_address; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels)) return __this_address; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && @@ -2939,7 +2939,7 @@ xfs_agf_verify( if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) return __this_address; return NULL; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 472b3039eabb..96146f425e50 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -70,6 +70,26 @@ xfs_inode_hasattr( return 1; } +/* + * Returns true if the there is exactly only block in the attr fork, in which + * case the attribute fork consists of a single leaf block entry. + */ +bool +xfs_attr_is_leaf( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = ip->i_afp; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap; + + if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS) + return false; + + xfs_iext_first(ifp, &icur); + xfs_iext_get_extent(ifp, &icur, &imap); + return imap.br_startoff == 0 && imap.br_blockcount == 1; +} + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -89,7 +109,7 @@ xfs_attr_get_ilocked( if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); - if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) + if (xfs_attr_is_leaf(args->dp)) return xfs_attr_leaf_get(args); return xfs_attr_node_get(args); } @@ -293,7 +313,7 @@ xfs_attr_set_args( return error; } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_addname(args); if (error != -ENOSPC) return error; @@ -342,12 +362,10 @@ xfs_has_attr( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_sf_findname(args, NULL, NULL); - } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); if (bp) @@ -366,21 +384,14 @@ int xfs_attr_remove_args( struct xfs_da_args *args) { - struct xfs_inode *dp = args->dp; - int error; - - if (!xfs_inode_hasattr(dp)) { - error = -ENOATTR; - } else if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); - error = xfs_attr_shortform_remove(args); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_removename(args); - } else { - error = xfs_attr_node_removename(args); - } + if (!xfs_inode_hasattr(args->dp)) + return -ENOATTR; - return error; + if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_remove(args); + if (xfs_attr_is_leaf(args->dp)) + return xfs_attr_leaf_removename(args); + return xfs_attr_node_removename(args); } /* @@ -928,6 +939,7 @@ restart: * Search to see if name already exists, and get back a pointer * to where it should go. */ + error = 0; retval = xfs_attr_node_hasname(args, &state); if (retval != -ENOATTR && retval != -EEXIST) goto out; @@ -1282,7 +1294,7 @@ xfs_attr_node_removename( /* * If the result is small enough, push it all into the inode. */ - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_attr_is_leaf(dp)) error = xfs_attr_node_shrink(args, state); out: diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 3e97a935e712..2b1f61987a9d 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -85,6 +85,7 @@ int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_list_ilocked(struct xfs_attr_list_context *); int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); +bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index d6ef69ab1c67..556184b63061 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -518,10 +518,10 @@ xfs_attr_copy_value( * Query whether the total requested number of attr fork bytes of extended * attribute space will be able to fit inline. * - * Returns zero if not, else the di_forkoff fork offset to be used in the + * Returns zero if not, else the i_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * - * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value; + * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value; * special case for dev/uuid inodes, they have fixed size data forks. */ int @@ -560,7 +560,7 @@ xfs_attr_shortform_bytesfit( * literal area rebalancing. */ if (bytes <= XFS_IFORK_ASIZE(dp)) - return dp->i_d.di_forkoff; + return dp->i_forkoff; /* * For attr2 we can try to move the forkoff if there is space in the @@ -581,7 +581,7 @@ xfs_attr_shortform_bytesfit( * minimum offset only needs to be the space required for * the btree root. */ - if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; @@ -592,10 +592,10 @@ xfs_attr_shortform_bytesfit( * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ - if (dp->i_d.di_forkoff) { - if (offset < dp->i_d.di_forkoff) + if (dp->i_forkoff) { + if (offset < dp->i_forkoff) return 0; - return dp->i_d.di_forkoff; + return dp->i_forkoff; } dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; @@ -651,13 +651,8 @@ xfs_attr_shortform_create( trace_xfs_attr_sf_create(args); ASSERT(ifp->if_bytes == 0); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) { - ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - ifp->if_flags |= XFS_IFINLINE; - } else { - ASSERT(ifp->if_flags & XFS_IFINLINE); - } xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; memset(hdr, 0, sizeof(*hdr)); @@ -730,10 +725,10 @@ xfs_attr_shortform_add( dp = args->dp; mp = dp->i_mount; - dp->i_d.di_forkoff = forkoff; + dp->i_forkoff = forkoff; ifp = dp->i_afp; - ASSERT(ifp->if_flags & XFS_IFINLINE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); @@ -770,7 +765,7 @@ xfs_attr_fork_remove( xfs_idestroy_fork(ip->i_afp); kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; - ip->i_d.di_forkoff = 0; + ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -821,8 +816,8 @@ xfs_attr_shortform_remove( xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); - ASSERT(dp->i_d.di_forkoff); + dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); + ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || @@ -851,7 +846,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) trace_xfs_attr_sf_lookup(args); ifp = args->dp->i_afp; - ASSERT(ifp->if_flags & XFS_IFINLINE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; @@ -878,7 +873,7 @@ xfs_attr_shortform_getvalue( struct xfs_attr_sf_entry *sfe; int i; - ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); + ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e0905ad171f0..7e3b9b01431e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -66,13 +66,13 @@ xfs_bmap_compute_maxlevels( * either a signed 32-bit number for the data fork, or a signed 16-bit * number for the attr fork. * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. + * Note that we can no longer assume that if we are in ATTR1 that the + * fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted with + * ATTR2 and then mounted back with ATTR1, keeping the i_forkoff's fixed + * but probably at various positions. Therefore, for both ATTR1 and + * ATTR2 we have to assume the worst case scenario of a minimum size + * available. */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; @@ -94,6 +94,15 @@ xfs_bmap_compute_maxlevels( mp->m_bm_maxlevels[whichfork] = level; } +unsigned int +xfs_bmap_compute_attr_offset( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_inodesize == 256) + return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); +} + STATIC int /* error */ xfs_bmbt_lookup_eq( struct xfs_btree_cur *cur, @@ -192,22 +201,15 @@ uint xfs_default_attroffset( struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) - offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); - else - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - - ASSERT(offset < XFS_LITINO(mp)); - return offset; + if (ip->i_df.if_format == XFS_DINODE_FMT_DEV) + return roundup(sizeof(xfs_dev_t), 8); + return M_IGEO(ip->i_mount)->attr_fork_offset; } /* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. + * Helper routine to reset inode i_forkoff field when switching attribute fork + * from local to extent format - we reset it where possible to make space + * available for inline data fork extents. */ STATIC void xfs_bmap_forkoff_reset( @@ -219,8 +221,8 @@ xfs_bmap_forkoff_reset( ip->i_df.if_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; + if (dfl_forkoff > ip->i_forkoff) + ip->i_forkoff = dfl_forkoff; } } @@ -603,7 +605,7 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(!xfs_need_iread_extents(ifp)); ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); @@ -624,14 +626,13 @@ xfs_bmap_btree_to_extents( return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo); - ip->i_d.di_nblocks--; + ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); if (cur->bc_bufs[0] == cbp) cur->bc_bufs[0] = NULL; xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; @@ -675,7 +676,6 @@ xfs_bmap_extents_to_btree( * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; /* * Fill in the root. @@ -726,7 +726,7 @@ xfs_bmap_extents_to_btree( args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; - ip->i_d.di_nblocks++; + ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, args.fsbno), @@ -805,8 +805,6 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; ifp->if_u1.if_root = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; @@ -850,7 +848,6 @@ xfs_bmap_local_to_extents( flags = 0; error = 0; - ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; @@ -908,7 +905,7 @@ xfs_bmap_local_to_extents( xfs_iext_insert(ip, &icur, &rec, 0); ifp->if_nextents = 1; - ip->i_d.di_nblocks = 1; + ip->i_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); flags |= xfs_ilog_fext(whichfork); @@ -927,13 +924,15 @@ xfs_bmap_add_attrfork_btree( xfs_inode_t *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { + struct xfs_btree_block *block = ip->i_df.if_broot; xfs_btree_cur_t *cur; /* btree cursor */ int error; /* error return value */ xfs_mount_t *mp; /* file system mount struct */ int stat; /* newroot status */ mp = ip->i_mount; - if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + + if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -1027,23 +1026,27 @@ xfs_bmap_add_attrfork_local( return -EFSCORRUPTED; } -/* Set an inode attr fork off based on the format */ +/* + * Set an inode attr fork offset based on the format of the data fork. + */ int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, int size, int *version) { + int default_size = xfs_default_attroffset(ip) >> 3; + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + ip->i_forkoff = default_size; break; case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_forkoff) + ip->i_forkoff = default_size; else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) *version = 2; break; @@ -1092,11 +1095,7 @@ xfs_bmap_add_attrfork( goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, - GFP_KERNEL | __GFP_NOFAIL); - - ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; - ip->i_afp->if_flags = XFS_IFEXTENTS; + ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: @@ -1222,12 +1221,10 @@ xfs_iread_extents( struct xfs_btree_cur *cur; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!xfs_need_iread_extents(ifp)) + return 0; - if (XFS_IS_CORRUPT(mp, ifp->if_format != XFS_DINODE_FMT_BTREE)) { - error = -EFSCORRUPTED; - goto out; - } + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); @@ -1243,8 +1240,6 @@ xfs_iread_extents( goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); - - ifp->if_flags |= XFS_IFEXTENTS; return 0; out: xfs_iext_destroy(ifp); @@ -1279,11 +1274,9 @@ xfs_bmap_first_unused( ASSERT(xfs_ifork_has_extents(ifp)); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; lowest = max = *first_unused; for_each_xfs_iext(ifp, &icur, &got) { @@ -1331,11 +1324,9 @@ xfs_bmap_last_before( return -EFSCORRUPTED; } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got)) *last_block = 0; @@ -1354,11 +1345,9 @@ xfs_bmap_last_extent( struct xfs_iext_cursor icur; int error; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; xfs_iext_last(ifp, &icur); if (!xfs_iext_get_extent(ifp, &icur, rec)) @@ -1440,38 +1429,6 @@ xfs_bmap_last_offset( } /* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int rval; /* return value */ - struct xfs_bmbt_irec s; /* internal version of extent */ - struct xfs_iext_cursor icur; - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (ifp->if_nextents != 1) - return 0; - if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) - return 0; - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_first(ifp, &icur); - xfs_iext_get_extent(ifp, &icur, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -/* * Extent tree manipulation functions used during allocation. */ @@ -2938,7 +2895,7 @@ done: */ /* - * Adjust the size of the new extent based on di_extsize and rt extsize. + * Adjust the size of the new extent based on i_extsize and rt extsize. */ int xfs_bmap_extsize_align( @@ -3444,7 +3401,7 @@ xfs_bmap_btalloc_accounting( } /* data/attr fork only */ - ap->ip->i_d.di_nblocks += args->len; + ap->ip->i_nblocks += args->len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= args->len; @@ -3586,7 +3543,8 @@ xfs_bmap_exact_minlen_extent_alloc( args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; args.type = XFS_ALLOCTYPE_FIRST_AG; - args.total = args.minlen = args.maxlen = ap->minlen; + args.minlen = args.maxlen = ap->minlen; + args.total = ap->total; args.alignment = 1; args.minalignslop = 0; @@ -3985,11 +3943,9 @@ xfs_bmapi_read( XFS_STATS_INC(mp, xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) eof = true; @@ -4227,7 +4183,7 @@ xfs_bmapi_allocate( return error; } - if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) + if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); /* * Bump the number of extents we've allocated @@ -4300,7 +4256,7 @@ xfs_bmapi_convert_unwritten( * Modify (by adding) the state flag, if writing. */ ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) { bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); } @@ -4469,11 +4425,9 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - goto error0; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; @@ -4752,11 +4706,9 @@ xfs_bmapi_remap( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* make sure we only reflink into a hole. */ @@ -4764,10 +4716,10 @@ xfs_bmapi_remap( ASSERT(got.br_startoff - bno >= len); } - ip->i_d.di_nblocks += len; + ip->i_nblocks += len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; } @@ -5355,7 +5307,7 @@ xfs_bmap_del_extent_real( * Adjust inode # blocks in the file. */ if (nblks) - ip->i_d.di_nblocks -= nblks; + ip->i_nblocks -= nblks; /* * Adjust quota data. */ @@ -5427,9 +5379,10 @@ __xfs_bunmapi( else max_len = len; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + error = xfs_iread_extents(tp, ip, whichfork); + if (error) return error; + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; @@ -5445,7 +5398,7 @@ __xfs_bunmapi( end--; logflags = 0; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; @@ -5915,13 +5868,11 @@ xfs_bmap_collapse_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; } @@ -6032,13 +5983,11 @@ xfs_bmap_insert_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; } @@ -6135,12 +6084,10 @@ xfs_bmap_split_extent( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - /* Read in all the extents */ - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; /* * If there are not extents, or split_fsb lies in a hole we are done. @@ -6155,7 +6102,7 @@ xfs_bmap_split_extent( new.br_blockcount = got.br_blockcount - gotblkcnt; new.br_state = got.br_state; - if (ifp->if_flags & XFS_IFBROOT) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 6747e97a7949..f9a390ecfb1d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -185,6 +185,7 @@ static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); +unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, @@ -199,7 +200,6 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); -int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 976659190d27..1ceba020940e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -183,7 +183,7 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; @@ -260,7 +260,7 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; - cur->bc_ino.ip->i_d.di_nblocks++; + cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -287,7 +287,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); - ip->i_d.di_nblocks--; + ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index f464a7c7cf22..aa8dc9521c39 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -387,7 +387,6 @@ xfs_btree_bload_prep_block( new_size = bbl->iroot_size(cur, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; - ifp->if_flags |= XFS_IFBROOT; /* Initialize it and send it out. */ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e46bc03365db..83ac9771bfb5 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2145,7 +2145,7 @@ xfs_da_grow_inode_int( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; int w = args->whichfork; - xfs_rfsblock_t nblks = dp->i_d.di_nblocks; + xfs_rfsblock_t nblks = dp->i_nblocks; struct xfs_bmbt_irec map, *mapp; int nmap, error, got, i, mapi; @@ -2211,7 +2211,7 @@ xfs_da_grow_inode_int( } /* account for newly allocated blocks in reserved blocks total */ - args->total -= dp->i_d.di_nblocks - nblks; + args->total -= dp->i_nblocks - nblks; out_free_map: if (mapp != &map) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 612a9c5e41b1..050bdcc4fe73 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -179,9 +179,9 @@ xfs_dir_isempty( xfs_dir2_sf_hdr_t *sfp; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - if (dp->i_d.di_size == 0) /* might happen during shutdown. */ + if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; - if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + if (dp->i_disk_size > XFS_IFORK_DSIZE(dp)) return 0; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; return !sfp->count; @@ -584,8 +584,8 @@ xfs_dir2_grow_inode( xfs_fsize_t size; /* directory file (data) size */ size = XFS_FSB_TO_B(mp, bno + count); - if (size > dp->i_d.di_size) { - dp->i_d.di_size = size; + if (size > dp->i_disk_size) { + dp->i_disk_size = size; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } @@ -608,7 +608,7 @@ xfs_dir2_isblock( rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; if (XFS_IS_CORRUPT(args->dp->i_mount, rval != 0 && - args->dp->i_d.di_size != args->geo->blksize)) + args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; *vp = rval; return 0; @@ -687,7 +687,7 @@ xfs_dir2_shrink_inode( /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) + if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -703,7 +703,7 @@ xfs_dir2_shrink_inode( /* * Set the size to the new last block. */ - dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + dp->i_disk_size = XFS_FSB_TO_B(mp, bno); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 5b59d3f7746b..75e1421f69c4 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -961,7 +961,7 @@ xfs_dir2_leaf_to_block( * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > args->geo->blksize) { + while (dp->i_disk_size > args->geo->blksize) { int hdrsz; hdrsz = args->geo->data_entry_offset; @@ -1096,14 +1096,14 @@ xfs_dir2_sf_to_block( trace_xfs_dir2_sf_to_block(args); - ASSERT(ifp->if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_d.di_size); + ASSERT(ifp->if_bytes == dp->i_disk_size); ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); /* @@ -1115,7 +1115,7 @@ xfs_dir2_sf_to_block( xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); - dp->i_d.di_size = 0; + dp->i_disk_size = 0; /* * Add block 0 to the inode. diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 375b3edb2ad2..e67fa086f2c1 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -218,7 +218,7 @@ __xfs_dir3_data_check( */ if (dep->namelen == 0) return __this_address; - if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) + if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) return __this_address; if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) return __this_address; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 95d2a3f92d75..5369d8bb2593 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -113,7 +113,7 @@ xfs_dir3_leaf1_check( } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false); } static inline void @@ -139,7 +139,8 @@ xfs_failaddr_t xfs_dir3_leaf_check_int( struct xfs_mount *mp, struct xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf) + struct xfs_dir2_leaf *leaf, + bool expensive_checking) { struct xfs_da_geometry *geo = mp->m_dir_geo; xfs_dir2_leaf_tail_t *ltp; @@ -151,7 +152,7 @@ xfs_dir3_leaf_check_int( /* * XXX (dgc): This value is not restrictive enough. * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. + * We can deduce a value for that from i_disk_size. */ if (hdr->count > geo->leaf_max_ents) return __this_address; @@ -162,6 +163,9 @@ xfs_dir3_leaf_check_int( (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; + if (!expensive_checking) + return NULL; + /* Check hash value order, count stale entries. */ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { @@ -195,7 +199,7 @@ xfs_dir3_leaf_verify( return fa; xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr); - return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr); + return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } static void diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 5d51265d29d6..d0520afb913a 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -73,7 +73,7 @@ xfs_dir3_leafn_check( } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false); } static inline void @@ -441,7 +441,7 @@ xfs_dir2_leaf_to_node( leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > - (uint)dp->i_d.di_size / args->geo->blksize) { + (uint)dp->i_disk_size / args->geo->blksize) { xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 44c6a77cba05..94943ce49cab 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -127,7 +127,8 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, - struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf, + bool expensive_checks); /* xfs_dir2_node.c */ void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8c4f76bba88b..46d18bf9d5e1 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -344,7 +344,7 @@ xfs_dir2_block_to_sf( ASSERT(dp->i_df.if_bytes == 0); xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; - dp->i_d.di_size = size; + dp->i_disk_size = size; logflags |= XFS_ILOG_DDATA; xfs_dir2_sf_check(args); @@ -367,7 +367,7 @@ xfs_dir2_sf_addname( xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ - int new_isize; /* di_size after adding name */ + int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ @@ -378,12 +378,12 @@ xfs_dir2_sf_addname( ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. */ @@ -401,7 +401,7 @@ xfs_dir2_sf_addname( objchange = 1; } - new_isize = (int)dp->i_d.di_size + incr_isize; + new_isize = (int)dp->i_disk_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). @@ -492,7 +492,7 @@ xfs_dir2_sf_addname_easy( sfp->count++; if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) sfp->i8count++; - dp->i_d.di_size = new_isize; + dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -519,7 +519,7 @@ xfs_dir2_sf_addname_hard( int nbytes; /* temp for byte copies */ xfs_dir2_data_aoff_t new_offset; /* next offset value */ xfs_dir2_data_aoff_t offset; /* current offset value */ - int old_isize; /* previous di_size */ + int old_isize; /* previous size */ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ @@ -529,7 +529,7 @@ xfs_dir2_sf_addname_hard( * Copy the old directory to the stack buffer. */ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - old_isize = (int)dp->i_d.di_size; + old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); @@ -586,7 +586,7 @@ xfs_dir2_sf_addname_hard( memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); - dp->i_d.di_size = new_isize; + dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -697,7 +697,7 @@ xfs_dir2_sf_check( ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); - ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT((char *)sfep - (char *)sfp == dp->i_disk_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); @@ -821,18 +821,16 @@ xfs_dir2_sf_create( dp = args->dp; ASSERT(dp != NULL); - ASSERT(dp->i_d.di_size == 0); + ASSERT(dp->i_disk_size == 0); /* * If it's currently a zero-length extent file, * convert it to local format. */ if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) { - dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); - dp->i_df.if_flags |= XFS_IFINLINE; } - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); @@ -850,7 +848,7 @@ xfs_dir2_sf_create( */ xfs_dir2_sf_put_parent_ino(sfp, pino); sfp->count = 0; - dp->i_d.di_size = size; + dp->i_disk_size = size; xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); return 0; @@ -877,12 +875,12 @@ xfs_dir2_sf_lookup( xfs_dir2_sf_check(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . */ @@ -954,8 +952,8 @@ xfs_dir2_sf_removename( trace_xfs_dir2_sf_removename(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - oldsize = (int)dp->i_d.di_size; + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); @@ -995,7 +993,7 @@ xfs_dir2_sf_removename( * Fix up the header and file size. */ sfp->count--; - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; /* * Reallocate, making it smaller. */ @@ -1053,12 +1051,12 @@ xfs_dir2_sf_replace( trace_xfs_dir2_sf_replace(args); - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * New inode number is large, and need to convert to 8-byte inodes. @@ -1219,7 +1217,7 @@ xfs_dir2_sf_toino4( * Clean up the inode. */ kmem_free(buf); - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1292,6 +1290,6 @@ xfs_dir2_sf_toino8( * Clean up the inode. */ kmem_free(buf); - dp->i_d.di_size = newsize; + dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 6ca9084b6934..a23a52e643ad 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -58,7 +58,8 @@ #define XFS_ERRTAG_BUF_IOERROR 35 #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 -#define XFS_ERRTAG_MAX 38 +#define XFS_ERRTAG_AG_RESV_FAIL 38 +#define XFS_ERRTAG_MAX 39 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -101,5 +102,6 @@ #define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 +#define XFS_RANDOM_AG_RESV_FAIL 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 630388b72dbe..76e2461b9e66 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -955,9 +955,8 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * - * There is a very similar struct icdinode in xfs_inode which matches the - * layout of the first 96 bytes of this structure, but is kept in native - * format instead of big endian. + * There is a very similar struct xfs_log_dinode which matches the layout of + * this structure, but is kept in native format instead of big endian. * * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed * padding field for v3 inodes. diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 6fad140d4c8e..a83bdd0c47a8 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -65,7 +65,7 @@ struct getbmapx { /* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ #define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ -#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Deprecated */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ #define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ @@ -770,8 +770,6 @@ struct xfs_scrub_metadata { /* * ioctl commands that are used by Linux filesystems */ -#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS -#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS #define XFS_IOC_GETVERSION FS_IOC_GETVERSION /* @@ -782,8 +780,6 @@ struct xfs_scrub_metadata { #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 69b228fce81a..eefdb518fe64 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2535,12 +2535,12 @@ xfs_agi_verify( return __this_address; if (be32_to_cpu(agi->agi_level) < 1 || - be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; if (xfs_sb_version_hasfinobt(&mp->m_sb) && (be32_to_cpu(agi->agi_free_level) < 1 || - be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) + be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; /* diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index b4164256993d..773cf4349428 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -8,9 +8,9 @@ #include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_inode.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trace.h" /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4d7410e49db4..5c9a7440d9e4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -124,37 +124,18 @@ const struct xfs_buf_ops xfs_inode_buf_ra_ops = { /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dipp parameter it returns a - * pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and dipp are - * undefined. + * on-disk inode in the bpp parameter. */ int xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - uint buf_flags) + struct xfs_buf **bpp) { - struct xfs_buf *bp; - int error; - - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, + return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); - if (error) { - ASSERT(error != -EAGAIN || (buf_flags & XBF_TRYLOCK)); - return error; - } - - *bpp = bp; - if (dipp) - *dipp = xfs_buf_offset(bp, imap->im_boffset); - return 0; } static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) @@ -192,7 +173,6 @@ xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { - struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); int error; xfs_failaddr_t fa; @@ -212,7 +192,8 @@ xfs_inode_from_disk( * inode. If the inode is unused, mode is zero and we shouldn't mess * with the uninitialized part of it. */ - to->di_flushiter = be16_to_cpu(from->di_flushiter); + if (!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) + ip->i_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); inode->i_mode = be16_to_cpu(from->di_mode); if (!inode->i_mode) @@ -225,10 +206,10 @@ xfs_inode_from_disk( */ if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); - to->di_projid = 0; + ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); - to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | + ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); } @@ -245,20 +226,21 @@ xfs_inode_from_disk( inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); - to->di_size = be64_to_cpu(from->di_size); - to->di_nblocks = be64_to_cpu(from->di_nblocks); - to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_forkoff = from->di_forkoff; - to->di_dmevmask = be32_to_cpu(from->di_dmevmask); - to->di_dmstate = be16_to_cpu(from->di_dmstate); - to->di_flags = be16_to_cpu(from->di_flags); + ip->i_disk_size = be64_to_cpu(from->di_size); + ip->i_nblocks = be64_to_cpu(from->di_nblocks); + ip->i_extsize = be32_to_cpu(from->di_extsize); + ip->i_forkoff = from->di_forkoff; + ip->i_diflags = be16_to_cpu(from->di_flags); + + if (from->di_dmevmask || from->di_dmstate) + xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); - to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_cowextsize = be32_to_cpu(from->di_cowextsize); + ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); + ip->i_diflags2 = be64_to_cpu(from->di_flags2); + ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); } error = xfs_iformat_data_fork(ip, from); @@ -303,7 +285,6 @@ xfs_inode_to_disk( struct xfs_dinode *to, xfs_lsn_t lsn) { - struct xfs_icdinode *from = &ip->i_d; struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -312,8 +293,8 @@ xfs_inode_to_disk( to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); to->di_gid = cpu_to_be32(i_gid_read(inode)); - to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); - to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); + to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); + to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); @@ -323,23 +304,21 @@ xfs_inode_to_disk( to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_size = cpu_to_be64(ip->i_disk_size); + to->di_nblocks = cpu_to_be64(ip->i_nblocks); + to->di_extsize = cpu_to_be32(ip->i_extsize); to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); - to->di_forkoff = from->di_forkoff; + to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); + to->di_flags = cpu_to_be16(ip->i_diflags); if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); + to->di_flags2 = cpu_to_be64(ip->i_diflags2); + to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -347,7 +326,7 @@ xfs_inode_to_disk( to->di_flushiter = 0; } else { to->di_version = 2; - to->di_flushiter = cpu_to_be16(from->di_flushiter); + to->di_flushiter = cpu_to_be16(ip->i_flushiter); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index ef5eaf33d146..7f865bb4df84 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -10,34 +10,6 @@ struct xfs_inode; struct xfs_dinode; /* - * In memory representation of the XFS inode. This is held in the in-core struct - * xfs_inode and represents the current on disk values but the structure is not - * in on-disk format. That is, this structure is always translated to on-disk - * format specific structures at the appropriate time. - */ -struct xfs_icdinode { - uint16_t di_flushiter; /* incremented on flush */ - prid_t di_projid; /* owner's project id */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - uint32_t di_dmevmask; /* DMIG event mask */ - uint16_t di_dmstate; /* DMIG state info */ - uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - - uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ - - struct timespec64 di_crtime; /* time created */ -}; - -static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd) -{ - return icd->di_flags2 & XFS_DIFLAG2_BIGTIME; -} - -/* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. */ @@ -47,9 +19,8 @@ struct xfs_imap { unsigned short im_boffset; /* inode offset in block in bytes */ }; -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint); +int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_imap *imap, struct xfs_buf **bpp); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index e080d7e07643..1d174909f9bd 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -60,8 +60,6 @@ xfs_init_local_fork( } ifp->if_bytes = size; - ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ifp->if_flags |= XFS_IFINLINE; } /* @@ -151,7 +149,6 @@ xfs_iformat_extents( xfs_iext_next(ifp, &icur); } } - ifp->if_flags |= XFS_IFEXTENTS; return 0; } @@ -194,8 +191,8 @@ xfs_iformat_btree( nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - ifp->if_nextents > ip->i_d.di_nblocks) || - level == 0 || level > XFS_BTREE_MAXLEVELS) { + ifp->if_nextents > ip->i_nblocks) || + level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -213,8 +210,6 @@ xfs_iformat_btree( */ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; ifp->if_bytes = 0; ifp->if_u1.if_root = NULL; @@ -242,7 +237,7 @@ xfs_iformat_data_fork( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_size = 0; + ip->i_disk_size = 0; inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); return 0; case S_IFREG: @@ -282,6 +277,19 @@ xfs_dfork_attr_shortform_size( return be16_to_cpu(atp->hdr.totsize); } +struct xfs_ifork * +xfs_ifork_alloc( + enum xfs_dinode_fmt format, + xfs_extnum_t nextents) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); + ifp->if_format = format; + ifp->if_nextents = nextents; + return ifp; +} + int xfs_iformat_attr_fork( struct xfs_inode *ip, @@ -293,11 +301,8 @@ xfs_iformat_attr_fork( * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); - ip->i_afp->if_format = dip->di_aformat; - if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ - ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; - ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents); + ip->i_afp = xfs_ifork_alloc(dip->di_aformat, + be16_to_cpu(dip->di_anextents)); switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_LOCAL: @@ -423,7 +428,6 @@ xfs_iroot_realloc( XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; } /* @@ -512,17 +516,16 @@ xfs_idestroy_fork( ifp->if_broot = NULL; } - /* - * If the format is local, then we can't have an extents array so just - * look for an inline data array. If we're not local then we may or may - * not have an extents list, so check and free it up if we do. - */ - if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { + switch (ifp->if_format) { + case XFS_DINODE_FMT_LOCAL: kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; - } else if (ifp->if_flags & XFS_IFEXTENTS) { + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: if (ifp->if_height) xfs_iext_destroy(ifp); + break; } } @@ -616,8 +619,6 @@ xfs_iflush_fork( break; case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_nextents > 0); @@ -677,7 +678,6 @@ xfs_ifork_init_cow( ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); - ip->i_cowfp->if_flags = XFS_IFEXTENTS; ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 9e2137cd7372..a6f7897b6887 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -22,19 +22,11 @@ struct xfs_ifork { char *if_data; /* inline file data */ } if_u1; short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ int8_t if_format; /* format of this fork */ xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* - * Per-fork incore inode flags. - */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ - -/* * Worst-case increase in the fork extent count when we're adding a single * extent to a fork and there's no possibility of splitting an existing mapping. */ @@ -99,8 +91,8 @@ struct xfs_ifork { * Fork handling. */ -#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) +#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) #define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ @@ -141,6 +133,8 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } +struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, + xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); @@ -236,4 +230,10 @@ int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +/* returns true if the fork has extents but they are not read in yet. */ +static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; +} + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index fe3a49575ff3..483375c6a735 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -997,8 +997,8 @@ xfs_rtfree_extent( */ if (tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8c61a461bf7b..782fdd08f759 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -176,8 +176,12 @@ struct xfs_ino_geometry { unsigned int agino_log; /* #bits for agino in inum */ + /* precomputed default inode attribute fork offset */ + unsigned int attr_fork_offset; + /* precomputed value for di_flags2 */ uint64_t new_diflags2; + }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 90f1d5645052..78324e043e25 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -70,7 +70,7 @@ xfs_trans_ichgtime( if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; if (flags & XFS_ICHGTIME_CREATE) - ip->i_d.di_crtime = tv; + ip->i_crtime = tv; } /* @@ -138,7 +138,7 @@ xfs_trans_log_inode( if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && !xfs_inode_has_bigtime(ip)) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME; + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; flags |= XFS_ILOG_CORE; } @@ -164,8 +164,7 @@ xfs_trans_log_inode( * here. */ spin_unlock(&iip->ili_lock); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, - &bp, 0); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); if (error) { xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); return; diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index b254fbeaaa50..04801362e1a7 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -13,7 +13,7 @@ #include "xfs_mount.h" /* Find the size of the AG, in blocks. */ -xfs_agblock_t +inline xfs_agblock_t xfs_ag_block_count( struct xfs_mount *mp, xfs_agnumber_t agno) @@ -29,7 +29,7 @@ xfs_ag_block_count( * Verify that an AG block number pointer neither points outside the AG * nor points at static metadata. */ -bool +inline bool xfs_verify_agbno( struct xfs_mount *mp, xfs_agnumber_t agno, @@ -49,7 +49,7 @@ xfs_verify_agbno( * Verify that an FS block number pointer neither points outside the * filesystem nor points at static AG metadata. */ -bool +inline bool xfs_verify_fsbno( struct xfs_mount *mp, xfs_fsblock_t fsbno) @@ -85,7 +85,7 @@ xfs_verify_fsbext( } /* Calculate the first and last possible inode number in an AG. */ -void +inline void xfs_agino_range( struct xfs_mount *mp, xfs_agnumber_t agno, @@ -116,7 +116,7 @@ xfs_agino_range( * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ -bool +inline bool xfs_verify_agino( struct xfs_mount *mp, xfs_agnumber_t agno, @@ -146,7 +146,7 @@ xfs_verify_agino_or_null( * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ -bool +inline bool xfs_verify_ino( struct xfs_mount *mp, xfs_ino_t ino) @@ -162,7 +162,7 @@ xfs_verify_ino( } /* Is this an internal inode number? */ -bool +inline bool xfs_internal_inum( struct xfs_mount *mp, xfs_ino_t ino) @@ -190,7 +190,7 @@ xfs_verify_dir_ino( * Verify that an realtime block number pointer doesn't point off the * end of the realtime device. */ -bool +inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) @@ -215,7 +215,7 @@ xfs_verify_rtext( } /* Calculate the range of valid icount values. */ -void +inline void xfs_icount_range( struct xfs_mount *mp, unsigned long long *min, diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ae8e2e0ac64a..749faa17f8e2 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -477,16 +477,13 @@ xchk_agf_xref( { struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; - int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; agbno = XFS_AGF_BLOCK(mp); - error = xchk_ag_btcur_init(sc, &sc->sa); - if (error) - return; + xchk_ag_btcur_init(sc, &sc->sa); xchk_xref_is_used_space(sc, agbno, 1); xchk_agf_xref_freeblks(sc); @@ -508,7 +505,7 @@ xchk_agf( struct xfs_mount *mp = sc->mp; struct xfs_agf *agf; struct xfs_perag *pag; - xfs_agnumber_t agno; + xfs_agnumber_t agno = sc->sm->sm_agno; xfs_agblock_t agbno; xfs_agblock_t eoag; xfs_agblock_t agfl_first; @@ -518,9 +515,7 @@ xchk_agf( int level; int error = 0; - agno = sc->sa.agno = sc->sm->sm_agno; - error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp, - &sc->sa.agf_bp, &sc->sa.agfl_bp); + error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) goto out; xchk_buffer_recheck(sc, sc->sa.agf_bp); @@ -662,16 +657,13 @@ xchk_agfl_xref( { struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; - int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; agbno = XFS_AGFL_BLOCK(mp); - error = xchk_ag_btcur_init(sc, &sc->sa); - if (error) - return; + xchk_ag_btcur_init(sc, &sc->sa); xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); @@ -691,14 +683,12 @@ xchk_agfl( { struct xchk_agfl_info sai; struct xfs_agf *agf; - xfs_agnumber_t agno; + xfs_agnumber_t agno = sc->sm->sm_agno; unsigned int agflcount; unsigned int i; int error; - agno = sc->sa.agno = sc->sm->sm_agno; - error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp, - &sc->sa.agf_bp, &sc->sa.agfl_bp); + error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) goto out; if (!sc->sa.agf_bp) @@ -817,16 +807,13 @@ xchk_agi_xref( { struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; - int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; agbno = XFS_AGI_BLOCK(mp); - error = xchk_ag_btcur_init(sc, &sc->sa); - if (error) - return; + xchk_ag_btcur_init(sc, &sc->sa); xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); @@ -846,7 +833,7 @@ xchk_agi( struct xfs_mount *mp = sc->mp; struct xfs_agi *agi; struct xfs_perag *pag; - xfs_agnumber_t agno; + xfs_agnumber_t agno = sc->sm->sm_agno; xfs_agblock_t agbno; xfs_agblock_t eoag; xfs_agino_t agino; @@ -857,9 +844,7 @@ xchk_agi( int level; int error = 0; - agno = sc->sa.agno = sc->sm->sm_agno; - error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp, - &sc->sa.agf_bp, &sc->sa.agfl_bp); + error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) goto out; xchk_buffer_recheck(sc, sc->sa.agi_bp); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 73d924e47565..2720bd7fe53b 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -21,10 +21,9 @@ */ int xchk_setup_ag_allocbt( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_ag_btree(sc, ip, false); + return xchk_setup_ag_btree(sc, false); } /* Free space btree scrubber. */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 9faddb334a2c..552af0cf8482 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -69,8 +69,7 @@ xchk_setup_xattr_buf( /* Set us up to scrub an inode's extended attributes. */ int xchk_setup_xattr( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { int error; @@ -85,7 +84,7 @@ xchk_setup_xattr( return error; } - return xchk_setup_inode_contents(sc, ip, 0); + return xchk_setup_inode_contents(sc, 0); } /* Extended Attributes */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index f88694f22d05..813b5f219113 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -63,8 +63,8 @@ xbitmap_init( static int xbitmap_range_cmp( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xbitmap_range *ap; struct xbitmap_range *bp; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 33559c3a4bc3..b5ebf1d1b4db 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -26,12 +26,11 @@ /* Set us up with an inode's bmap. */ int xchk_setup_inode_bmap( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { int error; - error = xchk_get_inode(sc, ip); + error = xchk_get_inode(sc); if (error) goto out; @@ -448,12 +447,11 @@ xchk_bmap_btree( int error; /* Load the incore bmap cache if it's not loaded. */ - info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; - if (!info->was_loaded) { - error = xfs_iread_extents(sc->tp, ip, whichfork); - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; - } + info->was_loaded = !xfs_need_iread_extents(ifp); + + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + goto out; /* Check the btree structure. */ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); @@ -675,10 +673,6 @@ xchk_bmap( /* No mappings to check. */ goto out; case XFS_DINODE_FMT_EXTENTS: - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; - } break; case XFS_DINODE_FMT_BTREE: if (whichfork == XFS_COW_FORK) { diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index debf392e0515..a94bd8122c60 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -9,6 +9,7 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -442,6 +443,30 @@ xchk_btree_check_owner( return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); } +/* Decide if we want to check minrecs of a btree block in the inode root. */ +static inline bool +xchk_btree_check_iroot_minrecs( + struct xchk_btree *bs) +{ + /* + * xfs_bmap_add_attrfork_btree had an implementation bug wherein it + * would miscalculate the space required for the data fork bmbt root + * when adding an attr fork, and promote the iroot contents to an + * external block unnecessarily. This went unnoticed for many years + * until scrub found filesystems in this state. Inode rooted btrees are + * not supposed to have immediate child blocks that are small enough + * that the contents could fit in the inode root, but we can't fail + * existing filesystems, so instead we disable the check for data fork + * bmap btrees when there's an attr fork. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + bs->cur->bc_ino.whichfork == XFS_DATA_FORK && + XFS_IFORK_Q(bs->sc->ip)) + return false; + + return true; +} + /* * Check that this btree block has at least minrecs records or is one of the * special blocks that don't require that. @@ -475,8 +500,9 @@ xchk_btree_check_minrecs( root_block = xfs_btree_get_block(cur, root_level, &root_bp); root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level); - if (be16_to_cpu(root_block->bb_numrecs) != 1 || - numrecs <= root_maxrecs) + if (xchk_btree_check_iroot_minrecs(bs) && + (be16_to_cpu(root_block->bb_numrecs) != 1 || + numrecs <= root_maxrecs)) xchk_btree_set_corrupt(bs->sc, cur, level); return; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 53456f3de881..aa874607618a 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -402,22 +402,22 @@ int xchk_ag_read_headers( struct xfs_scrub *sc, xfs_agnumber_t agno, - struct xfs_buf **agi, - struct xfs_buf **agf, - struct xfs_buf **agfl) + struct xchk_ag *sa) { struct xfs_mount *mp = sc->mp; int error; - error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi); + sa->agno = agno; + + error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) goto out; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf); + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) goto out; - error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl); + error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) goto out; error = 0; @@ -452,7 +452,7 @@ xchk_ag_btcur_free( } /* Initialize all the btree cursors for an AG. */ -int +void xchk_ag_btcur_init( struct xfs_scrub *sc, struct xchk_ag *sa) @@ -502,8 +502,6 @@ xchk_ag_btcur_init( sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); } - - return 0; } /* Release the AG header context and btree cursors. */ @@ -547,13 +545,12 @@ xchk_ag_init( { int error; - sa->agno = agno; - error = xchk_ag_read_headers(sc, agno, &sa->agi_bp, - &sa->agf_bp, &sa->agfl_bp); + error = xchk_ag_read_headers(sc, agno, sa); if (error) return error; - return xchk_ag_btcur_init(sc, sa); + xchk_ag_btcur_init(sc, sa); + return 0; } /* @@ -596,8 +593,7 @@ xchk_trans_alloc( /* Set us up with a transaction and an empty context. */ int xchk_setup_fs( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { uint resblks; @@ -609,7 +605,6 @@ xchk_setup_fs( int xchk_setup_ag_btree( struct xfs_scrub *sc, - struct xfs_inode *ip, bool force_log) { struct xfs_mount *mp = sc->mp; @@ -627,7 +622,7 @@ xchk_setup_ag_btree( return error; } - error = xchk_setup_fs(sc, ip); + error = xchk_setup_fs(sc); if (error) return error; @@ -655,11 +650,11 @@ xchk_checkpoint_log( */ int xchk_get_inode( - struct xfs_scrub *sc, - struct xfs_inode *ip_in) + struct xfs_scrub *sc) { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; int error; @@ -720,12 +715,11 @@ xchk_get_inode( int xchk_setup_inode_contents( struct xfs_scrub *sc, - struct xfs_inode *ip, unsigned int resblks) { int error; - error = xchk_get_inode(sc, ip); + error = xchk_get_inode(sc); if (error) return error; @@ -821,7 +815,7 @@ xchk_metadata_inode_forks( return 0; /* Metadata inodes don't live on the rt device. */ - if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); return 0; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 2e50d146105d..0410faf7d735 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -72,66 +72,52 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); /* Setup functions */ -int xchk_setup_fs(struct xfs_scrub *sc, struct xfs_inode *ip); -int xchk_setup_ag_allocbt(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_ag_iallocbt(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_ag_rmapbt(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_ag_refcountbt(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_inode(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_inode_bmap(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_inode_bmap_data(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_directory(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_xattr(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_symlink(struct xfs_scrub *sc, - struct xfs_inode *ip); -int xchk_setup_parent(struct xfs_scrub *sc, - struct xfs_inode *ip); +int xchk_setup_fs(struct xfs_scrub *sc); +int xchk_setup_ag_allocbt(struct xfs_scrub *sc); +int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); +int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); +int xchk_setup_ag_refcountbt(struct xfs_scrub *sc); +int xchk_setup_inode(struct xfs_scrub *sc); +int xchk_setup_inode_bmap(struct xfs_scrub *sc); +int xchk_setup_inode_bmap_data(struct xfs_scrub *sc); +int xchk_setup_directory(struct xfs_scrub *sc); +int xchk_setup_xattr(struct xfs_scrub *sc); +int xchk_setup_symlink(struct xfs_scrub *sc); +int xchk_setup_parent(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT -int xchk_setup_rt(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_setup_rt(struct xfs_scrub *sc); #else static inline int -xchk_setup_rt(struct xfs_scrub *sc, struct xfs_inode *ip) +xchk_setup_rt(struct xfs_scrub *sc) { return -ENOENT; } #endif #ifdef CONFIG_XFS_QUOTA -int xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int -xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip) +xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; } #endif -int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_setup_fscounters(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_perag_get(struct xfs_mount *mp, struct xchk_ag *sa); int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, - struct xfs_buf **agi, struct xfs_buf **agf, - struct xfs_buf **agfl); + struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); -int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +void xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); -int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip, - bool force_log); -int xchk_get_inode(struct xfs_scrub *sc, struct xfs_inode *ip_in); -int xchk_setup_inode_contents(struct xfs_scrub *sc, struct xfs_inode *ip, - unsigned int resblks); +int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); +int xchk_get_inode(struct xfs_scrub *sc); +int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); /* diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 178b3455a170..28dda391d5df 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -22,10 +22,9 @@ /* Set us up to scrub directories. */ int xchk_setup_directory( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_inode_contents(sc, ip, 0); + return xchk_setup_inode_contents(sc, 0); } /* Directories */ @@ -538,7 +537,7 @@ xchk_directory_leaf1_bestfree( * There should be as many bestfree slots as there are dir data * blocks that can fit under i_size. */ - if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) { + if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } @@ -694,15 +693,6 @@ xchk_directory_blocks( /* Iterate all the data extents in the directory... */ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { - /* Block directories only have a single block at offset 0. */ - if (is_block && - (got.br_startoff > 0 || - got.br_blockcount != args.geo->fsbcount)) { - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, - got.br_startoff); - break; - } - /* No more data blocks... */ if (got.br_startoff >= leaf_lblk) break; @@ -817,7 +807,7 @@ xchk_directory( return -ENOENT; /* Plausible size? */ - if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) { + if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); goto out; } @@ -843,7 +833,7 @@ xchk_directory( * Userspace usually asks for a 32k buffer, so we will too. */ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - sc->ip->i_d.di_size); + sc->ip->i_disk_size); /* * Look up every name in this directory by hash. diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ec2064ed3c30..7b4386c78fbf 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -116,8 +116,7 @@ next_loop_perag: int xchk_setup_fscounters( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { struct xchk_fscounters *fsc; int error; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 83d27cdf579b..3de59b5c2ce6 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -133,7 +133,8 @@ xchk_update_health( if (!sc->sick_mask) return; - bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT); + bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)); switch (type_to_health_flag[sc->sm->sm_type].group) { case XHG_AG: pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 6517d67e8d51..8d9f3fb0cd22 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -29,10 +29,9 @@ */ int xchk_setup_ag_iallocbt( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER); + return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); } /* Inode btree scrubber. */ @@ -212,7 +211,6 @@ xchk_iallocbt_check_cluster( { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_dinode *dip; struct xfs_buf *cluster_bp; unsigned int nr_inodes; xfs_agnumber_t agno = bs->cur->bc_ag.agno; @@ -278,7 +276,7 @@ xchk_iallocbt_check_cluster( &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, 0); + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &cluster_bp); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) return error; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index faf65eb5bd31..61f90b2c9430 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -28,8 +28,7 @@ */ int xchk_setup_inode( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { int error; @@ -37,7 +36,7 @@ xchk_setup_inode( * Try to get the inode. If the verifiers fail, we try again * in raw mode. */ - error = xchk_get_inode(sc, ip); + error = xchk_get_inode(sc); switch (error) { case 0: break; diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 66c35f6dfc24..ab182a5cd0c0 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -20,10 +20,9 @@ /* Set us up to scrub parents. */ int xchk_setup_parent( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_inode_contents(sc, ip, 0); + return xchk_setup_inode_contents(sc, 0); } /* Parent pointers */ @@ -102,7 +101,7 @@ xchk_parent_count_parent_dentries( * scanned. */ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - parent->i_d.di_size); + parent->i_disk_size); oldpos = 0; while (true) { error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e34ca20ae8e4..acbb9839d42f 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -37,8 +37,7 @@ xchk_quota_to_dqtype( /* Set us up to scrub a quota. */ int xchk_setup_quota( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { xfs_dqtype_t dqtype; int error; @@ -53,7 +52,7 @@ xchk_setup_quota( mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; - error = xchk_setup_fs(sc, ip); + error = xchk_setup_fs(sc); if (error) return error; sc->ip = xfs_quota_inode(sc->mp, dqtype); @@ -85,7 +84,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return error; + return -ECANCELED; /* * Except for the root dquot, the actual dquot we got must either have @@ -162,7 +161,7 @@ xchk_quota_item( out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return -EFSCORRUPTED; + return -ECANCELED; return 0; } @@ -238,6 +237,8 @@ xchk_quota( error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); sc->ilock_flags = XFS_ILOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); + if (error == -ECANCELED) + error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, sqi.last_id * qi->qi_dqperchunk, &error)) goto out; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index dd672e6bbc75..744530a66c0c 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -19,10 +19,9 @@ */ int xchk_setup_ag_refcountbt( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_ag_btree(sc, ip, false); + return xchk_setup_ag_btree(sc, false); } /* Reference count btree scrubber. */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 25e86c71e7b9..c2857d854c83 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -37,19 +37,18 @@ */ int xrep_attempt( - struct xfs_inode *ip, struct xfs_scrub *sc) { int error = 0; - trace_xrep_attempt(ip, sc->sm, error); + trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); xchk_ag_btcur_free(&sc->sa); /* Repair whatever's broken. */ ASSERT(sc->ops->repair); error = sc->ops->repair(sc); - trace_xrep_done(ip, sc->sm, error); + trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error); switch (error) { case 0: /* @@ -207,7 +206,11 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); - if (!error) { + if (error) { + aglen = xfs_ag_block_count(mp, sm->sm_agno); + freelen = aglen; + usedlen = aglen; + } else { struct xfs_agf *agf = bp->b_addr; aglen = be32_to_cpu(agf->agf_length); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index fe77de01abe0..3bb152d52a07 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -17,7 +17,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ -int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc); +int xrep_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, @@ -64,8 +64,8 @@ int xrep_agi(struct xfs_scrub *sc); #else -static inline int xrep_attempt( - struct xfs_inode *ip, +static inline int +xrep_attempt( struct xfs_scrub *sc) { return -EOPNOTSUPP; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index f4fcb4719f41..a4f17477c5d1 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -21,10 +21,9 @@ */ int xchk_setup_ag_rmapbt( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { - return xchk_setup_ag_btree(sc, ip, false); + return xchk_setup_ag_btree(sc, false); } /* Reverse-mapping scrubber. */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index d409ca592178..37c0e2266c85 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -20,12 +20,11 @@ /* Set us up with the realtime metadata locked. */ int xchk_setup_rt( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { int error; - error = xchk_setup_fs(sc, ip); + error = xchk_setup_fs(sc); if (error) return error; @@ -100,7 +99,7 @@ xchk_rtbitmap( int error; /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_d.di_size != + if (sc->mp->m_rbmip->i_disk_size != XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); return 0; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8ebf35b115ce..0e542636227c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,9 +149,10 @@ xchk_probe( STATIC int xchk_teardown( struct xfs_scrub *sc, - struct xfs_inode *ip_in, int error) { + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + xchk_ag_free(sc, &sc->sa); if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) @@ -168,7 +169,8 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } - sb_end_write(sc->mp->m_super); + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mnt_drop_write_file(sc->file); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -456,23 +458,25 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) /* Dispatch metadata scrubbing. */ int xfs_scrub_metadata( - struct xfs_inode *ip, + struct file *file, struct xfs_scrub_metadata *sm) { struct xfs_scrub sc = { - .mp = ip->i_mount, + .file = file, .sm = sm, .sa = { .agno = NULLAGNUMBER, }, }; - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; int error = 0; + sc.mp = mp; + BUILD_BUG_ON(sizeof(meta_scrub_ops) != (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR)); - trace_xchk_start(ip, sm, error); + trace_xchk_start(XFS_I(file_inode(file)), sm, error); /* Forbidden if we are shut down or mounted norecovery. */ error = -ESHUTDOWN; @@ -492,15 +496,17 @@ xfs_scrub_metadata( sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* - * If freeze runs concurrently with a scrub, the freeze can be delayed - * indefinitely as we walk the filesystem and iterate over metadata - * buffers. Freeze quiesces the log (which waits for the buffer LRU to - * be emptied) and that won't happen while checking is running. + * When repairs are allowed, prevent freezing or readonly remount while + * scrub is running with a real transaction. */ - sb_start_write(mp->m_super); + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = mnt_want_write_file(sc.file); + if (error) + goto out; + } /* Set up for the operation. */ - error = sc.ops->setup(&sc, ip); + error = sc.ops->setup(&sc); if (error) goto out_teardown; @@ -512,12 +518,12 @@ retry_op: * Tear down everything we hold, then set up again with * preparation for worst-case scenarios. */ - error = xchk_teardown(&sc, ip, 0); + error = xchk_teardown(&sc, 0); if (error) goto out; sc.flags |= XCHK_TRY_HARDER; goto retry_op; - } else if (error) + } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; xchk_update_health(&sc); @@ -546,14 +552,14 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(ip, &sc); + error = xrep_attempt(&sc); if (error == -EAGAIN) { /* * Either the repair function succeeded or it couldn't * get all the resources it needs; either way, we go * back to the beginning and call the scrub function. */ - error = xchk_teardown(&sc, ip, 0); + error = xchk_teardown(&sc, 0); if (error) { xrep_failure(mp); goto out; @@ -565,9 +571,9 @@ retry_op: out_nofix: xchk_postmortem(&sc); out_teardown: - error = xchk_teardown(&sc, ip, error); + error = xchk_teardown(&sc, error); out: - trace_xchk_done(ip, sm, error); + trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; error = 0; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index ad1ceb44a628..08a483cb46e2 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -18,8 +18,7 @@ enum xchk_type { struct xchk_meta_ops { /* Acquire whatever resources are needed for the operation. */ - int (*setup)(struct xfs_scrub *, - struct xfs_inode *); + int (*setup)(struct xfs_scrub *sc); /* Examine metadata for errors. */ int (*scrub)(struct xfs_scrub *); @@ -59,7 +58,18 @@ struct xfs_scrub { struct xfs_scrub_metadata *sm; const struct xchk_meta_ops *ops; struct xfs_trans *tp; + + /* File that scrub was called with. */ + struct file *file; + + /* + * File that is undergoing the scrub operation. This can differ from + * the file that scrub was called with if we're checking file-based fs + * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for + * something that can't be opened directly (e.g. symlinks). + */ struct xfs_inode *ip; + void *buf; uint ilock_flags; diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index c08be5ede066..599ee277bba2 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -18,15 +18,14 @@ /* Set us up to scrub a symbolic link. */ int xchk_setup_symlink( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. */ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); if (!sc->buf) return -ENOMEM; - return xchk_setup_inode_contents(sc, ip, 0); + return xchk_setup_inode_contents(sc, 0); } /* Symbolic links. */ @@ -43,7 +42,7 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - len = ip->i_d.di_size; + len = ip->i_disk_size; /* Plausible size? */ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { @@ -52,7 +51,7 @@ xchk_symlink( } /* Inline symlink? */ - if (ifp->if_flags & XFS_IFINLINE) { + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > XFS_IFORK_DSIZE(ip) || len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index 2897ba3a17e6..2ceae614ade8 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -7,9 +7,9 @@ #define __XFS_SCRUB_H__ #ifndef CONFIG_XFS_ONLINE_SCRUB -# define xfs_scrub_metadata(ip, sm) (-ENOTTY) +# define xfs_scrub_metadata(file, sm) (-ENOTTY) #else -int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm); +int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); #endif /* CONFIG_XFS_ONLINE_SCRUB */ #endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b4186d666157..9b08db45ce85 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,47 +36,26 @@ XFS_WPC(struct iomap_writepage_ctx *ctx) static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) { return ioend->io_offset + ioend->io_size > - XFS_I(ioend->io_inode)->i_d.di_size; -} - -STATIC int -xfs_setfilesize_trans_alloc( - struct iomap_ioend *ioend) -{ - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); - if (error) - return error; - - ioend->io_private = tp; - - /* - * We may pass freeze protection with a transaction. So tell lockdep - * we released it. - */ - __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); - /* - * We hand off the transaction to the completion thread now, so - * clear the flag here. - */ - xfs_trans_clear_context(tp); - return 0; + XFS_I(ioend->io_inode)->i_disk_size; } /* * Update on-disk file size now that data has been written to disk. */ -STATIC int -__xfs_setfilesize( +int +xfs_setfilesize( struct xfs_inode *ip, - struct xfs_trans *tp, xfs_off_t offset, size_t size) { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; xfs_fsize_t isize; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) + return error; xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, offset + size); @@ -88,55 +67,13 @@ __xfs_setfilesize( trace_xfs_setfilesize(ip, offset, size); - ip->i_d.di_size = isize; + ip->i_disk_size = isize; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return xfs_trans_commit(tp); } -int -xfs_setfilesize( - struct xfs_inode *ip, - xfs_off_t offset, - size_t size) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); - if (error) - return error; - - return __xfs_setfilesize(ip, tp, offset, size); -} - -STATIC int -xfs_setfilesize_ioend( - struct iomap_ioend *ioend, - int error) -{ - struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_private; - - /* - * The transaction may have been allocated in the I/O submission thread, - * thus we need to mark ourselves as being in a transaction manually. - * Similarly for freeze protection. - */ - xfs_trans_set_context(tp); - __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); - - /* we abort the update if there was an IO error */ - if (error) { - xfs_trans_cancel(tp); - return error; - } - - return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); -} - /* * IO write completion. */ @@ -158,7 +95,7 @@ xfs_end_ioend( nofs_flag = memalloc_nofs_save(); /* - * Just clean up the in-memory strutures if the fs has been shut down. + * Just clean up the in-memory structures if the fs has been shut down. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = -EIO; @@ -182,35 +119,14 @@ xfs_end_ioend( error = xfs_reflink_end_cow(ip, offset, size); else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private); + if (!error && xfs_ioend_is_append(ioend)) + error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); done: - if (ioend->io_private) - error = xfs_setfilesize_ioend(ioend, error); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } -/* - * If the to be merged ioend has a preallocated transaction for file - * size updates we need to ensure the ioend it is merged into also - * has one. If it already has one we can simply cancel the transaction - * as it is guaranteed to be clean. - */ -static void -xfs_ioend_merge_private( - struct iomap_ioend *ioend, - struct iomap_ioend *next) -{ - if (!ioend->io_private) { - ioend->io_private = next->io_private; - next->io_private = NULL; - } else { - xfs_setfilesize_ioend(next, -ECANCELED); - } -} - /* Finish all pending io completions. */ void xfs_end_io( @@ -230,18 +146,11 @@ xfs_end_io( while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, io_list))) { list_del_init(&ioend->io_list); - iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); + iomap_ioend_try_merge(ioend, &tmp, NULL); xfs_end_ioend(ioend); } } -static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) -{ - return ioend->io_private || - ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED); -} - STATIC void xfs_end_bio( struct bio *bio) @@ -250,8 +159,6 @@ xfs_end_bio( struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; - ASSERT(xfs_ioend_needs_workqueue(ioend)); - spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, @@ -384,8 +291,7 @@ retry: cow_fsb = NULLFILEOFF; whichfork = XFS_DATA_FORK; xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || - (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(!xfs_need_iread_extents(&ip->i_df)); /* * Check if this is offset is covered by a COW extents, and if yes use @@ -501,17 +407,11 @@ xfs_prepare_ioend( ioend->io_offset, ioend->io_size); } - /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && - ((ioend->io_flags & IOMAP_F_SHARED) || - ioend->io_type != IOMAP_UNWRITTEN) && - xfs_ioend_is_append(ioend) && - !ioend->io_private) - status = xfs_setfilesize_trans_alloc(ioend); - memalloc_nofs_restore(nofs_flag); - if (xfs_ioend_needs_workqueue(ioend)) + /* send ioends that might require a transaction to the completion wq */ + if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || + (ioend->io_flags & IOMAP_F_SHARED)) ioend->io_bio->bi_end_io = xfs_end_bio; return status; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 8f8837fe21cf..25dcc98d50e6 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -514,7 +514,7 @@ xfs_attr_list_ilocked( return 0; if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 2344757ede63..e3a691937e92 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -265,8 +265,8 @@ xfs_trans_log_finish_bmap_update( static int xfs_bmap_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_bmap_intent *ba; struct xfs_bmap_intent *bb; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e7d68318e6a5..a5e9d7d34023 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -154,7 +154,7 @@ xfs_bmap_rtalloc( ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; ap->length = ralen; - ap->ip->i_d.di_nblocks += ralen; + ap->ip->i_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) ap->ip->i_delayed_blks -= ralen; @@ -225,11 +225,9 @@ xfs_bmap_count_blocks( switch (ifp->if_format) { case XFS_DINODE_FMT_BTREE: - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); error = xfs_btree_count_blocks(cur, &btblocks); @@ -423,7 +421,7 @@ xfs_getbmap( break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && - (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; @@ -439,7 +437,7 @@ xfs_getbmap( } if (xfs_get_extsz_hint(ip) || - (ip->i_d.di_flags & + (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))) max_len = mp->m_super->s_maxbytes; else @@ -471,11 +469,9 @@ xfs_getbmap( first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset); len = XFS_BB_TO_FSB(mp, bmv->bmv_length); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, whichfork); - if (error) - goto out_unlock_ilock; - } + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + goto out_unlock_ilock; if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* @@ -558,7 +554,7 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(!xfs_need_iread_extents(ifp)); xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) @@ -597,8 +593,24 @@ out_unlock: * regular files that are marked preallocated or append-only. */ bool -xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +xfs_can_free_eofblocks( + struct xfs_inode *ip, + bool force) { + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + int nimaps = 1; + int error; + + /* + * Caller must either hold the exclusive io lock; or be inactivating + * the inode, which guarantees there are no other users of the inode. + */ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) || + (VFS_I(ip)->i_state & I_FREEING)); + /* prealloc/delalloc exists only on regular files */ if (!S_ISREG(VFS_I(ip)->i_mode)) return false; @@ -613,18 +625,43 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) return false; /* If we haven't read in the extent list, then don't do it now. */ - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + if (xfs_need_iread_extents(&ip->i_df)) return false; /* * Do not free real preallocated or append-only files unless the file * has delalloc blocks and we are forced to remove them. */ - if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) if (!force || ip->i_delayed_blks == 0) return false; - return true; + /* + * Do not try to free post-EOF blocks if EOF is beyond the end of the + * range supported by the page cache, because the truncation will loop + * forever. + */ + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (last_fsb <= end_fsb) + return false; + + /* + * Look up the mapping for the first block past EOF. If we can't find + * it, there's nothing to free. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps, + 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error || nimaps == 0) + return false; + + /* + * If there's a real mapping there or there are delayed allocation + * reservations, then we have post-EOF blocks to try to free. + */ + return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks; } /* @@ -637,78 +674,52 @@ xfs_free_eofblocks( struct xfs_inode *ip) { struct xfs_trans *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; + int error; - /* - * Figure out if there are any blocks beyond the end - * of the file. If not, then there is nothing to do. - */ - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (last_fsb <= end_fsb) - return 0; - map_len = last_fsb - end_fsb; + /* Attach the dquots to the inode up front. */ + error = xfs_qm_dqattach(ip); + if (error) + return error; - nimaps = 1; - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* Wait on dio to ensure i_size has settled. */ + inode_dio_wait(VFS_I(ip)); - /* - * If there are blocks after the end of file, truncate the file to its - * current size to free them up. - */ - if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK || - ip->i_delayed_blks)) { - /* - * Attach the dquots to the inode up front. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return error; + } - /* wait on dio to ensure i_size has settled */ - inode_dio_wait(VFS_I(ip)); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, - &tp); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return error; - } + /* + * Do not update the on-disk file size. If we update the on-disk file + * size and then the system crashes before the contents of the file are + * flushed to disk then the files may be full of holes (ie NULL files + * bug). + */ + error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); + if (error) + goto err_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; - /* - * Do not update the on-disk file size. If we update the - * on-disk file size and then the system crashes before the - * contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); - if (error) { - /* - * If we get an error at this point we simply don't - * bother truncating the file. - */ - xfs_trans_cancel(tp); - } else { - error = xfs_trans_commit(tp); - if (!error) - xfs_inode_clear_eofblocks_tag(ip); - } + xfs_inode_clear_eofblocks_tag(ip); + goto out_unlock; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } +err_cancel: + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1208,7 +1219,7 @@ xfs_swap_extents_check_format( if (XFS_IS_QUOTA_ON(ip->i_mount) && (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) || !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) || - ip->i_d.di_projid != tip->i_d.di_projid)) + ip->i_projid != tip->i_projid)) return -EINVAL; /* Should never get a local format */ @@ -1323,9 +1334,9 @@ xfs_swap_extent_rmap( * rmap functions when we go to fix up the rmaps. The flags * will be switch for reals later. */ - tip_flags2 = tip->i_d.di_flags2; - if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) - tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + tip_flags2 = tip->i_diflags2; + if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK) + tip->i_diflags2 |= XFS_DIFLAG2_REFLINK; offset_fsb = 0; end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); @@ -1412,12 +1423,12 @@ xfs_swap_extent_rmap( offset_fsb += ilen; } - tip->i_d.di_flags2 = tip_flags2; + tip->i_diflags2 = tip_flags2; return 0; out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); - tip->i_d.di_flags2 = tip_flags2; + tip->i_diflags2 = tip_flags2; return error; } @@ -1476,9 +1487,9 @@ xfs_swap_extent_forks( /* * Fix the on-disk inode values */ - tmp = (uint64_t)ip->i_d.di_nblocks; - ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; - tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + tmp = (uint64_t)ip->i_nblocks; + ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks; + tip->i_nblocks = tmp + taforkblks - aforkblks; /* * The extents in the source inode could still contain speculative @@ -1663,8 +1674,8 @@ xfs_swap_extents( /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { + sxp->sx_length != ip->i_disk_size || + sxp->sx_length != tip->i_disk_size) { error = -EFAULT; goto out_trans_cancel; } @@ -1715,13 +1726,13 @@ xfs_swap_extents( goto out_trans_cancel; /* Do we have to swap reflink flags? */ - if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ - (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { - f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; - tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^ + (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) { + f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK; + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK; + tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK; } /* Swap the cow forks. */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 37a1d12762d8..592800c8852f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2124,9 +2124,9 @@ xfs_buf_delwri_queue( */ static int xfs_buf_cmp( - void *priv, - struct list_head *a, - struct list_head *b) + void *priv, + const struct list_head *a, + const struct list_head *b) { struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index dc0be2a639cc..fb69879e4b2b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -55,6 +55,24 @@ xfs_buf_log_format_size( (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int first_bit, + int nbits) +{ + void *first, *last; + + first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); + last = xfs_buf_offset(bp, + offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); + + if (last - first != nbits * XFS_BLF_CHUNK) + return true; + return false; +} + /* * This returns the number of log iovecs needed to log the * given buf log item. @@ -69,24 +87,56 @@ STATIC void xfs_buf_item_size_segment( struct xfs_buf_log_item *bip, struct xfs_buf_log_format *blfp, + uint offset, int *nvecs, int *nbytes) { struct xfs_buf *bp = bip->bli_buf; + int first_bit; + int nbits; int next_bit; int last_bit; - last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); - if (last_bit == -1) + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (first_bit == -1) return; - /* - * initial count for a dirty buffer is 2 vectors - the format structure - * and the first dirty region. - */ - *nvecs += 2; - *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; + (*nvecs)++; + *nbytes += xfs_buf_log_format_size(blfp); + + do { + nbits = xfs_contig_bits(blfp->blf_data_map, + blfp->blf_map_size, first_bit); + ASSERT(nbits > 0); + /* + * Straddling a page is rare because we don't log contiguous + * chunks of unmapped buffers anywhere. + */ + if (nbits > 1 && + xfs_buf_item_straddle(bp, offset, first_bit, nbits)) + goto slow_scan; + + (*nvecs)++; + *nbytes += nbits * XFS_BLF_CHUNK; + + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)first_bit + nbits + 1); + } while (first_bit != -1); + + return; + +slow_scan: + /* Count the first bit we jumped out of the above loop from */ + (*nvecs)++; + *nbytes += XFS_BLF_CHUNK; + last_bit = first_bit; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -103,16 +153,15 @@ xfs_buf_item_size_segment( */ if (next_bit == -1) { break; - } else if (next_bit != last_bit + 1) { - last_bit = next_bit; - (*nvecs)++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + - XFS_BLF_CHUNK)) { + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { last_bit = next_bit; + first_bit = next_bit; (*nvecs)++; + nbits = 1; } else { last_bit++; + nbits++; } *nbytes += XFS_BLF_CHUNK; } @@ -142,7 +191,10 @@ xfs_buf_item_size( int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; int i; + int bytes; + uint offset = 0; ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { @@ -174,7 +226,7 @@ xfs_buf_item_size( } /* - * the vector count is based on the number of buffer vectors we have + * The vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a * compound buffer with more than one segment dirty. Hence for compound * buffers we need to track which segment the dirty bits correspond to, @@ -182,10 +234,19 @@ xfs_buf_item_size( * count for the extra buf log format structure that will need to be * written. */ + bytes = 0; for (i = 0; i < bip->bli_format_count; i++) { - xfs_buf_item_size_segment(bip, &bip->bli_formats[i], - nvecs, nbytes); + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset, + nvecs, &bytes); + offset += BBTOB(bp->b_maps[i].bm_len); } + + /* + * Round up the buffer size required to minimise the number of memory + * allocations that need to be done as this item grows when relogged by + * repeated modifications. + */ + *nbytes = round_up(bytes, 512); trace_xfs_buf_item_size(bip); } @@ -204,18 +265,6 @@ xfs_buf_item_copy_iovec( nbits * XFS_BLF_CHUNK); } -static inline bool -xfs_buf_item_straddle( - struct xfs_buf *bp, - uint offset, - int next_bit, - int last_bit) -{ - return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK); -} - static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, @@ -268,6 +317,38 @@ xfs_buf_item_format_segment( /* * Fill in an iovec for each set of contiguous chunks. */ + do { + ASSERT(first_bit >= 0); + nbits = xfs_contig_bits(blfp->blf_data_map, + blfp->blf_map_size, first_bit); + ASSERT(nbits > 0); + + /* + * Straddling a page is rare because we don't log contiguous + * chunks of unmapped buffers anywhere. + */ + if (nbits > 1 && + xfs_buf_item_straddle(bp, offset, first_bit, nbits)) + goto slow_scan; + + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; + + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)first_bit + nbits + 1); + } while (first_bit != -1); + + return; + +slow_scan: + ASSERT(bp->b_addr == NULL); last_bit = first_bit; nbits = 1; for (;;) { @@ -292,7 +373,7 @@ xfs_buf_item_format_segment( blfp->blf_size++; break; } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 66deddd5e296..da1cc683560c 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -57,8 +57,8 @@ xfs_dir2_sf_getdents( xfs_ino_t ino; struct xfs_da_geometry *geo = args->geo; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -258,11 +258,9 @@ xfs_dir2_leaf_readbuf( int ra_want; int error = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); - if (error) - goto out; - } + error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); + if (error) + goto out; /* * Look for mapped directory blocks at or above the current offset. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bd8379b98374..ecd5059d6928 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -748,11 +748,9 @@ xfs_dq_get_next_id( start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; lock_flags = xfs_ilock_data_map_shared(quotip); - if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); - if (error) - return error; - } + error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); + if (error) + return error; if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &cur, &got)) { /* contiguous chunk, bump startoff for the id calculation */ @@ -953,7 +951,7 @@ xfs_qm_id_for_quotatype( case XFS_DQTYPE_GROUP: return i_gid_read(VFS_I(ip)); case XFS_DQTYPE_PROJ: - return ip->i_d.di_projid; + return ip->i_projid; } ASSERT(0); return 0; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 185b4915b7bf..ce3bc1b291a1 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -56,6 +56,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_IOERROR, XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, + XFS_RANDOM_AG_RESV_FAIL, }; struct xfs_errortag_attr { @@ -168,6 +169,7 @@ XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); +XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -208,6 +210,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_ioerror), XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), + XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), NULL, }; @@ -299,6 +302,8 @@ xfs_errortag_add( struct xfs_mount *mp, unsigned int error_tag) { + BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); + if (error_tag >= XFS_ERRTAG_MAX) return -EINVAL; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ef17c1f6db32..a4075685d9eb 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -629,8 +629,8 @@ xfs_extent_busy_wait_all( int xfs_extent_busy_ag_cmp( void *priv, - struct list_head *l1, - struct list_head *l2) + const struct list_head *l1, + const struct list_head *l2) { struct xfs_extent_busy *b1 = container_of(l1, struct xfs_extent_busy, list); diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 990ab3891971..8aea07100092 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -58,7 +58,8 @@ void xfs_extent_busy_wait_all(struct xfs_mount *mp); int -xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); +xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a, + const struct list_head *b); static inline void xfs_extent_busy_sort(struct list_head *list) { diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 93223ebb3372..2424230ca2c3 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -397,8 +397,8 @@ xfs_trans_free_extent( static int xfs_extent_free_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_mount *mp = priv; struct xfs_extent_free_item *ra; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a007ca0711d9..396ef36dcd0a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -90,9 +90,9 @@ xfs_update_prealloc_flags( } if (flags & XFS_PREALLOC_SET) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + ip->i_diflags |= XFS_DIFLAG_PREALLOC; if (flags & XFS_PREALLOC_CLEAR) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) @@ -1159,10 +1159,10 @@ xfs_file_remap_range( */ cowextsize = 0; if (pos_in == 0 && len == i_size_read(inode_in) && - (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && pos_out == 0 && len >= i_size_read(inode_out) && - !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - cowextsize = src->i_d.di_cowextsize; + !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_cowextsize; ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); @@ -1244,7 +1244,7 @@ xfs_file_readdir( * point we can change the ->readdir prototype to include the * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size); return xfs_readdir(NULL, ip, ctx, bufsize); } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 5cc7665e93c9..3af963743e4d 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -22,7 +22,7 @@ xfs_inode_is_filestream( struct xfs_inode *ip) { return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || - (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); + (ip->i_diflags & XFS_DIFLAG_FILESTREAM); } #endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 9ce5e7d5bf8f..34f2b971ce43 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -904,14 +904,6 @@ xfs_getfsmap( info.fsmap_recs = fsmap_recs; info.head = head; - /* - * If fsmap runs concurrently with a scrub, the freeze can be delayed - * indefinitely as we walk the rmapbt and iterate over metadata - * buffers. Freeze quiesces the log (which waits for the buffer LRU to - * be emptied) and that won't happen while we're reading buffers. - */ - sb_start_write(mp->m_super); - /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ @@ -934,6 +926,11 @@ xfs_getfsmap( if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); + /* + * Grab an empty transaction so that we can use its recursive + * buffer locking abilities to detect cycles in the rmapbt + * without deadlocking. + */ error = xfs_trans_alloc_empty(mp, &tp); if (error) break; @@ -951,7 +948,6 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - sb_end_write(mp->m_super); head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a2a407039227..b33c894b6cf3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -21,6 +21,64 @@ #include "xfs_ag_resv.h" /* + * Write new AG headers to disk. Non-transactional, but need to be + * written and completed prior to the growfs transaction being logged. + * To do this, we use a delayed write buffer list and wait for + * submission and IO completion of the list as a whole. This allows the + * IO subsystem to merge all the AG headers in a single AG into a single + * IO and hide most of the latency of the IO from us. + * + * This also means that if we get an error whilst building the buffer + * list to write, we can cancel the entire list without having written + * anything. + */ +static int +xfs_resizefs_init_new_ags( + struct xfs_trans *tp, + struct aghdr_init_data *id, + xfs_agnumber_t oagcount, + xfs_agnumber_t nagcount, + xfs_rfsblock_t delta, + bool *lastag_extended) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta; + int error; + + *lastag_extended = false; + + INIT_LIST_HEAD(&id->buffer_list); + for (id->agno = nagcount - 1; + id->agno >= oagcount; + id->agno--, delta -= id->agsize) { + + if (id->agno == nagcount - 1) + id->agsize = nb - (id->agno * + (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + id->agsize = mp->m_sb.sb_agblocks; + + error = xfs_ag_init_headers(mp, id); + if (error) { + xfs_buf_delwri_cancel(&id->buffer_list); + return error; + } + } + + error = xfs_buf_delwri_submit(&id->buffer_list); + if (error) + return error; + + xfs_trans_agblocks_delta(tp, id->nfree); + + if (delta) { + *lastag_extended = true; + error = xfs_ag_extend_space(mp, tp, id, delta); + } + return error; +} + +/* * growfs operations */ static int @@ -33,22 +91,25 @@ xfs_growfs_data_private( xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_div, nb_mod; - xfs_rfsblock_t delta; + int64_t delta; + bool lastag_extended; xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; nb = in->newblocks; - if (nb < mp->m_sb.sb_dblocks) - return -EINVAL; - if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); + if (error) return error; - error = xfs_buf_read_uncached(mp->m_ddev_targp, + + if (nb > mp->m_sb.sb_dblocks) { + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); - if (error) - return error; - xfs_buf_relse(bp); + if (error) + return error; + xfs_buf_relse(bp); + } nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); @@ -56,10 +117,16 @@ xfs_growfs_data_private( if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; - if (nb < mp->m_sb.sb_dblocks) - return -EINVAL; } delta = nb - mp->m_sb.sb_dblocks; + /* + * Reject filesystems with a single AG because they are not + * supported, and reject a shrink operation that would cause a + * filesystem to become unsupported. + */ + if (delta < 0 && nagcount < 2) + return -EINVAL; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ @@ -67,55 +134,34 @@ xfs_growfs_data_private( error = xfs_initialize_perag(mp, nagcount, &nagimax); if (error) return error; + } else if (nagcount < oagcount) { + /* TODO: shrinking the entire AGs hasn't yet completed */ + return -EINVAL; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Write new AG headers to disk. Non-transactional, but need to be - * written and completed prior to the growfs transaction being logged. - * To do this, we use a delayed write buffer list and wait for - * submission and IO completion of the list as a whole. This allows the - * IO subsystem to merge all the AG headers in a single AG into a single - * IO and hide most of the latency of the IO from us. - * - * This also means that if we get an error whilst building the buffer - * list to write, we can cancel the entire list without having written - * anything. - */ - INIT_LIST_HEAD(&id.buffer_list); - for (id.agno = nagcount - 1; - id.agno >= oagcount; - id.agno--, delta -= id.agsize) { - - if (id.agno == nagcount - 1) - id.agsize = nb - - (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); - else - id.agsize = mp->m_sb.sb_agblocks; + if (delta > 0) { + error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, + delta, &lastag_extended); + } else { + static struct ratelimit_state shrink_warning = \ + RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); + ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - error = xfs_ag_init_headers(mp, &id); - if (error) { - xfs_buf_delwri_cancel(&id.buffer_list); - goto out_trans_cancel; - } + if (__ratelimit(&shrink_warning)) + xfs_alert(mp, + "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); + + error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); } - error = xfs_buf_delwri_submit(&id.buffer_list); if (error) goto out_trans_cancel; - xfs_trans_agblocks_delta(tp, id.nfree); - - /* If there are new blocks in the old last AG, extend it. */ - if (delta) { - error = xfs_ag_extend_space(mp, tp, &id, delta); - if (error) - goto out_trans_cancel; - } - /* * Update changed superblock fields transactionally. These are not * seen by the rest of the world until the transaction commit applies @@ -123,11 +169,19 @@ xfs_growfs_data_private( */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); - if (nb > mp->m_sb.sb_dblocks) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, - nb - mp->m_sb.sb_dblocks); + if (delta) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta); if (id.nfree) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); + + /* + * Sync sb counters now to reflect the updated values. This is + * particularly important for shrink because the write verifier + * will fail if sb_fdblocks is ever larger than sb_dblocks. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + xfs_log_sb(tp); + xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -139,28 +193,29 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - /* - * If we expanded the last AG, free the per-AG reservation - * so we can reinitialize it with the new size. - */ - if (delta) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, id.agno); - error = xfs_ag_resv_free(pag); - xfs_perag_put(pag); - if (error) - return error; + if (delta > 0) { + /* + * If we expanded the last AG, free the per-AG reservation + * so we can reinitialize it with the new size. + */ + if (lastag_extended) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, id.agno); + error = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (error) + return error; + } + /* + * Reserve AG metadata blocks. ENOSPC here does not mean there + * was a growfs failure, just that there still isn't space for + * new user data after the grow has been run. + */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error == -ENOSPC) + error = 0; } - - /* - * Reserve AG metadata blocks. ENOSPC here does not mean there was a - * growfs failure, just that there still isn't space for new user data - * after the grow has been run. - */ - error = xfs_fs_reserve_ag_blocks(mp); - if (error == -ENOSPC) - error = 0; return error; out_trans_cancel: diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1d7720a0c068..3c81daca0e9a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -63,7 +63,9 @@ xfs_inode_alloc( memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; + ip->i_nblocks = 0; + ip->i_forkoff = 0; ip->i_sick = 0; ip->i_checked = 0; INIT_WORK(&ip->i_ioend_work, xfs_end_io); @@ -307,7 +309,7 @@ xfs_iget_check_free_state( return -EFSCORRUPTED; } - if (ip->i_d.di_nblocks != 0) { + if (ip->i_nblocks != 0) { xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx has blocks allocated!", ip->i_ino); @@ -497,7 +499,7 @@ xfs_iget_cache_miss( * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on - * the di_flushiter field being initialised from the current on-disk + * the i_flushiter field being initialised from the current on-disk * value and hence we must also read the inode off disk even when * initializing new inodes. */ @@ -505,14 +507,14 @@ xfs_iget_cache_miss( (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { VFS_I(ip)->i_generation = prandom_u32(); } else { - struct xfs_dinode *dip; struct xfs_buf *bp; - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp); if (error) goto out_destroy; - error = xfs_inode_from_disk(ip, dip); + error = xfs_inode_from_disk(ip, + xfs_buf_offset(bp, ip->i_imap.im_boffset)); if (!error) xfs_buf_set_ref(bp, XFS_INO_REF); xfs_trans_brelse(tp, bp); @@ -1202,7 +1204,7 @@ xfs_inode_match_id( return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - ip->i_d.di_projid != eofb->eof_prid) + ip->i_projid != eofb->eof_prid) return false; return true; @@ -1226,7 +1228,7 @@ xfs_inode_match_id_union( return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - ip->i_d.di_projid == eofb->eof_prid) + ip->i_projid == eofb->eof_prid) return true; return false; @@ -1294,13 +1296,6 @@ xfs_inode_free_eofblocks( if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; - if (!xfs_can_free_eofblocks(ip, false)) { - /* inode could be preallocated or append-only */ - trace_xfs_inode_free_eofblocks_invalid(ip); - xfs_inode_clear_eofblocks_tag(ip); - return 0; - } - /* * If the mapping is dirty the operation can block and wait for some * time. Unless we are waiting, skip it. @@ -1322,7 +1317,13 @@ xfs_inode_free_eofblocks( } *lockflags |= XFS_IOLOCK_EXCL; - return xfs_free_eofblocks(ip); + if (xfs_can_free_eofblocks(ip, false)) + return xfs_free_eofblocks(ip); + + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; } /* @@ -1335,7 +1336,7 @@ xfs_blockgc_queue( { rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + queue_delayed_work(pag->pag_mount->m_gc_workqueue, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 46a861d55e48..0369eb22c1bb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -60,8 +60,8 @@ xfs_get_extsz_hint( */ if (xfs_is_always_cow_inode(ip)) return 0; - if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) - return ip->i_d.di_extsize; + if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + return ip->i_extsize; if (XFS_IS_REALTIME_INODE(ip)) return ip->i_mount->m_sb.sb_rextsize; return 0; @@ -80,8 +80,8 @@ xfs_get_cowextsz_hint( xfs_extlen_t a, b; a = 0; - if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) - a = ip->i_d.di_cowextsize; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_cowextsize; b = xfs_get_extsz_hint(ip); a = max(a, b); @@ -111,8 +111,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && - (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) + if (xfs_need_iread_extents(&ip->i_df)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; @@ -124,9 +123,7 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_afp && - ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && - (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + if (ip->i_afp && xfs_need_iread_extents(ip->i_afp)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; @@ -598,67 +595,55 @@ xfs_lock_two_inodes( } } -STATIC uint -_xfs_dic2xflags( - uint16_t di_flags, - uint64_t di_flags2, - bool has_attr) +uint +xfs_ip2xflags( + struct xfs_inode *ip) { uint flags = 0; - if (di_flags & XFS_DIFLAG_ANY) { - if (di_flags & XFS_DIFLAG_REALTIME) + if (ip->i_diflags & XFS_DIFLAG_ANY) { + if (ip->i_diflags & XFS_DIFLAG_REALTIME) flags |= FS_XFLAG_REALTIME; - if (di_flags & XFS_DIFLAG_PREALLOC) + if (ip->i_diflags & XFS_DIFLAG_PREALLOC) flags |= FS_XFLAG_PREALLOC; - if (di_flags & XFS_DIFLAG_IMMUTABLE) + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) flags |= FS_XFLAG_IMMUTABLE; - if (di_flags & XFS_DIFLAG_APPEND) + if (ip->i_diflags & XFS_DIFLAG_APPEND) flags |= FS_XFLAG_APPEND; - if (di_flags & XFS_DIFLAG_SYNC) + if (ip->i_diflags & XFS_DIFLAG_SYNC) flags |= FS_XFLAG_SYNC; - if (di_flags & XFS_DIFLAG_NOATIME) + if (ip->i_diflags & XFS_DIFLAG_NOATIME) flags |= FS_XFLAG_NOATIME; - if (di_flags & XFS_DIFLAG_NODUMP) + if (ip->i_diflags & XFS_DIFLAG_NODUMP) flags |= FS_XFLAG_NODUMP; - if (di_flags & XFS_DIFLAG_RTINHERIT) + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) flags |= FS_XFLAG_RTINHERIT; - if (di_flags & XFS_DIFLAG_PROJINHERIT) + if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) flags |= FS_XFLAG_PROJINHERIT; - if (di_flags & XFS_DIFLAG_NOSYMLINKS) + if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) flags |= FS_XFLAG_NOSYMLINKS; - if (di_flags & XFS_DIFLAG_EXTSIZE) + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) flags |= FS_XFLAG_EXTSIZE; - if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) flags |= FS_XFLAG_EXTSZINHERIT; - if (di_flags & XFS_DIFLAG_NODEFRAG) + if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) flags |= FS_XFLAG_NODEFRAG; - if (di_flags & XFS_DIFLAG_FILESTREAM) + if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) flags |= FS_XFLAG_FILESTREAM; } - if (di_flags2 & XFS_DIFLAG2_ANY) { - if (di_flags2 & XFS_DIFLAG2_DAX) + if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; - if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; } - if (has_attr) + if (XFS_IFORK_Q(ip)) flags |= FS_XFLAG_HASATTR; - return flags; } -uint -xfs_ip2xflags( - struct xfs_inode *ip) -{ - struct xfs_icdinode *dic = &ip->i_d; - - return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -708,42 +693,42 @@ xfs_inode_inherit_flags( umode_t mode = VFS_I(ip)->i_mode; if (S_ISDIR(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_d.di_extsize = pip->i_d.di_extsize; + ip->i_extsize = pip->i_extsize; } - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { - if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && + if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_d.di_extsize = pip->i_d.di_extsize; + ip->i_extsize = pip->i_extsize; } } - if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) di_flags |= XFS_DIFLAG_NOATIME; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && xfs_inherit_nodump) di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + if ((pip->i_diflags & XFS_DIFLAG_SYNC) && xfs_inherit_sync) di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - ip->i_d.di_flags |= di_flags; + ip->i_diflags |= di_flags; } /* Propagate di_flags2 from a parent inode to a child inode. */ @@ -752,12 +737,12 @@ xfs_inode_inherit_flags2( struct xfs_inode *ip, const struct xfs_inode *pip) { - if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = pip->i_cowextsize; } - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; + if (pip->i_diflags2 & XFS_DIFLAG2_DAX) + ip->i_diflags2 |= XFS_DIFLAG2_DAX; } /* @@ -774,6 +759,7 @@ xfs_init_new_inode( xfs_nlink_t nlink, dev_t rdev, prid_t prid, + bool init_xattrs, struct xfs_inode **ipp) { struct inode *dir = pip ? VFS_I(pip) : NULL; @@ -808,11 +794,11 @@ xfs_init_new_inode( inode = VFS_I(ip); set_nlink(inode, nlink); inode->i_rdev = rdev; - ip->i_d.di_projid = prid; + ip->i_projid = prid; if (dir && !(dir->i_mode & S_ISGID) && (mp->m_flags & XFS_MOUNT_GRPID)) { - inode->i_uid = fsuid_into_mnt(mnt_userns); + inode_fsuid_set(inode, mnt_userns); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { @@ -829,25 +815,22 @@ xfs_init_new_inode( !in_group_p(i_gid_into_mnt(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; - ip->i_d.di_size = 0; + ip->i_disk_size = 0; ip->i_df.if_nextents = 0; - ASSERT(ip->i_d.di_nblocks == 0); + ASSERT(ip->i_nblocks == 0); tv = current_time(inode); inode->i_mtime = tv; inode->i_atime = tv; inode->i_ctime = tv; - ip->i_d.di_extsize = 0; - ip->i_d.di_dmevmask = 0; - ip->i_d.di_dmstate = 0; - ip->i_d.di_flags = 0; + ip->i_extsize = 0; + ip->i_diflags = 0; if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; - ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime = tv; + ip->i_cowextsize = 0; + ip->i_crtime = tv; } flags = XFS_ILOG_CORE; @@ -857,19 +840,17 @@ xfs_init_new_inode( case S_IFBLK: case S_IFSOCK: ip->i_df.if_format = XFS_DINODE_FMT_DEV; - ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; case S_IFREG: case S_IFDIR: - if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) + if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) xfs_inode_inherit_flags(ip, pip); - if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) + if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) xfs_inode_inherit_flags2(ip, pip); /* FALLTHROUGH */ case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; - ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; break; @@ -878,6 +859,20 @@ xfs_init_new_inode( } /* + * If we need to create attributes immediately after allocating the + * inode, initialise an empty attribute fork right now. We use the + * default fork offset for attributes here as we don't know exactly what + * size or how many attributes we might be adding. We can do this + * safely here because we know the data fork is completely empty and + * this saves us from needing to run a separate transaction to set the + * fork offset in the immediate future. + */ + if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) { + ip->i_forkoff = xfs_default_attroffset(ip) >> 3; + ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + } + + /* * Log the new values stuffed into the inode. */ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -910,6 +905,7 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, + bool init_xattrs, struct xfs_inode **ipp) { struct xfs_buf *agibp; @@ -937,7 +933,7 @@ xfs_dir_ialloc( ASSERT(ino != NULLFSINO); return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, - prid, ipp); + prid, init_xattrs, ipp); } /* @@ -982,6 +978,7 @@ xfs_create( struct xfs_name *name, umode_t mode, dev_t rdev, + bool init_xattrs, xfs_inode_t **ipp) { int is_dir = S_ISDIR(mode); @@ -1007,9 +1004,10 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), + mapped_fsgid(mnt_userns), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); if (error) return error; @@ -1052,7 +1050,7 @@ xfs_create( * pointing to itself. */ error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, &ip); + prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1157,9 +1155,10 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), + mapped_fsgid(mnt_userns), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); if (error) return error; @@ -1171,7 +1170,8 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, + false, &ip); if (error) goto out_trans_cancel; @@ -1270,8 +1270,8 @@ xfs_link( * creation in our tree when the project IDs are the same; else * the tree quota mechanism could be circumvented. */ - if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_d.di_projid != sip->i_d.di_projid)) { + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { error = -EXDEV; goto error_return; } @@ -1329,7 +1329,7 @@ xfs_itruncate_clear_reflink_flags( dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); if (dfork->if_bytes == 0 && cfork->if_bytes == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; if (cfork->if_bytes == 0) xfs_inode_clear_cowblocks_tag(ip); } @@ -1440,7 +1440,7 @@ xfs_release( xfs_inode_t *ip) { xfs_mount_t *mp = ip->i_mount; - int error; + int error = 0; if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) return 0; @@ -1476,8 +1476,16 @@ xfs_release( if (VFS_I(ip)->i_nlink == 0) return 0; - if (xfs_can_free_eofblocks(ip, false)) { + /* + * If we can't get the iolock just skip truncating the blocks past EOF + * because we could deadlock with the mmap_lock otherwise. We'll get + * another chance to drop them once the last reference to the inode is + * dropped, so we'll never leak blocks permanently. + */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) + return 0; + if (xfs_can_free_eofblocks(ip, false)) { /* * Check if the inode is being opened, written and closed * frequently and we have delayed allocation blocks outstanding @@ -1493,26 +1501,20 @@ xfs_release( * place. */ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - /* - * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_lock - * otherwise. We'll get another chance to drop them once the - * last reference to the inode is dropped, so we'll never leak - * blocks permanently. - */ - if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - error = xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - if (error) - return error; - } + goto out_unlock; + + error = xfs_free_eofblocks(ip); + if (error) + goto out_unlock; /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); } - return 0; + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } /* @@ -1541,7 +1543,7 @@ xfs_inactive_truncate( * of a system crash before the truncate completes. See the related * comment in xfs_vn_setattr_size() for details. */ - ip->i_d.di_size = 0; + ip->i_disk_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); @@ -1695,6 +1697,10 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return; + /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return; + /* Try to clean out the cow blocks if there are any. */ if (xfs_inode_has_cow_data(ip)) xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); @@ -1716,7 +1722,7 @@ xfs_inactive( } if (S_ISREG(VFS_I(ip)->i_mode) && - (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; @@ -1743,7 +1749,7 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_forkoff == 0); + ASSERT(ip->i_forkoff == 0); /* * Free the inode. @@ -2051,9 +2057,10 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); if (error) return error; + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); @@ -2178,13 +2185,14 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); + error = xfs_imap_to_bp(mp, tp, imap, bpp); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } + *dipp = xfs_buf_offset(*bpp, imap->im_boffset); return 0; } @@ -2562,8 +2570,8 @@ xfs_ifree( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_df.if_nextents == 0); - ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); - ASSERT(ip->i_d.di_nblocks == 0); + ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); + ASSERT(ip->i_nblocks == 0); /* * Pull the on-disk inode from the AGI unlinked list. @@ -2588,11 +2596,12 @@ xfs_ifree( } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ - ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; - ip->i_d.di_dmevmask = 0; - ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_diflags = 0; + ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2; + ip->i_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) + xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); /* Don't attempt to replay owner changes for a deleted inode */ spin_lock(&iip->ili_lock); @@ -2868,7 +2877,7 @@ xfs_finish_rename( /* * xfs_cross_rename() * - * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall + * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall */ STATIC int xfs_cross_rename( @@ -3101,8 +3110,8 @@ xfs_rename( * into our tree when the project IDs are the same; else the * tree quota mechanism would be circumvented. */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { + if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + target_dp->i_projid != src_ip->i_projid)) { error = -EXDEV; goto out_trans_cancel; } @@ -3412,34 +3421,33 @@ xfs_iflush( } } if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > - ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), - ip->i_d.di_nblocks, ip); + ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, - __func__, ip->i_ino, ip->i_d.di_forkoff, ip); + __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } /* - * Inode item log recovery for v2 inodes are dependent on the - * di_flushiter count for correct sequencing. We bump the flush - * iteration count so we can detect flushes which postdate a log record - * during recovery. This is redundant as we now log every change and - * hence this can't happen but we need to still do it to ensure - * backwards compatibility with old kernels that predate logging all - * inode changes. + * Inode item log recovery for v2 inodes are dependent on the flushiter + * count for correct sequencing. We bump the flush iteration count so + * we can detect flushes which postdate a log record during recovery. + * This is redundant as we now log every change and hence this can't + * happen but we need to still do it to ensure backwards compatibility + * with old kernels that predate logging all inode changes. */ if (!xfs_sb_version_has_v3inode(&mp->m_sb)) - ip->i_d.di_flushiter++; + ip->i_flushiter++; /* * If there are inline format data / attr forks attached to this inode, @@ -3460,8 +3468,10 @@ xfs_iflush( xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); /* Wrap, we never let the log put out DI_MAX_FLUSH */ - if (ip->i_d.di_flushiter == DI_MAX_FLUSH) - ip->i_d.di_flushiter = 0; + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (ip->i_flushiter == DI_MAX_FLUSH) + ip->i_flushiter = 0; + } xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 88ee4c3930ae..ca826cfba91c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -54,8 +54,19 @@ typedef struct xfs_inode { /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ uint64_t i_delayed_blks; /* count of delay alloc blks */ - - struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_fsize_t i_disk_size; /* number of bytes in file */ + xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ + prid_t i_projid; /* owner's project id */ + xfs_extlen_t i_extsize; /* basic/minimum extent size */ + /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + union { + xfs_extlen_t i_cowextsize; /* basic cow extent size */ + uint16_t i_flushiter; /* incremented on flush */ + }; + uint8_t i_forkoff; /* attr fork offset >> 3 */ + uint16_t i_diflags; /* XFS_DIFLAG_... */ + uint64_t i_diflags2; /* XFS_DIFLAG2_... */ + struct timespec64 i_crtime; /* time created */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -87,7 +98,7 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) { if (S_ISREG(VFS_I(ip)->i_mode)) return i_size_read(VFS_I(ip)); - return ip->i_d.di_size; + return ip->i_disk_size; } /* @@ -101,7 +112,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) if (new_size > i_size || new_size < 0) new_size = i_size; - return new_size > ip->i_d.di_size ? new_size : 0; + return new_size > ip->i_disk_size ? new_size : 0; } /* @@ -174,15 +185,23 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) static inline prid_t xfs_get_initial_prid(struct xfs_inode *dp) { - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - return dp->i_d.di_projid; + if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) + return dp->i_projid; return XFS_PROJID_DEFAULT; } static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) { - return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; +} + +static inline bool xfs_is_metadata_inode(struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + return ip == mp->m_rbmip || ip == mp->m_rsumip || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } /* @@ -196,7 +215,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) { - return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; + return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } /* @@ -214,6 +233,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define __XFS_INEW_BIT 3 /* inode has just been allocated */ #define XFS_INEW (1 << __XFS_INEW_BIT) +#define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ @@ -371,7 +391,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *name, - umode_t mode, dev_t rdev, struct xfs_inode **ipp); + umode_t mode, dev_t rdev, bool need_xattr, + struct xfs_inode **ipp); int xfs_create_tmpfile(struct user_namespace *mnt_userns, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); @@ -413,7 +434,8 @@ xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct user_namespace *mnt_userns, struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode, xfs_nlink_t nlink, dev_t dev, - prid_t prid, struct xfs_inode **ipp); + prid_t prid, bool need_xattr, + struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 17e20a6d8b4e..c1b32680f71c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -196,7 +196,7 @@ xfs_inode_item_format_data_fork( */ data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); + ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, ip->i_df.if_u1.if_data, data_bytes); ilf->ilf_dsize = (unsigned)data_bytes; @@ -317,21 +317,47 @@ xfs_inode_to_log_dinode_ts( return its; } +/* + * The legacy DMAPI fields are only present in the on-disk and in-log inodes, + * but not in the in-memory one. But we are guaranteed to have an inode buffer + * in memory when logging an inode, so we can just copy it from the on-disk + * inode to the in-log inode here so that recovery of file system with these + * fields set to non-zero values doesn't lose them. For all other cases we zero + * the fields. + */ +static void +xfs_copy_dm_fields_to_log_dinode( + struct xfs_inode *ip, + struct xfs_log_dinode *to) +{ + struct xfs_dinode *dip; + + dip = xfs_buf_offset(ip->i_itemp->ili_item.li_buf, + ip->i_imap.im_boffset); + + if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) { + to->di_dmevmask = be32_to_cpu(dip->di_dmevmask); + to->di_dmstate = be16_to_cpu(dip->di_dmstate); + } else { + to->di_dmevmask = 0; + to->di_dmstate = 0; + } +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, struct xfs_log_dinode *to, xfs_lsn_t lsn) { - struct xfs_icdinode *from = &ip->i_d; struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = i_uid_read(inode); to->di_gid = i_gid_read(inode); - to->di_projid_lo = from->di_projid & 0xffff; - to->di_projid_hi = from->di_projid >> 16; + to->di_projid_lo = ip->i_projid & 0xffff; + to->di_projid_hi = ip->i_projid >> 16; memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); @@ -342,16 +368,16 @@ xfs_inode_to_log_dinode( to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; - to->di_size = from->di_size; - to->di_nblocks = from->di_nblocks; - to->di_extsize = from->di_extsize; + to->di_size = ip->i_disk_size; + to->di_nblocks = ip->i_nblocks; + to->di_extsize = ip->i_extsize; to->di_nextents = xfs_ifork_nextents(&ip->i_df); to->di_anextents = xfs_ifork_nextents(ip->i_afp); - to->di_forkoff = from->di_forkoff; + to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); - to->di_dmevmask = from->di_dmevmask; - to->di_dmstate = from->di_dmstate; - to->di_flags = from->di_flags; + to->di_flags = ip->i_diflags; + + xfs_copy_dm_fields_to_log_dinode(ip, to); /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; @@ -359,9 +385,9 @@ xfs_inode_to_log_dinode( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime); - to->di_flags2 = from->di_flags2; - to->di_cowextsize = from->di_cowextsize; + to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); + to->di_flags2 = ip->i_diflags2; + to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -369,7 +395,7 @@ xfs_inode_to_log_dinode( to->di_flushiter = 0; } else { to->di_version = 2; - to->di_flushiter = from->di_flushiter; + to->di_flushiter = ip->i_flushiter; } } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 99dfe89a8d08..3925bfcb2365 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include <linux/mount.h> #include <linux/namei.h> +#include <linux/fileattr.h> /* * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to @@ -1053,97 +1054,55 @@ xfs_ioc_ag_geometry( * Linux extended inode flags interface. */ -STATIC unsigned int -xfs_merge_ioc_xflags( - unsigned int flags, - unsigned int start) -{ - unsigned int xflags = start; - - if (flags & FS_IMMUTABLE_FL) - xflags |= FS_XFLAG_IMMUTABLE; - else - xflags &= ~FS_XFLAG_IMMUTABLE; - if (flags & FS_APPEND_FL) - xflags |= FS_XFLAG_APPEND; - else - xflags &= ~FS_XFLAG_APPEND; - if (flags & FS_SYNC_FL) - xflags |= FS_XFLAG_SYNC; - else - xflags &= ~FS_XFLAG_SYNC; - if (flags & FS_NOATIME_FL) - xflags |= FS_XFLAG_NOATIME; - else - xflags &= ~FS_XFLAG_NOATIME; - if (flags & FS_NODUMP_FL) - xflags |= FS_XFLAG_NODUMP; - else - xflags &= ~FS_XFLAG_NODUMP; - if (flags & FS_DAX_FL) - xflags |= FS_XFLAG_DAX; - else - xflags &= ~FS_XFLAG_DAX; - - return xflags; -} - -STATIC unsigned int -xfs_di2lxflags( - uint16_t di_flags, - uint64_t di_flags2) -{ - unsigned int flags = 0; - - if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - if (di_flags & XFS_DIFLAG_APPEND) - flags |= FS_APPEND_FL; - if (di_flags & XFS_DIFLAG_SYNC) - flags |= FS_SYNC_FL; - if (di_flags & XFS_DIFLAG_NOATIME) - flags |= FS_NOATIME_FL; - if (di_flags & XFS_DIFLAG_NODUMP) - flags |= FS_NODUMP_FL; - if (di_flags2 & XFS_DIFLAG2_DAX) { - flags |= FS_DAX_FL; - } - return flags; -} - static void xfs_fill_fsxattr( struct xfs_inode *ip, - bool attr, - struct fsxattr *fa) + int whichfork, + struct fileattr *fa) { - struct xfs_ifork *ifp = attr ? ip->i_afp : &ip->i_df; - - simple_fill_fsxattr(fa, xfs_ip2xflags(ip)); - fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; - fa->fsx_cowextsize = ip->i_d.di_cowextsize << - ip->i_mount->m_sb.sb_blocklog; - fa->fsx_projid = ip->i_d.di_projid; - if (ifp && (ifp->if_flags & XFS_IFEXTENTS)) + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + + fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); + + fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + fa->fsx_projid = ip->i_projid; + if (ifp && !xfs_need_iread_extents(ifp)) fa->fsx_nextents = xfs_iext_count(ifp); else fa->fsx_nextents = xfs_ifork_nextents(ifp); } STATIC int -xfs_ioc_fsgetxattr( +xfs_ioc_fsgetxattra( xfs_inode_t *ip, - int attr, void __user *arg) { - struct fsxattr fa; + struct fileattr fa; xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_fill_fsxattr(ip, attr, &fa); + xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return copy_fsxattr_to_user(&fa, arg); +} + +int +xfs_fileattr_get( + struct dentry *dentry, + struct fileattr *fa) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + + if (d_is_special(dentry)) + return -ENOTTY; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; return 0; } @@ -1154,7 +1113,7 @@ xfs_flags2diflags( { /* can't set PREALLOC this way, just preserve it */ uint16_t di_flags = - (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + (ip->i_diflags & XFS_DIFLAG_PREALLOC); if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; @@ -1195,8 +1154,8 @@ xfs_flags2diflags2( unsigned int xflags) { uint64_t di_flags2 = - (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME)); + (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1210,10 +1169,10 @@ static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, - struct fsxattr *fa) + struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - uint64_t di_flags2; + uint64_t i_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_df.if_nextents || ip->i_delayed_blks) && @@ -1223,25 +1182,25 @@ xfs_ioctl_setattr_xflags( /* If realtime flag is set then must have realtime device */ if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || - (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) + (ip->i_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; } /* Clear reflink if we are actually able to set the rt flag. */ if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; /* Don't allow us to set DAX mode for a reflinked file for now. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) return -EINVAL; /* diflags2 only valid for v3 inodes. */ - di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) + i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (i_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; - ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); - ip->i_d.di_flags2 = di_flags2; + ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); @@ -1253,7 +1212,7 @@ xfs_ioctl_setattr_xflags( static void xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, - struct fsxattr *fa) + struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -1266,9 +1225,9 @@ xfs_ioctl_setattr_prepare_dax( return; if (((fa->fsx_xflags & FS_XFLAG_DAX) && - !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) || + !(ip->i_diflags2 & XFS_DIFLAG2_DAX)) || (!(fa->fsx_xflags & FS_XFLAG_DAX) && - (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))) + (ip->i_diflags2 & XFS_DIFLAG2_DAX))) d_mark_dontcache(inode); } @@ -1280,10 +1239,9 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct file *file, + struct xfs_inode *ip, struct xfs_dquot *pdqp) { - struct xfs_inode *ip = XFS_I(file_inode(file)); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; @@ -1299,24 +1257,11 @@ xfs_ioctl_setattr_get_trans( if (error) goto out_error; - /* - * CAP_FOWNER overrides the following restrictions: - * - * The user ID of the calling process must be equal to the file owner - * ID, except in cases where the CAP_FSETID capability is applicable. - */ - if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) { - error = -EPERM; - goto out_cancel; - } - if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); return tp; -out_cancel: - xfs_trans_cancel(tp); out_error: return ERR_PTR(error); } @@ -1340,14 +1285,17 @@ out_error: static int xfs_ioctl_setattr_check_extsize( struct xfs_inode *ip, - struct fsxattr *fa) + struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t extsize_fsb; + if (!fa->fsx_valid) + return 0; + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) + ((ip->i_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; if (fa->fsx_extsize == 0) @@ -1390,12 +1338,15 @@ xfs_ioctl_setattr_check_extsize( static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, - struct fsxattr *fa) + struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t cowextsize_fsb; + if (!fa->fsx_valid) + return 0; + if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; @@ -1422,8 +1373,11 @@ xfs_ioctl_setattr_check_cowextsize( static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, - struct fsxattr *fa) + struct fileattr *fa) { + if (!fa->fsx_valid) + return 0; + /* Disallow 32bit project ids if projid32bit feature is not enabled. */ if (fa->fsx_projid > (uint16_t)-1 && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) @@ -1431,14 +1385,13 @@ xfs_ioctl_setattr_check_projid( return 0; } -STATIC int -xfs_ioctl_setattr( - struct file *file, - struct fsxattr *fa) +int +xfs_fileattr_set( + struct user_namespace *mnt_userns, + struct dentry *dentry, + struct fileattr *fa) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); - struct xfs_inode *ip = XFS_I(file_inode(file)); - struct fsxattr old_fa; + struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_dquot *pdqp = NULL; @@ -1447,6 +1400,16 @@ xfs_ioctl_setattr( trace_xfs_ioctl_setattr(ip); + if (d_is_special(dentry)) + return -ENOTTY; + + if (!fa->fsx_valid) { + if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | + FS_NOATIME_FL | FS_NODUMP_FL | + FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL)) + return -EOPNOTSUPP; + } + error = xfs_ioctl_setattr_check_projid(ip, fa); if (error) return error; @@ -1459,7 +1422,7 @@ xfs_ioctl_setattr( * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ - if (XFS_IS_QUOTA_ON(mp)) { + if (fa->fsx_valid && XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); @@ -1469,17 +1432,12 @@ xfs_ioctl_setattr( xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(file, pdqp); + tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto error_free_dquots; } - xfs_fill_fsxattr(ip, false, &old_fa); - error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); - if (error) - goto error_trans_cancel; - error = xfs_ioctl_setattr_check_extsize(ip, fa); if (error) goto error_trans_cancel; @@ -1492,6 +1450,8 @@ xfs_ioctl_setattr( if (error) goto error_trans_cancel; + if (!fa->fsx_valid) + goto skip_xattr; /* * Change file ownership. Must be the owner or privileged. CAP_FSETID * overrides the following restrictions: @@ -1505,12 +1465,12 @@ xfs_ioctl_setattr( VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ - if (ip->i_d.di_projid != fa->fsx_projid) { + if (ip->i_projid != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ip->i_d.di_projid = fa->fsx_projid; + ip->i_projid = fa->fsx_projid; } /* @@ -1518,17 +1478,19 @@ xfs_ioctl_setattr( * extent size hint should be set on the inode. If no extent size flags * are set on the inode then unconditionally clear the extent size hint. */ - if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) - ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; - else - ip->i_d.di_extsize = 0; - if (xfs_sb_version_has_v3inode(&mp->m_sb) && - (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - ip->i_d.di_cowextsize = fa->fsx_cowextsize >> - mp->m_sb.sb_blocklog; + if (ip->i_diflags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + ip->i_extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize); else - ip->i_d.di_cowextsize = 0; + ip->i_extsize = 0; + + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + else + ip->i_cowextsize = 0; + } +skip_xattr: error = xfs_trans_commit(tp); /* @@ -1546,91 +1508,6 @@ error_free_dquots: return error; } -STATIC int -xfs_ioc_fssetxattr( - struct file *filp, - void __user *arg) -{ - struct fsxattr fa; - int error; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - error = xfs_ioctl_setattr(filp, &fa); - mnt_drop_write_file(filp); - return error; -} - -STATIC int -xfs_ioc_getxflags( - xfs_inode_t *ip, - void __user *arg) -{ - unsigned int flags; - - flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2); - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - -STATIC int -xfs_ioc_setxflags( - struct xfs_inode *ip, - struct file *filp, - void __user *arg) -{ - struct xfs_trans *tp; - struct fsxattr fa; - struct fsxattr old_fa; - unsigned int flags; - int error; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DAX_FL)) - return -EOPNOTSUPP; - - fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ioctl_setattr_prepare_dax(ip, &fa); - - tp = xfs_ioctl_setattr_get_trans(filp, NULL); - if (IS_ERR(tp)) { - error = PTR_ERR(tp); - goto out_drop_write; - } - - xfs_fill_fsxattr(ip, false, &old_fa); - error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); - if (error) { - xfs_trans_cancel(tp); - goto out_drop_write; - } - - error = xfs_ioctl_setattr_xflags(tp, ip, &fa); - if (error) { - xfs_trans_cancel(tp); - goto out_drop_write; - } - - error = xfs_trans_commit(tp); -out_drop_write: - mnt_drop_write_file(filp); - return error; -} - static bool xfs_getbmap_format( struct kgetbmap *p, @@ -1669,8 +1546,6 @@ xfs_ioc_getbmap( bmx.bmv_iflags = BMV_IF_ATTRFORK; /*FALLTHRU*/ case XFS_IOC_GETBMAP: - if (file->f_mode & FMODE_NOCMTIME) - bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; /* struct getbmap is a strict subset of struct getbmapx. */ recsize = sizeof(struct getbmap); break; @@ -1846,7 +1721,7 @@ out_free: STATIC int xfs_ioc_scrub_metadata( - struct xfs_inode *ip, + struct file *file, void __user *arg) { struct xfs_scrub_metadata scrub; @@ -1858,7 +1733,7 @@ xfs_ioc_scrub_metadata( if (copy_from_user(&scrub, arg, sizeof(scrub))) return -EFAULT; - error = xfs_scrub_metadata(ip, &scrub); + error = xfs_scrub_metadata(file, &scrub); if (error) return error; @@ -2137,16 +2012,8 @@ xfs_file_ioctl( case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); - case XFS_IOC_FSGETXATTR: - return xfs_ioc_fsgetxattr(ip, 0, arg); case XFS_IOC_FSGETXATTRA: - return xfs_ioc_fsgetxattr(ip, 1, arg); - case XFS_IOC_FSSETXATTR: - return xfs_ioc_fssetxattr(filp, arg); - case XFS_IOC_GETXFLAGS: - return xfs_ioc_getxflags(ip, arg); - case XFS_IOC_SETXFLAGS: - return xfs_ioc_setxflags(ip, filp, arg); + return xfs_ioc_fsgetxattra(ip, arg); case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: @@ -2157,7 +2024,7 @@ xfs_file_ioctl( return xfs_ioc_getfsmap(ip, arg); case XFS_IOC_SCRUB_METADATA: - return xfs_ioc_scrub_metadata(ip, arg); + return xfs_ioc_scrub_metadata(filp, arg); case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index bab6a5a92407..28453a6d4461 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -47,6 +47,17 @@ xfs_handle_to_dentry( void __user *uhandle, u32 hlen); +extern int +xfs_fileattr_get( + struct dentry *dentry, + struct fileattr *fa); + +extern int +xfs_fileattr_set( + struct user_namespace *mnt_userns, + struct dentry *dentry, + struct fileattr *fa); + extern long xfs_file_ioctl( struct file *filp, diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 33c09ec8e6c0..e6506773ba55 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -484,8 +484,6 @@ xfs_file_compat_ioctl( } #endif /* long changes size, but xfs only copiese out 32 bits */ - case XFS_IOC_GETXFLAGS_32: - case XFS_IOC_SETXFLAGS_32: case XFS_IOC_GETVERSION_32: cmd = _NATIVE_IOC(cmd, long); return xfs_file_ioctl(filp, cmd, p); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 053de7d894cd..9929482bf358 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -17,8 +17,6 @@ */ /* stock kernel-level ioctls we support */ -#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS -#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS #define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e17ab7f42928..d154f42e2dc6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb( struct xfs_bmbt_irec irec; struct xfs_iext_cursor icur; - ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(!xfs_need_iread_extents(ifp)); /* * Always round up the allocation request to the extent hint boundary. @@ -198,6 +198,7 @@ xfs_iomap_write_direct( bool force = false; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; + int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT; ASSERT(count_fsb > 0); @@ -232,6 +233,7 @@ xfs_iomap_write_direct( bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; + nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } @@ -241,8 +243,7 @@ xfs_iomap_write_direct( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, - XFS_IEXT_ADD_NOSPLIT_CNT); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); if (error) goto out_trans_cancel; @@ -572,7 +573,7 @@ xfs_iomap_write_unwritten( i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { - ip->i_d.di_size = i_size; + ip->i_disk_size = i_size; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -666,7 +667,7 @@ xfs_ilock_for_iomap( * is an opencoded xfs_ilock_data_map_shared() call but with * non-blocking behaviour. */ - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + if (xfs_need_iread_extents(&ip->i_df)) { if (flags & IOMAP_NOWAIT) return -EAGAIN; mode = XFS_ILOCK_EXCL; @@ -893,11 +894,9 @@ xfs_buffered_write_iomap_begin( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; /* * Search the data fork first to look up our source mapping. We @@ -1208,11 +1207,9 @@ xfs_seek_iomap_begin( return -EIO; lockmode = xfs_ilock_data_map_shared(ip); - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 66ebccb5a6ff..dfe24b7f26e5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -21,6 +21,7 @@ #include "xfs_dir2.h" #include "xfs_iomap.h" #include "xfs_error.h" +#include "xfs_ioctl.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -126,6 +127,37 @@ xfs_cleanup_inode( xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); } +/* + * Check to see if we are likely to need an extended attribute to be added to + * the inode we are about to allocate. This allows the attribute fork to be + * created during the inode allocation, reducing the number of transactions we + * need to do in this fast path. + * + * The security checks are optimistic, but not guaranteed. The two LSMs that + * require xattrs to be added here (selinux and smack) are also the only two + * LSMs that add a sb->s_security structure to the superblock. Hence if security + * is enabled and sb->s_security is set, we have a pretty good idea that we are + * going to be asked to add a security xattr immediately after allocating the + * xfs inode and instantiating the VFS inode. + */ +static inline bool +xfs_create_need_xattr( + struct inode *dir, + struct posix_acl *default_acl, + struct posix_acl *acl) +{ + if (acl) + return true; + if (default_acl) + return true; +#if IS_ENABLED(CONFIG_SECURITY) + if (dir->i_sb->s_security) + return true; +#endif + return false; +} + + STATIC int xfs_generic_create( struct user_namespace *mnt_userns, @@ -163,7 +195,8 @@ xfs_generic_create( if (!tmpfile) { error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, - &ip); + xfs_create_need_xattr(dir, default_acl, acl), + &ip); } else { error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); } @@ -487,7 +520,7 @@ xfs_vn_get_link_inline( struct xfs_inode *ip = XFS_I(inode); char *link; - ASSERT(ip->i_df.if_flags & XFS_IFINLINE); + ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL); /* * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if @@ -562,13 +595,12 @@ xfs_vn_getattr( stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->blocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; - stat->btime = ip->i_d.di_crtime; + stat->btime = ip->i_crtime; } } @@ -576,11 +608,11 @@ xfs_vn_getattr( * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. */ - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (ip->i_diflags & XFS_DIFLAG_APPEND) stat->attributes |= STATX_ATTR_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) + if (ip->i_diflags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | @@ -705,7 +737,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -917,8 +949,8 @@ xfs_setattr_size( * operation. * * And we update in-core i_size and truncate page cache beyond newsize - * before writeback the [di_size, newsize] range, so we're guaranteed - * not to write stale data past the new EOF on truncate down. + * before writeback the [i_disk_size, newsize] range, so we're + * guaranteed not to write stale data past the new EOF on truncate down. */ truncate_setsize(inode, newsize); @@ -931,9 +963,9 @@ xfs_setattr_size( * otherwise those blocks may not be zeroed after a crash. */ if (did_zeroing || - (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { + (newsize > ip->i_disk_size && oldsize != ip->i_disk_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize - 1); + ip->i_disk_size, newsize - 1); if (error) return error; } @@ -975,7 +1007,7 @@ xfs_setattr_size( * permanent before actually freeing any blocks it doesn't matter if * they get written to. */ - ip->i_d.di_size = newsize; + ip->i_disk_size = newsize; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (newsize <= oldsize) { @@ -1152,6 +1184,8 @@ static const struct inode_operations xfs_inode_operations = { .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -1177,6 +1211,8 @@ static const struct inode_operations xfs_dir_inode_operations = { .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, .tmpfile = xfs_vn_tmpfile, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1202,6 +1238,8 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, .tmpfile = xfs_vn_tmpfile, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; static const struct inode_operations xfs_symlink_inode_operations = { @@ -1255,7 +1293,7 @@ xfs_inode_should_enable_dax( return false; if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) return true; - if (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) return true; return false; } @@ -1312,7 +1350,7 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - i_size_write(inode, ip->i_d.di_size); + i_size_write(inode, ip->i_disk_size); xfs_diflags_to_iflags(ip, true); if (S_ISDIR(inode->i_mode)) { @@ -1370,7 +1408,7 @@ xfs_setup_iops( inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: - if (ip->i_df.if_flags & XFS_IFINLINE) + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) inode->i_op = &xfs_inline_symlink_inode_operations; else inode->i_op = &xfs_symlink_inode_operations; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ca310a125d1e..f331975a16de 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -60,7 +60,6 @@ xfs_bulkstat_one_int( struct xfs_bstat_chunk *bc) { struct user_namespace *sb_userns = mp->m_super->s_user_ns; - struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; @@ -81,16 +80,14 @@ xfs_bulkstat_one_int( ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); - dic = &ip->i_d; - /* xfs_iget returns the following without needing * further change. */ - buf->bs_projectid = ip->i_d.di_projid; + buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); - buf->bs_size = dic->di_size; + buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; buf->bs_atime = inode->i_atime.tv_sec; @@ -99,13 +96,11 @@ xfs_bulkstat_one_int( buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; buf->bs_ctime = inode->i_ctime.tv_sec; buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; - buf->bs_btime = dic->di_crtime.tv_sec; - buf->bs_btime_nsec = dic->di_crtime.tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; buf->bs_xflags = xfs_ip2xflags(ip); - buf->bs_extsize_blks = dic->di_extsize; + buf->bs_extsize_blks = ip->i_extsize; buf->bs_extents = xfs_ifork_nextents(&ip->i_df); xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); @@ -113,8 +108,10 @@ xfs_bulkstat_one_int( buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (xfs_sb_version_has_v3inode(&mp->m_sb)) { - if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) - buf->bs_cowextsize_blks = dic->di_cowextsize; + buf->bs_btime = ip->i_crtime.tv_sec; + buf->bs_btime_nsec = ip->i_crtime.tv_nsec; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + buf->bs_cowextsize_blks = ip->i_cowextsize; } switch (ip->i_df.if_format) { @@ -132,7 +129,7 @@ xfs_bulkstat_one_int( case XFS_DINODE_FMT_BTREE: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + buf->bs_blocks = ip->i_nblocks + ip->i_delayed_blks; break; } xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -168,6 +165,12 @@ xfs_bulkstat_one( }; int error; + if (breq->mnt_userns != &init_user_ns) { + xfs_warn_ratelimited(breq->mp, + "bulkstat not supported inside of idmapped mounts."); + return -EINVAL; + } + ASSERT(breq->icount == 1); bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index af6be9b9ccdf..7688663b9773 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -233,7 +233,7 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, * configured realtime device. */ #define XFS_IS_REALTIME_INODE(ip) \ - (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (((ip)->i_diflags & XFS_DIFLAG_REALTIME) && \ (ip)->i_mount->m_rtdev_targp) #define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0) #else diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 97f31308de03..e5dd1c0c2f03 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2683,9 +2683,10 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp); if (error) goto fail_iput; + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2695,12 +2696,6 @@ xlog_recover_process_one_iunlink( agino = be32_to_cpu(dip->di_next_unlinked); xfs_buf_relse(ibp); - /* - * Prevent any DMAPI event from being sent when the reference on - * the inode is dropped. - */ - ip->i_d.di_dmevmask = 0; - xfs_irele(ip); return agino; @@ -2736,7 +2731,7 @@ xlog_recover_process_one_iunlink( * of log space. * * This behaviour is bad for latency on single CPU and non-preemptible kernels, - * and can prevent other filesytem work (such as CIL pushes) from running. This + * and can prevent other filesystem work (such as CIL pushes) from running. This * can lead to deadlocks if the recovery process runs out of log reservation * space. Hence we need to yield the CPU when there is other kernel work * scheduled on this CPU to ensure other scheduled work can run without undue @@ -3404,7 +3399,7 @@ xlog_recover( /* * Delay log recovery if the debug hook is set. This is debug - * instrumention to coordinate simulation of I/O failures with + * instrumentation to coordinate simulation of I/O failures with * log recovery. */ if (xfs_globals.log_recovery_delay) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 52370d0a3f43..cb1e2c4702c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -635,6 +635,59 @@ xfs_check_summary_counts( } /* + * Flush and reclaim dirty inodes in preparation for unmount. Inodes and + * internal inode structures can be sitting in the CIL and AIL at this point, + * so we need to unpin them, write them back and/or reclaim them before unmount + * can proceed. + * + * An inode cluster that has been freed can have its buffer still pinned in + * memory because the transaction is still sitting in a iclog. The stale inodes + * on that buffer will be pinned to the buffer until the transaction hits the + * disk and the callbacks run. Pushing the AIL will skip the stale inodes and + * may never see the pinned buffer, so nothing will push out the iclog and + * unpin the buffer. + * + * Hence we need to force the log to unpin everything first. However, log + * forces don't wait for the discards they issue to complete, so we have to + * explicitly wait for them to complete here as well. + * + * Then we can tell the world we are unmounting so that error handling knows + * that the filesystem is going away and we should error out anything that we + * have been retrying in the background. This will prevent never-ending + * retries in AIL pushing from hanging the unmount. + * + * Finally, we can push the AIL to clean all the remaining dirty objects, then + * reclaim the remaining inodes that are still in memory at this point in time. + */ +static void +xfs_unmount_flush_inodes( + struct xfs_mount *mp) +{ + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); + + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + xfs_ail_push_all_sync(mp->m_ail); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp); + xfs_health_unmount(mp); +} + +static void +xfs_mount_setup_inode_geom( + struct xfs_mount *mp) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + + igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp); + ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp)); + + xfs_ialloc_setup_geometry(mp); +} + +/* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock @@ -717,7 +770,7 @@ xfs_mountfs( xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); - xfs_ialloc_setup_geometry(mp); + xfs_mount_setup_inode_geom(mp); xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); @@ -1008,7 +1061,7 @@ xfs_mountfs( /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); /* - * Cancel all delayed reclaim work and reclaim the inodes directly. + * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * @@ -1018,11 +1071,8 @@ xfs_mountfs( * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); out_log_dealloc: - mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -1063,47 +1113,7 @@ xfs_unmountfs( xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); - /* - * We can potentially deadlock here if we have an inode cluster - * that has been freed has its buffer still pinned in memory because - * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will be pinned to the buffer until the - * transaction hits the disk and the callbacks run. Pushing the AIL will - * skip the stale inodes and may never see the pinned buffer, so - * nothing will push out the iclog and unpin the buffer. Hence we - * need to force the log here to ensure all items are flushed into the - * AIL before we go any further. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* - * Wait for all busy extents to be freed, including completion of - * any discard operation. - */ - xfs_extent_busy_wait_all(mp); - flush_workqueue(xfs_discard_wq); - - /* - * We now need to tell the world we are unmounting. This will allow - * us to detect that the filesystem is going away and we should error - * out anything that we have been retrying in the background. This will - * prevent neverending retries in AIL pushing from hanging the unmount. - */ - mp->m_flags |= XFS_MOUNT_UNMOUNTING; - - /* - * Flush all pending changes from the AIL. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * Reclaim all inodes. At this point there should be no dirty inodes and - * none should be pinned or locked. Stop background inode reclaim here - * if it is still running. - */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 659ad95fe3e0..81829d19596e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -93,7 +93,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_blockgc_workqueue; + struct workqueue_struct *m_gc_workqueue; struct workqueue_struct *m_sync_workqueue; int m_bsize; /* fs logical block size */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f3082a957d5e..956cca24e67f 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -286,7 +286,7 @@ xfs_fs_commit_blocks( xfs_setattr_time(ip, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); - ip->i_d.di_size = iattr->ia_size; + ip->i_disk_size = iattr->ia_size; } xfs_trans_set_sync(tp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index bfa4164990b1..4bf949a89d0d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -788,7 +788,7 @@ xfs_qm_qino_alloc( if (need_alloc) { error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0, - 0, ipp); + 0, false, ipp); if (error) { xfs_trans_cancel(tp); return error; @@ -992,7 +992,7 @@ xfs_qm_reset_dqcounts_buf( * trans_reserve. But, this gets called during quotacheck, and that * happens only at mount time which is single threaded. */ - if (qip->i_d.di_nblocks == 0) + if (qip->i_nblocks == 0) return 0; map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); @@ -1165,16 +1165,14 @@ xfs_qm_dqusage_adjust( if (XFS_IS_REALTIME_INODE(ip)) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); - if (error) - goto error0; - } + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto error0; xfs_bmap_count_leaves(ifp, &rtblks); } - nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; /* * Add the (disk blocks and inode) resources occupied by this @@ -1716,7 +1714,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { ASSERT(O_pdqpp); - if (ip->i_d.di_projid != prid) { + if (ip->i_projid != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); @@ -1779,11 +1777,11 @@ xfs_qm_vop_chown( ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks)); xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks); xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* @@ -1877,7 +1875,7 @@ xfs_qm_vop_create_dqattach( } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(ip->i_d.di_projid == pdqp->q_id); + ASSERT(ip->i_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 639398091ad6..df00dfbf5c9d 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -60,7 +60,7 @@ xfs_qm_statvfs( struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index ca1b57d291dc..11f1e2fbf22f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -293,7 +293,7 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ip->i_d.di_size = 0; + ip->i_disk_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index d27c0e852c0b..88d70c236a54 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -35,7 +35,7 @@ xfs_qm_fill_state( tempqip = true; } tstate->flags |= QCI_SYSFILE; - tstate->blocks = ip->i_d.di_nblocks; + tstate->blocks = ip->i_nblocks; tstate->nextents = ip->i_df.if_nextents; tstate->spc_timelimit = (u32)defq->blk.time; tstate->ino_timelimit = (u32)defq->ino.time; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 07ebccbbf4df..746f4eda724c 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -269,8 +269,8 @@ xfs_trans_log_finish_refcount_update( static int xfs_refcount_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_mount *mp = priv; struct xfs_refcount_intent *ra; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 725c7d8e4438..4dd4af6ac2ef 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -874,7 +874,7 @@ xfs_reflink_set_inode_flag( if (!xfs_is_reflink_inode(src)) { trace_xfs_reflink_set_inode_flag(src); xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); - src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + src->i_diflags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); xfs_ifork_init_cow(src); } else @@ -886,7 +886,7 @@ xfs_reflink_set_inode_flag( if (!xfs_is_reflink_inode(dest)) { trace_xfs_reflink_set_inode_flag(dest); xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); - dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + dest->i_diflags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); xfs_ifork_init_cow(dest); } else @@ -930,12 +930,12 @@ xfs_reflink_update_dest( if (newlen > i_size_read(VFS_I(dest))) { trace_xfs_reflink_update_inode_size(dest, newlen); i_size_write(VFS_I(dest), newlen); - dest->i_d.di_size = newlen; + dest->i_disk_size = newlen; } if (cowextsize) { - dest->i_d.di_cowextsize = cowextsize; - dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + dest->i_cowextsize = cowextsize; + dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); @@ -1156,7 +1156,7 @@ xfs_reflink_remap_extent( if (newlen > i_size_read(VFS_I(ip))) { trace_xfs_reflink_update_inode_size(ip, newlen); i_size_write(VFS_I(ip), newlen); - ip->i_d.di_size = newlen; + ip->i_disk_size = newlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1392,11 +1392,9 @@ xfs_reflink_inode_has_shared_extents( int error; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); - if (error) - return error; - } + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + return error; *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); @@ -1455,7 +1453,7 @@ xfs_reflink_clear_inode_flag( /* Clear the inode flag. */ trace_xfs_reflink_unset_inode_flag(ip); - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 49cebd68b672..dc4f0c9f0897 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -337,8 +337,8 @@ xfs_trans_log_finish_rmap_update( static int xfs_rmap_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_mount *mp = priv; struct xfs_rmap_intent *ra; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 161b0e8992ba..4e7be6b4ca8e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -966,8 +966,8 @@ xfs_growfs_rt( * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. */ - rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); - rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size); /* * Allocate space to the bitmap and summary files, as necessary. */ @@ -1036,9 +1036,9 @@ xfs_growfs_rt( * to update the incore size so that inode inactivation won't * punch what it thinks are "posteof" blocks. */ - mp->m_rbmip->i_d.di_size = + mp->m_rbmip->i_disk_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; - i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size); + i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* * Get the summary inode into the transaction. @@ -1050,8 +1050,8 @@ xfs_growfs_rt( * incore size so that inode inactivation won't punch what it * thinks are "posteof" blocks. */ - mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; - i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size); + mp->m_rsumip->i_disk_size = nmp->m_rsumsize; + i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_disk_size); xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); /* * Copy summary data from old to new sizes. @@ -1318,8 +1318,8 @@ xfs_rtpick_extent( ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; - if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { - mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; } seq = *seqp; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e5e0713bebcd..a2dab05332ac 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -519,10 +519,10 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil; - mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", + mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s", WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, 0, mp->m_super->s_id); - if (!mp->m_blockgc_workqueue) + if (!mp->m_gc_workqueue) goto out_destroy_reclaim; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", @@ -533,7 +533,7 @@ xfs_init_mount_workqueues( return 0; out_destroy_eofb: - destroy_workqueue(mp->m_blockgc_workqueue); + destroy_workqueue(mp->m_gc_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -551,7 +551,7 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_blockgc_workqueue); + destroy_workqueue(mp->m_gc_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -834,13 +834,13 @@ xfs_fs_statfs( statp->f_ffree = max_t(int64_t, ffree, 0); - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); if (XFS_IS_REALTIME_MOUNT(mp) && - (ip->i_d.di_flags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { statp->f_blocks = sbp->sb_rblocks; statp->f_bavail = statp->f_bfree = sbp->sb_frextents * sbp->sb_rextsize; @@ -1126,6 +1126,22 @@ suffix_kstrtoint( return ret; } +static inline void +xfs_fs_warn_deprecated( + struct fs_context *fc, + struct fs_parameter *param, + uint64_t flag, + bool value) +{ + /* Don't print the warning if reconfiguring and current mount point + * already had the flag set + */ + if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && + !!(XFS_M(fc->root->d_sb)->m_flags & flag) == value) + return; + xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); +} + /* * Set mount state from a mount option. * @@ -1136,7 +1152,7 @@ xfs_fs_parse_param( struct fs_context *fc, struct fs_parameter *param) { - struct xfs_mount *mp = fc->s_fs_info; + struct xfs_mount *parsing_mp = fc->s_fs_info; struct fs_parse_result result; int size = 0; int opt; @@ -1147,142 +1163,142 @@ xfs_fs_parse_param( switch (opt) { case Opt_logbufs: - mp->m_logbufs = result.uint_32; + parsing_mp->m_logbufs = result.uint_32; return 0; case Opt_logbsize: - if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize)) + if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize)) return -EINVAL; return 0; case Opt_logdev: - kfree(mp->m_logname); - mp->m_logname = kstrdup(param->string, GFP_KERNEL); - if (!mp->m_logname) + kfree(parsing_mp->m_logname); + parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL); + if (!parsing_mp->m_logname) return -ENOMEM; return 0; case Opt_rtdev: - kfree(mp->m_rtname); - mp->m_rtname = kstrdup(param->string, GFP_KERNEL); - if (!mp->m_rtname) + kfree(parsing_mp->m_rtname); + parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL); + if (!parsing_mp->m_rtname) return -ENOMEM; return 0; case Opt_allocsize: if (suffix_kstrtoint(param->string, 10, &size)) return -EINVAL; - mp->m_allocsize_log = ffs(size) - 1; - mp->m_flags |= XFS_MOUNT_ALLOCSIZE; + parsing_mp->m_allocsize_log = ffs(size) - 1; + parsing_mp->m_flags |= XFS_MOUNT_ALLOCSIZE; return 0; case Opt_grpid: case Opt_bsdgroups: - mp->m_flags |= XFS_MOUNT_GRPID; + parsing_mp->m_flags |= XFS_MOUNT_GRPID; return 0; case Opt_nogrpid: case Opt_sysvgroups: - mp->m_flags &= ~XFS_MOUNT_GRPID; + parsing_mp->m_flags &= ~XFS_MOUNT_GRPID; return 0; case Opt_wsync: - mp->m_flags |= XFS_MOUNT_WSYNC; + parsing_mp->m_flags |= XFS_MOUNT_WSYNC; return 0; case Opt_norecovery: - mp->m_flags |= XFS_MOUNT_NORECOVERY; + parsing_mp->m_flags |= XFS_MOUNT_NORECOVERY; return 0; case Opt_noalign: - mp->m_flags |= XFS_MOUNT_NOALIGN; + parsing_mp->m_flags |= XFS_MOUNT_NOALIGN; return 0; case Opt_swalloc: - mp->m_flags |= XFS_MOUNT_SWALLOC; + parsing_mp->m_flags |= XFS_MOUNT_SWALLOC; return 0; case Opt_sunit: - mp->m_dalign = result.uint_32; + parsing_mp->m_dalign = result.uint_32; return 0; case Opt_swidth: - mp->m_swidth = result.uint_32; + parsing_mp->m_swidth = result.uint_32; return 0; case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_flags |= XFS_MOUNT_SMALL_INUMS; return 0; case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; return 0; case Opt_nouuid: - mp->m_flags |= XFS_MOUNT_NOUUID; + parsing_mp->m_flags |= XFS_MOUNT_NOUUID; return 0; case Opt_largeio: - mp->m_flags |= XFS_MOUNT_LARGEIO; + parsing_mp->m_flags |= XFS_MOUNT_LARGEIO; return 0; case Opt_nolargeio: - mp->m_flags &= ~XFS_MOUNT_LARGEIO; + parsing_mp->m_flags &= ~XFS_MOUNT_LARGEIO; return 0; case Opt_filestreams: - mp->m_flags |= XFS_MOUNT_FILESTREAMS; + parsing_mp->m_flags |= XFS_MOUNT_FILESTREAMS; return 0; case Opt_noquota: - mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; - mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | XFS_UQUOTA_ENFD); return 0; case Opt_qnoenforce: case Opt_uqnoenforce: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_UQUOTA_ENFD; + parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; return 0; case Opt_pquota: case Opt_prjquota: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | XFS_PQUOTA_ENFD); return 0; case Opt_pqnoenforce: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_PQUOTA_ENFD; + parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; return 0; case Opt_gquota: case Opt_grpquota: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | XFS_GQUOTA_ENFD); return 0; case Opt_gqnoenforce: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_GQUOTA_ENFD; + parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; return 0; case Opt_discard: - mp->m_flags |= XFS_MOUNT_DISCARD; + parsing_mp->m_flags |= XFS_MOUNT_DISCARD; return 0; case Opt_nodiscard: - mp->m_flags &= ~XFS_MOUNT_DISCARD; + parsing_mp->m_flags &= ~XFS_MOUNT_DISCARD; return 0; #ifdef CONFIG_FS_DAX case Opt_dax: - xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS); + xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS); return 0; case Opt_dax_enum: - xfs_mount_set_dax_mode(mp, result.uint_32); + xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif /* Following mount options will be removed in September 2025 */ case Opt_ikeep: - xfs_warn(mp, "%s mount option is deprecated.", param->key); - mp->m_flags |= XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true); + parsing_mp->m_flags |= XFS_MOUNT_IKEEP; return 0; case Opt_noikeep: - xfs_warn(mp, "%s mount option is deprecated.", param->key); - mp->m_flags &= ~XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, false); + parsing_mp->m_flags &= ~XFS_MOUNT_IKEEP; return 0; case Opt_attr2: - xfs_warn(mp, "%s mount option is deprecated.", param->key); - mp->m_flags |= XFS_MOUNT_ATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_ATTR2, true); + parsing_mp->m_flags |= XFS_MOUNT_ATTR2; return 0; case Opt_noattr2: - xfs_warn(mp, "%s mount option is deprecated.", param->key); - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= XFS_MOUNT_NOATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_NOATTR2, true); + parsing_mp->m_flags &= ~XFS_MOUNT_ATTR2; + parsing_mp->m_flags |= XFS_MOUNT_NOATTR2; return 0; default: - xfs_warn(mp, "unknown mount option [%s].", param->key); + xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; } @@ -1918,7 +1934,7 @@ xfs_init_zones(void) if (!xfs_ifork_zone) goto out_destroy_da_state_zone; - xfs_trans_zone = kmem_cache_create("xf_trans", + xfs_trans_zone = kmem_cache_create("xfs_trans", sizeof(struct xfs_trans), 0, 0, NULL); if (!xfs_trans_zone) diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 1ca484b8357f..d2b40dc60dfc 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -86,7 +86,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern void xfs_quiesce_attr(struct xfs_mount *mp); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1379013d74b8..99fbec32c10a 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -33,7 +33,7 @@ xfs_readlink_bmap_ilocked( struct xfs_buf *bp; xfs_daddr_t d; char *cur_chunk; - int pathlen = ip->i_d.di_size; + int pathlen = ip->i_disk_size; int nmaps = XFS_SYMLINK_MAPS; int byte_cnt; int n; @@ -86,7 +86,7 @@ xfs_readlink_bmap_ilocked( } ASSERT(pathlen == 0); - link[ip->i_d.di_size] = '\0'; + link[ip->i_disk_size] = '\0'; error = 0; out: @@ -104,14 +104,14 @@ xfs_readlink( trace_xfs_readlink(ip); - ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); - pathlen = ip->i_d.di_size; + pathlen = ip->i_disk_size; if (!pathlen) goto out; @@ -182,7 +182,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), + mapped_fsgid(mnt_userns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -209,7 +210,7 @@ xfs_symlink( /* * Check whether the directory allows new symlinks or not. */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + if (dp->i_diflags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; goto out_trans_cancel; } @@ -223,7 +224,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT), - 1, 0, prid, &ip); + 1, 0, prid, false, &ip); if (error) goto out_trans_cancel; @@ -249,7 +250,7 @@ xfs_symlink( if (pathlen <= XFS_IFORK_DSIZE(ip)) { xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); - ip->i_d.di_size = pathlen; + ip->i_disk_size = pathlen; ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); } else { @@ -264,7 +265,7 @@ xfs_symlink( goto out_trans_cancel; resblks -= fs_blocks; - ip->i_d.di_size = pathlen; + ip->i_disk_size = pathlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); cur_chunk = target_path; @@ -299,7 +300,7 @@ xfs_symlink( } ASSERT(pathlen == 0); } - i_size_write(VFS_I(ip), ip->i_d.di_size); + i_size_write(VFS_I(ip), ip->i_disk_size); /* * Create the directory entry for the symlink. @@ -376,7 +377,7 @@ xfs_inactive_symlink_rmt( xfs_trans_t *tp; mp = ip->i_mount; - ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + ASSERT(!xfs_need_iread_extents(&ip->i_df)); /* * We're freeing a symlink that has some * blocks allocated to it. Free the @@ -399,8 +400,8 @@ xfs_inactive_symlink_rmt( * locked for the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; + size = (int)ip->i_disk_size; + ip->i_disk_size = 0; VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* @@ -476,7 +477,7 @@ xfs_inactive_symlink( return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); - pathlen = (int)ip->i_d.di_size; + pathlen = (int)ip->i_disk_size; ASSERT(pathlen); if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) { @@ -491,7 +492,7 @@ xfs_inactive_symlink( * Inline fork state gets removed by xfs_difree() so we have nothing to * do here in that case. */ - if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e74bbb648f83..808ae337b222 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1298,7 +1298,7 @@ DECLARE_EVENT_CLASS(xfs_file_class, TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino; - __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size; + __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_disk_size; __entry->offset = iocb->ki_pos; __entry->count = iov_iter_count(iter); ), @@ -1341,7 +1341,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; + __entry->size = ip->i_disk_size; __entry->offset = offset; __entry->count = count; __entry->whichfork = whichfork; @@ -1387,7 +1387,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->isize = VFS_I(ip)->i_size; - __entry->disize = ip->i_d.di_size; + __entry->disize = ip->i_disk_size; __entry->offset = offset; __entry->count = count; ), @@ -1425,7 +1425,7 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; + __entry->size = ip->i_disk_size; __entry->new_size = new_size; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", @@ -1455,7 +1455,7 @@ TRACE_EVENT(xfs_pagecache_inval, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; + __entry->size = ip->i_disk_size; __entry->start = start; __entry->finish = finish; ), @@ -1483,7 +1483,7 @@ TRACE_EVENT(xfs_bunmap, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; + __entry->size = ip->i_disk_size; __entry->bno = bno; __entry->len = len; __entry->caller_ip = caller_ip; @@ -3145,12 +3145,12 @@ DECLARE_EVENT_CLASS(xfs_double_io_class, __entry->dev = VFS_I(src)->i_sb->s_dev; __entry->src_ino = src->i_ino; __entry->src_isize = VFS_I(src)->i_size; - __entry->src_disize = src->i_d.di_size; + __entry->src_disize = src->i_disk_size; __entry->src_offset = soffset; __entry->len = len; __entry->dest_ino = dest->i_ino; __entry->dest_isize = VFS_I(dest)->i_size; - __entry->dest_disize = dest->i_d.di_size; + __entry->dest_disize = dest->i_disk_size; __entry->dest_offset = doffset; ), TP_printk("dev %d:%d count %zd " diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b22a09e9daee..bcc978011869 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -436,7 +436,6 @@ xfs_trans_mod_sb( tp->t_res_frextents_delta += delta; break; case XFS_TRANS_SB_DBLOCKS: - ASSERT(delta > 0); tp->t_dblocks_delta += delta; break; case XFS_TRANS_SB_AGCOUNT: @@ -618,19 +617,12 @@ xfs_trans_unreserve_and_mod_sb( ASSERT(!error); } - if (idelta) { + if (idelta) percpu_counter_add_batch(&mp->m_icount, idelta, XFS_ICOUNT_BATCH); - if (idelta < 0) - ASSERT(__percpu_counter_compare(&mp->m_icount, 0, - XFS_ICOUNT_BATCH) >= 0); - } - if (ifreedelta) { + if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (ifreedelta < 0) - ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); - } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; @@ -1197,7 +1189,7 @@ retry: * though that part is only semi-transactional. */ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, - pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, + pdqp, ip->i_nblocks + ip->i_delayed_blks, 1, qflags); if ((error == -EDQUOT || error == -ENOSPC) && !retried) { xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 12be32f66dc1..0d050f8829ef 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -9,6 +9,8 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_da_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 0fe76f376dee..cd145d318b17 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -165,6 +165,21 @@ static int zonefs_writepages(struct address_space *mapping, return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); } +static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct inode *inode = file_inode(swap_file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); +} + static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, .readahead = zonefs_readahead, @@ -177,6 +192,7 @@ static const struct address_space_operations zonefs_file_aops = { .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .direct_IO = noop_direct_IO, + .swap_activate = zonefs_swap_activate, }; static void zonefs_update_stats(struct inode *inode, loff_t new_isize) @@ -728,6 +744,68 @@ out_release: } /* + * Do not exceed the LFS limits nor the file zone size. If pos is under the + * limit it becomes a short access. If it exceeds the limit, return -EFBIG. + */ +static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) +{ + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); + loff_t max_size = zi->i_max_size; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + count = min(count, limit - pos); + } + + if (!(file->f_flags & O_LARGEFILE)) + max_size = min_t(loff_t, MAX_NON_LFS, max_size); + + if (unlikely(pos >= max_size)) + return -EFBIG; + + return min(count, max_size - pos); +} + +static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = zi->i_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + + count = zonefs_write_check_limits(file, iocb->ki_pos, + iov_iter_count(from)); + if (count < 0) + return count; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} + +/* * Handle direct writes. For sequential zone files, this is the only possible * write path. For these files, check that the user is issuing writes * sequentially from the end of the file. This code assumes that the block layer @@ -744,8 +822,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct super_block *sb = inode->i_sb; bool sync = is_sync_kiocb(iocb); bool append = false; - size_t count; - ssize_t ret; + ssize_t ret, count; /* * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT @@ -763,12 +840,11 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - ret = generic_write_checks(iocb, from); - if (ret <= 0) + count = zonefs_write_checks(iocb, from); + if (count <= 0) { + ret = count; goto inode_unlock; - - iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); - count = iov_iter_count(from); + } if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ret = -EINVAL; @@ -828,12 +904,10 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, inode_lock(inode); } - ret = generic_write_checks(iocb, from); + ret = zonefs_write_checks(iocb, from); if (ret <= 0) goto inode_unlock; - iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); - ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); if (ret > 0) iocb->ki_pos += ret; @@ -966,9 +1040,7 @@ static int zonefs_open_zone(struct inode *inode) mutex_lock(&zi->i_truncate_mutex); - zi->i_wr_refcnt++; - if (zi->i_wr_refcnt == 1) { - + if (!zi->i_wr_refcnt) { if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { atomic_dec(&sbi->s_open_zones); ret = -EBUSY; @@ -978,7 +1050,6 @@ static int zonefs_open_zone(struct inode *inode) if (i_size_read(inode) < zi->i_max_size) { ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); if (ret) { - zi->i_wr_refcnt--; atomic_dec(&sbi->s_open_zones); goto unlock; } @@ -986,6 +1057,8 @@ static int zonefs_open_zone(struct inode *inode) } } + zi->i_wr_refcnt++; + unlock: mutex_unlock(&zi->i_truncate_mutex); @@ -1104,7 +1177,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype t; - u64 fsid; buf->f_type = ZONEFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1127,9 +1199,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) spin_unlock(&sbi->s_lock); - fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ - le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); return 0; } |